diff --git a/gpu/gpu.go b/gpu/gpu.go index 74160b60..defdf04d 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -16,6 +16,7 @@ import ( "os" "path/filepath" "runtime" + "strconv" "strings" "sync" "unsafe" @@ -28,6 +29,7 @@ type handles struct { deviceCount int cudart *C.cudart_handle_t nvcuda *C.nvcuda_handle_t + oneapi *C.oneapi_handle_t } const ( @@ -80,6 +82,15 @@ var NvcudaWindowsGlobs = []string{ "c:\\windows\\system*\\nvcuda.dll", } +var OneapiWindowsGlobs = []string{ + "c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll", +} + +var OneapiLinuxGlobs = []string{ + "/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*", + "/usr/lib*/libze_intel_gpu.so*", +} + // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. var CudaTegra string = os.Getenv("JETSON_JETPACK") @@ -94,6 +105,8 @@ func initGPUHandles() *handles { var cudartMgmtPatterns []string var nvcudaMgmtName string var nvcudaMgmtPatterns []string + var oneapiMgmtName string + var oneapiMgmtPatterns []string tmpDir, _ := PayloadsDir() switch runtime.GOOS { @@ -105,6 +118,8 @@ func initGPUHandles() *handles { // Aligned with driver, we can't carry as payloads nvcudaMgmtName = "nvcuda.dll" nvcudaMgmtPatterns = NvcudaWindowsGlobs + oneapiMgmtName = "ze_intel_gpu64.dll" + oneapiMgmtPatterns = OneapiWindowsGlobs case "linux": cudartMgmtName = "libcudart.so*" if tmpDir != "" { @@ -115,6 +130,8 @@ func initGPUHandles() *handles { // Aligned with driver, we can't carry as payloads nvcudaMgmtName = "libcuda.so*" nvcudaMgmtPatterns = NvcudaLinuxGlobs + oneapiMgmtName = "libze_intel_gpu.so" + oneapiMgmtPatterns = OneapiLinuxGlobs default: return gpuHandles } @@ -141,6 +158,18 @@ func initGPUHandles() *handles { return gpuHandles } } + + oneapiLibPaths := FindGPULibs(oneapiMgmtName, oneapiMgmtPatterns) + if len(oneapiLibPaths) > 0 { + deviceCount, oneapi, libPath := 
LoadOneapiMgmt(oneapiLibPaths) + if oneapi != nil { + slog.Debug("detected Intel GPUs", "library", libPath, "count", deviceCount) + gpuHandles.oneapi = oneapi + gpuHandles.deviceCount = deviceCount + return gpuHandles + } + } + return gpuHandles } @@ -181,39 +210,53 @@ func GetGPUInfo() GpuInfoList { if cpuVariant == "" && runtime.GOARCH == "amd64" { continue } - gpuInfo := GpuInfo{ - Library: "cuda", - } - var driverMajor int - var driverMinor int - if gpuHandles.cudart != nil { - C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo) - } else { - C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo) - driverMajor = int(gpuHandles.nvcuda.driver_major) - driverMinor = int(gpuHandles.nvcuda.driver_minor) - } - if memInfo.err != nil { - slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err)) - C.free(unsafe.Pointer(memInfo.err)) - continue - } - if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) { - slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. 
Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor)) - continue - } - gpuInfo.TotalMemory = uint64(memInfo.total) - gpuInfo.FreeMemory = uint64(memInfo.free) - gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) - gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor) - gpuInfo.MinimumMemory = cudaMinimumMemory - gpuInfo.DependencyPath = depPath - gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) - gpuInfo.DriverMajor = int(driverMajor) - gpuInfo.DriverMinor = int(driverMinor) + if gpuHandles.cudart != nil || gpuHandles.nvcuda != nil { + gpuInfo := GpuInfo{ + Library: "cuda", + } + var driverMajor int + var driverMinor int + if gpuHandles.cudart != nil { + C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo) + } else { + C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo) + driverMajor = int(gpuHandles.nvcuda.driver_major) + driverMinor = int(gpuHandles.nvcuda.driver_minor) + } + if memInfo.err != nil { + slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err)) + C.free(unsafe.Pointer(memInfo.err)) + continue + } + if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) { + slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor)) + continue + } + gpuInfo.TotalMemory = uint64(memInfo.total) + gpuInfo.FreeMemory = uint64(memInfo.free) + gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) + gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor) + gpuInfo.MinimumMemory = cudaMinimumMemory + gpuInfo.DependencyPath = depPath + gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) + gpuInfo.DriverMajor = int(driverMajor) + gpuInfo.DriverMinor = int(driverMinor) - // TODO potentially sort on our own algorithm instead of what the underlying GPU library does... - resp = append(resp, gpuInfo) + // TODO potentially sort on our own algorithm instead of what the underlying GPU library does... 
+ resp = append(resp, gpuInfo) + } + if gpuHandles.oneapi != nil { + gpuInfo := GpuInfo{ + Library: "oneapi", + } + C.oneapi_check_vram(*gpuHandles.oneapi, &memInfo) + var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. + memInfo.free = C.uint64_t(totalFreeMem) + gpuInfo.TotalMemory = uint64(memInfo.total) + gpuInfo.FreeMemory = uint64(memInfo.free) + gpuInfo.ID = strconv.Itoa(i) + resp = append(resp, gpuInfo) + } } // Then AMD @@ -348,6 +391,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) { return 0, nil, "" } +func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) { + var resp C.oneapi_init_resp_t + resp.oh.verbose = getVerboseState() + for _, libPath := range oneapiLibPaths { + lib := C.CString(libPath) + defer C.free(unsafe.Pointer(lib)) + C.oneapi_init(lib, &resp) + if resp.err != nil { + slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err)) + C.free(unsafe.Pointer(resp.err)) + } else { + return int(resp.num_devices), &resp.oh, libPath + } + } + return 0, nil, "" +} + func getVerboseState() C.uint16_t { if envconfig.Debug { return C.uint16_t(1) @@ -368,6 +428,8 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) { return cudaGetVisibleDevicesEnv(l) case "rocm": return rocmGetVisibleDevicesEnv(l) + case "oneapi": + return oneapiGetVisibleDevicesEnv(l) default: slog.Debug("no filter required for library " + l[0].Library) return "", "" diff --git a/gpu/gpu_info.h b/gpu/gpu_info.h index 2fa86f8d..482b81a6 100644 --- a/gpu/gpu_info.h +++ b/gpu/gpu_info.h @@ -62,6 +62,7 @@ void cpu_check_ram(mem_info_t *resp); #include "gpu_info_cudart.h" #include "gpu_info_nvcuda.h" +#include "gpu_info_oneapi.h" #endif // __GPU_INFO_H__ #endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_oneapi.c b/gpu/gpu_info_oneapi.c new file mode 100644 index 
00000000..4be90e80 --- /dev/null +++ b/gpu/gpu_info_oneapi.c @@ -0,0 +1,214 @@ +#ifndef __APPLE__ + +#include "gpu_info_oneapi.h" + +#include + +void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) +{ + ze_result_t ret; + resp->err = NULL; + const int buflen = 256; + char buf[buflen + 1]; + int i; + struct lookup + { + char *s; + void **p; + } l[] = { + {"zesInit", (void *)&resp->oh.zesInit}, + {"zesDriverGet", (void *)&resp->oh.zesDriverGet}, + {"zesDeviceGet", (void *)&resp->oh.zesDeviceGet}, + {"zesDeviceGetProperties", (void *)&resp->oh.zesDeviceGetProperties}, + {"zesDeviceEnumMemoryModules", + (void *)&resp->oh.zesDeviceEnumMemoryModules}, + {"zesMemoryGetProperties", (void *)&resp->oh.zesMemoryGetProperties}, + {"zesMemoryGetState", (void *)&resp->oh.zesMemoryGetState}, + {NULL, NULL}, + }; + + resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY); + if (!resp->oh.handle) + { + char *msg = LOAD_ERR(); + snprintf(buf, buflen, + "Unable to load %s library to query for Intel GPUs: %s\n", + oneapi_lib_path, msg); + free(msg); + resp->err = strdup(buf); + return; + } + + // TODO once we've squashed the remaining corner cases remove this log + LOG(resp->oh.verbose, + "wiring Level-Zero management library functions in %s\n", + oneapi_lib_path); + + for (i = 0; l[i].s != NULL; i++) + { + // TODO once we've squashed the remaining corner cases remove this log + LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s); + + *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s); + if (!l[i].p) + { + resp->oh.handle = NULL; + char *msg = LOAD_ERR(); + LOG(resp->oh.verbose, "dlerr: %s\n", msg); + UNLOAD_LIBRARY(resp->oh.handle); + snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg); + free(msg); + resp->err = strdup(buf); + return; + } + } + + ret = (*resp->oh.zesInit)(0); + if (ret != ZE_RESULT_SUCCESS) + { + LOG(resp->oh.verbose, "zesInit err: %d\n", ret); + UNLOAD_LIBRARY(resp->oh.handle); + resp->oh.handle = NULL; + snprintf(buf, buflen, "oneapi vram 
init failure: %d", ret); + resp->err = strdup(buf); + } + + (*resp->oh.zesDriverGet)(&resp->num_devices, NULL); + + return; +} + +void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp) +{ + ze_result_t ret; + resp->err = NULL; + uint64_t totalMem = 0; + uint64_t usedMem = 0; + const int buflen = 256; + char buf[buflen + 1]; + int i, d, m; + + if (h.handle == NULL) + { + resp->err = strdup("Level-Zero handle not initialized"); + return; + } + + uint32_t driversCount = 0; + ret = (*h.zesDriverGet)(&driversCount, NULL); + if (ret != ZE_RESULT_SUCCESS) + { + snprintf(buf, buflen, "unable to get driver count: %d", ret); + resp->err = strdup(buf); + return; + } + LOG(h.verbose, "discovered %d Level-Zero drivers\n", driversCount); + + zes_driver_handle_t *allDrivers = + malloc(driversCount * sizeof(zes_driver_handle_t)); + (*h.zesDriverGet)(&driversCount, allDrivers); + + resp->total = 0; + resp->free = 0; + + for (d = 0; d < driversCount; d++) + { + uint32_t deviceCount = 0; + ret = (*h.zesDeviceGet)(allDrivers[d], &deviceCount, NULL); + if (ret != ZE_RESULT_SUCCESS) + { + snprintf(buf, buflen, "unable to get device count: %d", ret); + resp->err = strdup(buf); + free(allDrivers); + return; + } + + LOG(h.verbose, "discovered %d Level-Zero devices\n", deviceCount); + + zes_device_handle_t *devices = + malloc(deviceCount * sizeof(zes_device_handle_t)); + (*h.zesDeviceGet)(allDrivers[d], &deviceCount, devices); + + for (i = 0; i < deviceCount; i++) + { + zes_device_ext_properties_t ext_props; + ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES; + ext_props.pNext = NULL; + + zes_device_properties_t props; + props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES; + props.pNext = &ext_props; + + ret = (*h.zesDeviceGetProperties)(devices[i], &props); + if (ret != ZE_RESULT_SUCCESS) + { + snprintf(buf, buflen, "unable to get device properties: %d", ret); + resp->err = strdup(buf); + free(allDrivers); + free(devices); + return; + } + + if (h.verbose) + { + // When in 
verbose mode, report more information about + // the card we discover. + LOG(h.verbose, "[%d] oneAPI device name: %s\n", i, + props.modelName); + LOG(h.verbose, "[%d] oneAPI brand: %s\n", i, + props.brandName); + LOG(h.verbose, "[%d] oneAPI vendor: %s\n", i, + props.vendorName); + LOG(h.verbose, "[%d] oneAPI S/N: %s\n", i, + props.serialNumber); + LOG(h.verbose, "[%d] oneAPI board number: %s\n", i, + props.boardNumber); + } + + uint32_t memCount = 0; + ret = (*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, NULL); + if (ret != ZE_RESULT_SUCCESS) + { + snprintf(buf, buflen, + "unable to enumerate Level-Zero memory modules: %d", ret); + resp->err = strdup(buf); + free(allDrivers); + free(devices); + return; + } + + LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount); + + zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t)); + (*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, mems); + + for (m = 0; m < memCount; m++) + { + zes_mem_state_t state; + state.stype = ZES_STRUCTURE_TYPE_MEM_STATE; + state.pNext = NULL; + ret = (*h.zesMemoryGetState)(mems[m], &state); + if (ret != ZE_RESULT_SUCCESS) + { + snprintf(buf, buflen, "unable to get memory state: %d", ret); + resp->err = strdup(buf); + free(allDrivers); + free(devices); + free(mems); + return; + } + + resp->total += state.size; + resp->free += state.free; + } + + free(mems); + } + + free(devices); + } + + free(allDrivers); +} + +#endif // __APPLE__ diff --git a/gpu/gpu_info_oneapi.h b/gpu/gpu_info_oneapi.h new file mode 100644 index 00000000..9db9fae0 --- /dev/null +++ b/gpu/gpu_info_oneapi.h @@ -0,0 +1,211 @@ +#ifndef __APPLE__ +#ifndef __GPU_INFO_ONEAPI_H__ +#define __GPU_INFO_ONEAPI_H__ +#include "gpu_info.h" + +#define ZE_MAX_DEVICE_NAME 256 +#define ZE_MAX_DEVICE_UUID_SIZE 16 +#define ZES_STRING_PROPERTY_SIZE 64 +#define ZE_BIT(_i) (1 << _i) + +// Just enough typedef's to dlopen/dlsym for memory information +typedef enum ze_result_t +{ + ZE_RESULT_SUCCESS = 0, + // 
Other values omitted for now... +} ze_result_t; + +typedef uint8_t ze_bool_t; +typedef struct _zes_driver_handle_t *zes_driver_handle_t; +typedef struct _zes_device_handle_t *zes_device_handle_t; +typedef struct _zes_mem_handle_t *zes_mem_handle_t; + +typedef enum _ze_structure_type_t +{ + ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff +} ze_structure_type_t; + +typedef enum _zes_structure_type_t +{ + ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1, + ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb, + ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e, + ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES = 0x2d, + ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff +} zes_structure_type_t; + +typedef enum _zes_mem_type_t +{ + ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff +} zes_mem_type_t; + +typedef enum _zes_mem_loc_t +{ + ZES_MEM_LOC_SYSTEM = 0, + ZES_MEM_LOC_DEVICE = 1, + ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff +} zes_mem_loc_t; + +typedef enum _zes_mem_health_t +{ + ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff +} zes_mem_health_t; + +typedef struct _ze_device_uuid_t +{ + uint8_t id[ZE_MAX_DEVICE_UUID_SIZE]; +} ze_device_uuid_t; + +typedef struct _zes_uuid_t +{ + uint8_t id[ZE_MAX_DEVICE_UUID_SIZE]; +} zes_uuid_t; + +typedef enum _ze_device_type_t +{ + ZE_DEVICE_TYPE_GPU = 1, + ZE_DEVICE_TYPE_CPU = 2, + ZE_DEVICE_TYPE_FPGA = 3, + ZE_DEVICE_TYPE_MCA = 4, + ZE_DEVICE_TYPE_VPU = 5, + ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff +} ze_device_type_t; + +typedef enum _zes_device_type_t +{ + ZES_DEVICE_TYPE_GPU = 1, + ZES_DEVICE_TYPE_CPU = 2, + ZES_DEVICE_TYPE_FPGA = 3, + ZES_DEVICE_TYPE_MCA = 4, + ZES_DEVICE_TYPE_VPU = 5, + ZES_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff +} zes_device_type_t; + +typedef uint32_t ze_device_property_flags_t; +typedef enum _ze_device_property_flag_t +{ + ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0), + ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1), + ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2), + ZE_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3), + ZE_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff +} 
ze_device_property_flag_t; + +typedef uint32_t zes_device_property_flags_t; +typedef enum _zes_device_property_flag_t +{ + ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0), + ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1), + ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2), + ZES_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3), + ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff +} zes_device_property_flag_t; + +typedef struct _ze_device_properties_t +{ + ze_structure_type_t stype; + void *pNext; + ze_device_type_t type; + uint32_t vendorId; + uint32_t deviceId; + ze_device_property_flags_t flags; + uint32_t subdeviceId; + uint32_t coreClockRate; + uint64_t maxMemAllocSize; + uint32_t maxHardwareContexts; + uint32_t maxCommandQueuePriority; + uint32_t numThreadsPerEU; + uint32_t physicalEUSimdWidth; + uint32_t numEUsPerSubslice; + uint32_t numSubslicesPerSlice; + uint32_t numSlices; + uint64_t timerResolution; + uint32_t timestampValidBits; + uint32_t kernelTimestampValidBits; + ze_device_uuid_t uuid; + char name[ZE_MAX_DEVICE_NAME]; +} ze_device_properties_t; + +typedef struct _zes_device_properties_t +{ + zes_structure_type_t stype; + void *pNext; + ze_device_properties_t core; + uint32_t numSubdevices; + char serialNumber[ZES_STRING_PROPERTY_SIZE]; + char boardNumber[ZES_STRING_PROPERTY_SIZE]; + char brandName[ZES_STRING_PROPERTY_SIZE]; + char modelName[ZES_STRING_PROPERTY_SIZE]; + char vendorName[ZES_STRING_PROPERTY_SIZE]; + char driverVersion[ZES_STRING_PROPERTY_SIZE]; +} zes_device_properties_t; + +typedef struct _zes_device_ext_properties_t +{ + zes_structure_type_t stype; + void *pNext; + zes_uuid_t uuid; + zes_device_type_t type; + zes_device_property_flags_t flags; +} zes_device_ext_properties_t; + +typedef struct _zes_mem_properties_t +{ + zes_structure_type_t stype; + void *pNext; + zes_mem_type_t type; + ze_bool_t onSubdevice; + uint32_t subdeviceId; + zes_mem_loc_t location; + uint64_t physicalSize; + int32_t busWidth; + int32_t numChannels; +} 
zes_mem_properties_t; + +typedef struct _zes_mem_state_t +{ + zes_structure_type_t stype; + const void *pNext; + zes_mem_health_t health; + uint64_t free; + uint64_t size; +} zes_mem_state_t; + +typedef struct oneapi_handle +{ + void *handle; + uint16_t verbose; + ze_result_t (*zesInit)(int); + ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers); + ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount, + zes_device_handle_t *phDevices); + ze_result_t (*zesDeviceGetProperties)(zes_device_handle_t hDevice, + zes_device_properties_t *pProperties); + ze_result_t (*zesDeviceEnumMemoryModules)(zes_device_handle_t hDevice, + uint32_t *pCount, + zes_mem_handle_t *phMemory); + ze_result_t (*zesMemoryGetProperties)(zes_mem_handle_t hMemory, + zes_mem_properties_t *pProperties); + ze_result_t (*zesMemoryGetState)(zes_mem_handle_t hMemory, + zes_mem_state_t *pState); + +} oneapi_handle_t; + +typedef struct oneapi_init_resp +{ + char *err; // If err is non-null handle is invalid + int num_devices; + oneapi_handle_t oh; +} oneapi_init_resp_t; + +typedef struct oneapi_version_resp +{ + ze_result_t status; + char *str; // Contains version or error string if status != 0 +} oneapi_version_resp_t; + +void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp); +void oneapi_check_vram(oneapi_handle_t rh, mem_info_t *resp); + +#endif // __GPU_INFO_INTEL_H__ +#endif // __APPLE__ diff --git a/gpu/gpu_oneapi.go b/gpu/gpu_oneapi.go new file mode 100644 index 00000000..9864bde5 --- /dev/null +++ b/gpu/gpu_oneapi.go @@ -0,0 +1,21 @@ +//go:build linux || windows + +package gpu + +import ( + "log/slog" + "strings" +) + +func oneapiGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { + ids := []string{} + for _, info := range gpuInfo { + if info.Library != "oneapi" { + // TODO shouldn't happen if things are wired correctly... 
+ slog.Debug("oneapiGetVisibleDevicesEnv skipping over non-sycl device", "library", info.Library) + continue + } + ids = append(ids, info.ID) + } + return "ONEAPI_DEVICE_SELECTOR", "level_zero:" + strings.Join(ids, ",") +} diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index c20a2568..c03a5c16 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -206,6 +206,36 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then fi +if [ -z "${ONEAPI_ROOT}" ]; then + # Try the default location in case it exists + ONEAPI_ROOT=/opt/intel/oneapi +fi + +if [ -d "${ONEAPI_ROOT}" ]; then + echo "OneAPI libraries detected - building dynamic OneAPI library" + init_vars + source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI + CC=icx + CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL=ON -DLLAMA_SYCL_F16=OFF" + BUILD_DIR="../build/linux/${ARCH}/oneapi" + EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" + DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it + build + + # copy oneAPI dependencies + for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do + cp "${dep}" "${BUILD_DIR}/bin/" + done + cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" 
# Builds the oneAPI (SYCL) variant of the llama server and copies the Intel
# runtime DLLs it needs next to it.
# Runs only when ONEAPI_ROOT is set and OLLAMA_SKIP_ONEAPI_GENERATE is not.
function build_oneapi() {
    # Gate on the oneAPI-specific skip flag; the CUDA flag belongs to build_cuda.
    if ((-not "${env:OLLAMA_SKIP_ONEAPI_GENERATE}") -and ("${env:ONEAPI_ROOT}")) {
        # Get oneAPI version
        $versionOutput = (icpx --version | Out-String)
        $script:ONEAPI_VERSION = [regex]::Match($versionOutput, '(?<=oneAPI DPC\+\+/C\+\+ Compiler )\d+\.\d+\.\d+').Value
        # Match.Value is an empty string (never $null) when nothing matched,
        # so test for truthiness to avoid producing a bare "_v" suffix.
        $script:ONEAPI_VARIANT = ""
        if ($script:ONEAPI_VERSION) {
            $script:ONEAPI_VARIANT = "_v" + $script:ONEAPI_VERSION
        }
        init_vars
        $script:buildDir = "../build/windows/${script:ARCH}/oneapi$script:ONEAPI_VARIANT"
        $script:distDir ="$script:DIST_BASE\oneapi$script:ONEAPI_VARIANT"
        $script:cmakeDefs += @(
            "-G", "MinGW Makefiles",
            "-DLLAMA_SYCL=ON",
            "-DCMAKE_C_COMPILER=icx",
            "-DCMAKE_CXX_COMPILER=icx",
            "-DCMAKE_BUILD_TYPE=Release"
        )

        Write-Host "Building oneAPI"
        build
        # Diagnostic only: list the DLLs the built server depends on.
        if ($null -ne $script:DUMPBIN) {
            & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | Select-String ".dll"
        }
        sign
        install

        # Intel compiler / SYCL runtime DLLs.
        $compilerBin = "${env:ONEAPI_ROOT}\compiler\latest\bin"
        $compilerDlls = @(
            "libirngmd.dll",
            "libmmd.dll",
            "pi_level_zero.dll",
            "pi_unified_runtime.dll",
            "pi_win_proxy_loader.dll",
            "svml_dispmd.dll",
            "sycl7.dll"
        )
        foreach ($dll in $compilerDlls) {
            cp "$compilerBin\$dll" "${script:distDir}"
        }

        # oneMKL DLLs.
        $mklBin = "${env:ONEAPI_ROOT}\mkl\latest\bin"
        $mklDlls = @(
            "mkl_core.2.dll",
            "mkl_sycl_blas.4.dll",
            "mkl_tbb_thread.2.dll"
        )
        foreach ($dll in $mklDlls) {
            cp "$mklBin\$dll" "${script:distDir}"
        }
    } else {
        Write-Host "Skipping oneAPI generation step"
    }
}