support ollama run on Intel GPUs

2024-05-24 11:18:27 +08:00 · 2024-05-24 11:18:27 +08:00 · fd5971be0b
commit fd5971be0b
parent 7ca71a6b0f
7 changed files with 615 additions and 32 deletions
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@ -16,6 +16,7 @@ import (
 	"os"
 	"path/filepath"
 	"runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"unsafe"
@ -28,6 +29,7 @@ type handles struct {
 	deviceCount int
 	cudart      *C.cudart_handle_t
 	nvcuda      *C.nvcuda_handle_t
+	oneapi      *C.oneapi_handle_t
 }

 const (
@ -80,6 +82,15 @@ var NvcudaWindowsGlobs = []string{
 	"c:\\windows\\system*\\nvcuda.dll",
 }

+// OneapiWindowsGlobs lists the glob patterns searched on Windows for the
+// Intel Level-Zero GPU driver library (ze_intel_gpu64.dll).
+var OneapiWindowsGlobs = []string{
+	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
+}
+
+// OneapiLinuxGlobs lists the glob patterns searched on Linux for the
+// Intel Level-Zero GPU driver library (libze_intel_gpu.so*).
+var OneapiLinuxGlobs = []string{
+	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
+	"/usr/lib*/libze_intel_gpu.so*",
+}
+
 // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
@ -94,6 +105,8 @@ func initGPUHandles() *handles {
 	var cudartMgmtPatterns []string
 	var nvcudaMgmtName string
 	var nvcudaMgmtPatterns []string
+	var oneapiMgmtName string
+	var oneapiMgmtPatterns []string

 	tmpDir, _ := PayloadsDir()
 	switch runtime.GOOS {
@ -105,6 +118,8 @@ func initGPUHandles() *handles {
 		// Aligned with driver, we can't carry as payloads
 		nvcudaMgmtName = "nvcuda.dll"
 		nvcudaMgmtPatterns = NvcudaWindowsGlobs
+		oneapiMgmtName = "ze_intel_gpu64.dll"
+		oneapiMgmtPatterns = OneapiWindowsGlobs
 	case "linux":
 		cudartMgmtName = "libcudart.so*"
 		if tmpDir != "" {
@ -115,6 +130,8 @@ func initGPUHandles() *handles {
 		// Aligned with driver, we can't carry as payloads
 		nvcudaMgmtName = "libcuda.so*"
 		nvcudaMgmtPatterns = NvcudaLinuxGlobs
+		oneapiMgmtName = "libze_intel_gpu.so"
+		oneapiMgmtPatterns = OneapiLinuxGlobs
 	default:
 		return gpuHandles
 	}
@ -141,6 +158,18 @@ func initGPUHandles() *handles {
 			return gpuHandles
 		}
 	}
+
+	oneapiLibPaths := FindGPULibs(oneapiMgmtName, oneapiMgmtPatterns)
+	if len(oneapiLibPaths) > 0 {
+		deviceCount, oneapi, libPath := LoadOneapiMgmt(oneapiLibPaths)
+		if oneapi != nil {
+			slog.Debug("detected Intel GPUs", "library", libPath, "count", deviceCount)
+			gpuHandles.oneapi = oneapi
+			gpuHandles.deviceCount = deviceCount
+			return gpuHandles
+		}
+	}
+
 	return gpuHandles
 }

@ -181,6 +210,7 @@ func GetGPUInfo() GpuInfoList {
 		if cpuVariant == "" && runtime.GOARCH == "amd64" {
 			continue
 		}
+		if gpuHandles.cudart != nil || gpuHandles.nvcuda != nil {
 			gpuInfo := GpuInfo{
 				Library: "cuda",
 			}
@ -215,6 +245,19 @@ func GetGPUInfo() GpuInfoList {
 			// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
 			resp = append(resp, gpuInfo)
 		}
+		if gpuHandles.oneapi != nil {
+			gpuInfo := GpuInfo{
+				Library: "oneapi",
+			}
+			C.oneapi_check_vram(*gpuHandles.oneapi, &memInfo)
+			var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
+			memInfo.free = C.uint64_t(totalFreeMem)
+			gpuInfo.TotalMemory = uint64(memInfo.total)
+			gpuInfo.FreeMemory = uint64(memInfo.free)
+			gpuInfo.ID = strconv.Itoa(i)
+			resp = append(resp, gpuInfo)
+		}
+	}

 	// Then AMD
 	resp = append(resp, AMDGetGPUInfo()...)
@ -348,6 +391,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
 	return 0, nil, ""
 }

+// LoadOneapiMgmt tries each candidate Level-Zero library path in order and
+// returns the count reported by oneapi_init, a handle to the first library
+// that initializes without error, and that library's path.
+// It returns (0, nil, "") when none of the candidates can be loaded.
+func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
+	var resp C.oneapi_init_resp_t
+	resp.oh.verbose = getVerboseState()
+	for _, libPath := range oneapiLibPaths {
+		lib := C.CString(libPath)
+		C.oneapi_init(lib, &resp)
+		// oneapi_init does not keep the path, so release the C copy right away;
+		// a defer inside the loop would hold every copy until the function returns.
+		C.free(unsafe.Pointer(lib))
+		if resp.err != nil {
+			slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
+			C.free(unsafe.Pointer(resp.err))
+			// Do not leave a dangling pointer behind in the shared response struct.
+			resp.err = nil
+			continue
+		}
+		return int(resp.num_devices), &resp.oh, libPath
+	}
+	return 0, nil, ""
+}
+
 func getVerboseState() C.uint16_t {
 	if envconfig.Debug {
 		return C.uint16_t(1)
@ -368,6 +428,8 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return cudaGetVisibleDevicesEnv(l)
 	case "rocm":
 		return rocmGetVisibleDevicesEnv(l)
+	case "oneapi":
+		return oneapiGetVisibleDevicesEnv(l)
 	default:
 		slog.Debug("no filter required for library " + l[0].Library)
 		return "", ""
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@ -62,6 +62,7 @@ void cpu_check_ram(mem_info_t *resp);

 #include "gpu_info_cudart.h"
 #include "gpu_info_nvcuda.h"
+#include "gpu_info_oneapi.h"

 #endif  // __GPU_INFO_H__
 #endif  // __APPLE__
--- a/gpu/gpu_info_oneapi.c
+++ b/gpu/gpu_info_oneapi.c
@ -0,0 +1,214 @@
+#ifndef __APPLE__
+
+#include "gpu_info_oneapi.h"
+
+#include <string.h>
+
+// Loads the Level-Zero management library at oneapi_lib_path, resolves the
+// zes* entry points listed below into resp->oh, and calls zesInit.
+//
+// On success resp->err is NULL, resp->oh.handle is the open library and
+// resp->num_devices holds the count returned by zesDriverGet.
+// On failure resp->err points to a strdup'd message (the caller frees it)
+// and resp->oh.handle is NULL.
+void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
+{
+  ze_result_t ret;
+  resp->err = NULL;
+  resp->num_devices = 0;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+  uint32_t driver_count = 0;
+  struct lookup
+  {
+    char *s;
+    void **p;
+  } l[] = {
+      {"zesInit", (void *)&resp->oh.zesInit},
+      {"zesDriverGet", (void *)&resp->oh.zesDriverGet},
+      {"zesDeviceGet", (void *)&resp->oh.zesDeviceGet},
+      {"zesDeviceGetProperties", (void *)&resp->oh.zesDeviceGetProperties},
+      {"zesDeviceEnumMemoryModules",
+       (void *)&resp->oh.zesDeviceEnumMemoryModules},
+      {"zesMemoryGetProperties", (void *)&resp->oh.zesMemoryGetProperties},
+      {"zesMemoryGetState", (void *)&resp->oh.zesMemoryGetState},
+      {NULL, NULL},
+  };
+
+  resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
+  if (!resp->oh.handle)
+  {
+    char *msg = LOAD_ERR();
+    snprintf(buf, buflen,
+             "Unable to load %s library to query for Intel GPUs: %s\n",
+             oneapi_lib_path, msg);
+    free(msg);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->oh.verbose,
+      "wiring Level-Zero management library functions in %s\n",
+      oneapi_lib_path);
+
+  for (i = 0; l[i].s != NULL; i++)
+  {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
+
+    *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
+    // Test the resolved symbol itself; l[i].p is the address of the slot
+    // and is never NULL.
+    if (!*l[i].p)
+    {
+      char *msg = LOAD_ERR();
+      LOG(resp->oh.verbose, "dlerr: %s\n", msg);
+      // Unload before clearing the handle, otherwise the library leaks.
+      UNLOAD_LIBRARY(resp->oh.handle);
+      resp->oh.handle = NULL;
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg);
+      free(msg);
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+
+  ret = (*resp->oh.zesInit)(0);
+  if (ret != ZE_RESULT_SUCCESS)
+  {
+    LOG(resp->oh.verbose, "zesInit err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->oh.handle);
+    resp->oh.handle = NULL;
+    snprintf(buf, buflen, "oneapi vram init failure: %d", ret);
+    resp->err = strdup(buf);
+    // Stop here: the function pointers belong to a library that was just
+    // unloaded and must not be called.
+    return;
+  }
+
+  // zesDriverGet writes through a uint32_t *, so query into a correctly typed
+  // local instead of aliasing the int field.
+  ret = (*resp->oh.zesDriverGet)(&driver_count, NULL);
+  if (ret != ZE_RESULT_SUCCESS)
+  {
+    // Not fatal for initialization; leave the count at 0.
+    LOG(resp->oh.verbose, "zesDriverGet err: %d\n", ret);
+    return;
+  }
+  resp->num_devices = (int)driver_count;
+
+  return;
+}
+
+// Walks every Level-Zero driver, every device of each driver and every memory
+// module of each device, adding each module's size to resp->total and its free
+// bytes to resp->free.
+//
+// On failure resp->err points to a strdup'd message (the caller frees it);
+// resp->total and resp->free may then hold partial sums and should be ignored.
+void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp)
+{
+  ze_result_t ret;
+  resp->err = NULL;
+  // Zero the outputs up front so every early return leaves defined values.
+  resp->total = 0;
+  resp->free = 0;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i, d, m;
+  uint32_t driversCount = 0;
+  // Owned buffers; all are released at the cleanup label (free(NULL) is a no-op).
+  zes_driver_handle_t *allDrivers = NULL;
+  zes_device_handle_t *devices = NULL;
+  zes_mem_handle_t *mems = NULL;
+
+  if (h.handle == NULL)
+  {
+    resp->err = strdup("Level-Zero handle not initialized");
+    return;
+  }
+
+  ret = (*h.zesDriverGet)(&driversCount, NULL);
+  if (ret != ZE_RESULT_SUCCESS)
+  {
+    snprintf(buf, buflen, "unable to get driver count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+  LOG(h.verbose, "discovered %d Level-Zero drivers\n", driversCount);
+
+  if (driversCount == 0)
+  {
+    // Nothing to enumerate; totals stay at zero.
+    return;
+  }
+
+  allDrivers = malloc(driversCount * sizeof(zes_driver_handle_t));
+  if (allDrivers == NULL)
+  {
+    resp->err = strdup("unable to allocate Level-Zero driver handles");
+    return;
+  }
+  ret = (*h.zesDriverGet)(&driversCount, allDrivers);
+  if (ret != ZE_RESULT_SUCCESS)
+  {
+    snprintf(buf, buflen, "unable to get drivers: %d", ret);
+    resp->err = strdup(buf);
+    goto cleanup;
+  }
+
+  for (d = 0; d < driversCount; d++)
+  {
+    uint32_t deviceCount = 0;
+    ret = (*h.zesDeviceGet)(allDrivers[d], &deviceCount, NULL);
+    if (ret != ZE_RESULT_SUCCESS)
+    {
+      snprintf(buf, buflen, "unable to get device count: %d", ret);
+      resp->err = strdup(buf);
+      goto cleanup;
+    }
+
+    LOG(h.verbose, "discovered %d Level-Zero devices\n", deviceCount);
+
+    if (deviceCount == 0)
+    {
+      continue;
+    }
+
+    devices = malloc(deviceCount * sizeof(zes_device_handle_t));
+    if (devices == NULL)
+    {
+      resp->err = strdup("unable to allocate Level-Zero device handles");
+      goto cleanup;
+    }
+    ret = (*h.zesDeviceGet)(allDrivers[d], &deviceCount, devices);
+    if (ret != ZE_RESULT_SUCCESS)
+    {
+      snprintf(buf, buflen, "unable to get devices: %d", ret);
+      resp->err = strdup(buf);
+      goto cleanup;
+    }
+
+    for (i = 0; i < deviceCount; i++)
+    {
+      // Extended properties are chained through pNext of the base properties.
+      zes_device_ext_properties_t ext_props;
+      ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
+      ext_props.pNext = NULL;
+
+      zes_device_properties_t props;
+      props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+      props.pNext = &ext_props;
+
+      ret = (*h.zesDeviceGetProperties)(devices[i], &props);
+      if (ret != ZE_RESULT_SUCCESS)
+      {
+        snprintf(buf, buflen, "unable to get device properties: %d", ret);
+        resp->err = strdup(buf);
+        goto cleanup;
+      }
+
+      if (h.verbose)
+      {
+        // When in verbose mode, report more information about
+        // the card we discover.
+        LOG(h.verbose, "[%d] oneAPI device name: %s\n", i,
+            props.modelName);
+        LOG(h.verbose, "[%d] oneAPI brand: %s\n", i,
+            props.brandName);
+        LOG(h.verbose, "[%d] oneAPI vendor: %s\n", i,
+            props.vendorName);
+        LOG(h.verbose, "[%d] oneAPI S/N: %s\n", i,
+            props.serialNumber);
+        LOG(h.verbose, "[%d] oneAPI board number: %s\n", i,
+            props.boardNumber);
+      }
+
+      uint32_t memCount = 0;
+      ret = (*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, NULL);
+      if (ret != ZE_RESULT_SUCCESS)
+      {
+        snprintf(buf, buflen,
+                 "unable to enumerate Level-Zero memory modules: %d", ret);
+        resp->err = strdup(buf);
+        goto cleanup;
+      }
+
+      LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
+
+      if (memCount == 0)
+      {
+        continue;
+      }
+
+      mems = malloc(memCount * sizeof(zes_mem_handle_t));
+      if (mems == NULL)
+      {
+        resp->err = strdup("unable to allocate Level-Zero memory handles");
+        goto cleanup;
+      }
+      ret = (*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, mems);
+      if (ret != ZE_RESULT_SUCCESS)
+      {
+        snprintf(buf, buflen,
+                 "unable to get Level-Zero memory modules: %d", ret);
+        resp->err = strdup(buf);
+        goto cleanup;
+      }
+
+      for (m = 0; m < memCount; m++)
+      {
+        zes_mem_state_t state;
+        state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
+        state.pNext = NULL;
+        ret = (*h.zesMemoryGetState)(mems[m], &state);
+        if (ret != ZE_RESULT_SUCCESS)
+        {
+          snprintf(buf, buflen, "unable to get memory state: %d", ret);
+          resp->err = strdup(buf);
+          goto cleanup;
+        }
+
+        resp->total += state.size;
+        resp->free += state.free;
+      }
+
+      // Release per-device storage and clear the pointer so the shared
+      // cleanup path cannot free it twice.
+      free(mems);
+      mems = NULL;
+    }
+
+    free(devices);
+    devices = NULL;
+  }
+
+cleanup:
+  free(mems);
+  free(devices);
+  free(allDrivers);
+}
+
+#endif // __APPLE__
--- a/gpu/gpu_info_oneapi.h
+++ b/gpu/gpu_info_oneapi.h
@ -0,0 +1,211 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_ONEAPI_H__
+#define __GPU_INFO_ONEAPI_H__
+#include "gpu_info.h"
+
+// Fixed buffer sizes used by the Level-Zero structures declared below.
+#define ZE_MAX_DEVICE_NAME 256
+#define ZE_MAX_DEVICE_UUID_SIZE 16
+#define ZES_STRING_PROPERTY_SIZE 64
+// Single-bit mask for bit index _i. The argument is parenthesized so that an
+// expression such as ZE_BIT(a | b) shifts by the whole expression.
+#define ZE_BIT(_i) (1 << (_i))
+
+// Just enough typedef's to dlopen/dlsym for memory information
+// Result code returned by the zes* entry points; only the success value is
+// declared here, callers report any other value numerically.
+typedef enum ze_result_t
+{
+  ZE_RESULT_SUCCESS = 0,
+  // Other values omitted for now...
+} ze_result_t;
+
+// 8-bit boolean used inside the Level-Zero structures below.
+typedef uint8_t ze_bool_t;
+// Opaque handles for a driver, a device and a memory module. The pointed-to
+// structs are never defined in this header.
+typedef struct _zes_driver_handle_t *zes_driver_handle_t;
+typedef struct _zes_device_handle_t *zes_device_handle_t;
+typedef struct _zes_mem_handle_t *zes_mem_handle_t;
+
+// Structure-type tag for core (ze_*) structures. Only the FORCE_UINT32
+// sentinel is declared — presumably to keep the enum 32 bits wide like the
+// upstream header; TODO confirm.
+typedef enum _ze_structure_type_t
+{
+  ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
+} ze_structure_type_t;
+
+// Structure-type tags written into the `stype` field of the zes_* structures
+// declared in this header before they are passed to the library.
+typedef enum _zes_structure_type_t
+{
+  ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
+  ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
+  ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
+  ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES = 0x2d,
+  ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
+} zes_structure_type_t;
+
+// Memory module type; individual values are not declared here.
+typedef enum _zes_mem_type_t
+{
+  ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
+} zes_mem_type_t;
+
+// Location of a memory module: system memory or device memory.
+typedef enum _zes_mem_loc_t
+{
+  ZES_MEM_LOC_SYSTEM = 0,
+  ZES_MEM_LOC_DEVICE = 1,
+  ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
+} zes_mem_loc_t;
+
+// Memory health indicator; individual values are not declared here.
+typedef enum _zes_mem_health_t
+{
+  ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
+} zes_mem_health_t;
+
+// Device UUID as a ZE_MAX_DEVICE_UUID_SIZE-byte array (core API flavor).
+typedef struct _ze_device_uuid_t
+{
+  uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
+} ze_device_uuid_t;
+
+// Device UUID as a ZE_MAX_DEVICE_UUID_SIZE-byte array (zes_* flavor).
+typedef struct _zes_uuid_t
+{
+  uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
+} zes_uuid_t;
+
+// Device kind reported in ze_device_properties_t.type.
+typedef enum _ze_device_type_t
+{
+  ZE_DEVICE_TYPE_GPU = 1,
+  ZE_DEVICE_TYPE_CPU = 2,
+  ZE_DEVICE_TYPE_FPGA = 3,
+  ZE_DEVICE_TYPE_MCA = 4,
+  ZE_DEVICE_TYPE_VPU = 5,
+  ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
+} ze_device_type_t;
+
+// Device kind reported in zes_device_ext_properties_t.type; same numeric
+// values as ze_device_type_t.
+typedef enum _zes_device_type_t
+{
+  ZES_DEVICE_TYPE_GPU = 1,
+  ZES_DEVICE_TYPE_CPU = 2,
+  ZES_DEVICE_TYPE_FPGA = 3,
+  ZES_DEVICE_TYPE_MCA = 4,
+  ZES_DEVICE_TYPE_VPU = 5,
+  ZES_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
+} zes_device_type_t;
+
+// Bit flags stored in ze_device_properties_t.flags.
+typedef uint32_t ze_device_property_flags_t;
+typedef enum _ze_device_property_flag_t
+{
+  ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
+  ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
+  ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
+  ZE_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
+  ZE_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
+} ze_device_property_flag_t;
+
+// Bit flags stored in zes_device_ext_properties_t.flags; same bit positions
+// as the ze_* flags above.
+typedef uint32_t zes_device_property_flags_t;
+typedef enum _zes_device_property_flag_t
+{
+  ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
+  ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
+  ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
+  ZES_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
+  ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
+} zes_device_property_flag_t;
+
+// Core device properties, embedded as the `core` member of
+// zes_device_properties_t.
+// NOTE(review): the library fills this struct by layout, so field order and
+// sizes must match the upstream Level-Zero header — verify when updating.
+typedef struct _ze_device_properties_t
+{
+  ze_structure_type_t stype;
+  void *pNext;
+  ze_device_type_t type;
+  uint32_t vendorId;
+  uint32_t deviceId;
+  ze_device_property_flags_t flags;
+  uint32_t subdeviceId;
+  uint32_t coreClockRate;
+  uint64_t maxMemAllocSize;
+  uint32_t maxHardwareContexts;
+  uint32_t maxCommandQueuePriority;
+  uint32_t numThreadsPerEU;
+  uint32_t physicalEUSimdWidth;
+  uint32_t numEUsPerSubslice;
+  uint32_t numSubslicesPerSlice;
+  uint32_t numSlices;
+  uint64_t timerResolution;
+  uint32_t timestampValidBits;
+  uint32_t kernelTimestampValidBits;
+  ze_device_uuid_t uuid;
+  char name[ZE_MAX_DEVICE_NAME];
+} ze_device_properties_t;
+
+// Output of zesDeviceGetProperties. The caller sets `stype` and `pNext`
+// before the call; the string members are what the verbose logging prints.
+typedef struct _zes_device_properties_t
+{
+  zes_structure_type_t stype;
+  void *pNext;
+  ze_device_properties_t core;
+  uint32_t numSubdevices;
+  char serialNumber[ZES_STRING_PROPERTY_SIZE];
+  char boardNumber[ZES_STRING_PROPERTY_SIZE];
+  char brandName[ZES_STRING_PROPERTY_SIZE];
+  char modelName[ZES_STRING_PROPERTY_SIZE];
+  char vendorName[ZES_STRING_PROPERTY_SIZE];
+  char driverVersion[ZES_STRING_PROPERTY_SIZE];
+} zes_device_properties_t;
+
+// Extended device properties, chained through zes_device_properties_t.pNext.
+typedef struct _zes_device_ext_properties_t
+{
+  zes_structure_type_t stype;
+  void *pNext;
+  zes_uuid_t uuid;
+  zes_device_type_t type;
+  zes_device_property_flags_t flags;
+} zes_device_ext_properties_t;
+
+// Static properties of a memory module (parameter type of
+// zesMemoryGetProperties).
+typedef struct _zes_mem_properties_t
+{
+  zes_structure_type_t stype;
+  void *pNext;
+  zes_mem_type_t type;
+  ze_bool_t onSubdevice;
+  uint32_t subdeviceId;
+  zes_mem_loc_t location;
+  uint64_t physicalSize;
+  int32_t busWidth;
+  int32_t numChannels;
+} zes_mem_properties_t;
+
+// Current state of a memory module as filled in by zesMemoryGetState.
+// oneapi_check_vram sums `size` into the total and `free` into the free
+// amount it reports.
+typedef struct _zes_mem_state_t
+{
+  zes_structure_type_t stype;
+  const void *pNext;
+  zes_mem_health_t health;
+  uint64_t free;
+  uint64_t size;
+} zes_mem_state_t;
+
+// Handle to a dynamically loaded Level-Zero library plus the function
+// pointers that oneapi_init resolves from it by symbol name.
+typedef struct oneapi_handle
+{
+  // Library handle returned by LOAD_LIBRARY; NULL when not initialized.
+  void *handle;
+  // Non-zero enables the LOG output in the oneapi_* functions.
+  uint16_t verbose;
+  ze_result_t (*zesInit)(int);
+  // Called with phDrivers == NULL to query the count, then again to fill
+  // the array.
+  ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers);
+  ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount,
+                              zes_device_handle_t *phDevices);
+  ze_result_t (*zesDeviceGetProperties)(zes_device_handle_t hDevice,
+                                        zes_device_properties_t *pProperties);
+  ze_result_t (*zesDeviceEnumMemoryModules)(zes_device_handle_t hDevice,
+                                            uint32_t *pCount,
+                                            zes_mem_handle_t *phMemory);
+  ze_result_t (*zesMemoryGetProperties)(zes_mem_handle_t hMemory,
+                                        zes_mem_properties_t *pProperties);
+  ze_result_t (*zesMemoryGetState)(zes_mem_handle_t hMemory,
+                                   zes_mem_state_t *pState);
+
+} oneapi_handle_t;
+
+// Result of oneapi_init.
+typedef struct oneapi_init_resp
+{
+  char *err; // If err is non-null handle is invalid
+  // Count written by zesDriverGet during oneapi_init.
+  // NOTE(review): despite the name this is the number of drivers, not
+  // devices — confirm that callers expect that.
+  int num_devices;
+  oneapi_handle_t oh;
+} oneapi_init_resp_t;
+
+// Version query response. Not referenced by any code visible in this change.
+typedef struct oneapi_version_resp
+{
+  ze_result_t status;
+  char *str; // Contains version or error string if status != 0
+} oneapi_version_resp_t;
+
+// Loads the library at oneapi_lib_path and fills *resp; resp->err is non-NULL
+// on failure.
+void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
+// Sums total and free memory over all memory modules into *resp; resp->err is
+// non-NULL on failure.
+void oneapi_check_vram(oneapi_handle_t rh, mem_info_t *resp);
+
+#endif // __GPU_INFO_ONEAPI_H__
+#endif // __APPLE__
--- a/gpu/gpu_oneapi.go
+++ b/gpu/gpu_oneapi.go
@ -0,0 +1,21 @@
+//go:build linux || windows
+
+package gpu
+
+import (
+	"log/slog"
+	"strings"
+)
+
+// oneapiGetVisibleDevicesEnv returns the environment variable name
+// ONEAPI_DEVICE_SELECTOR together with a value of the form
+// "level_zero:<id>,<id>,..." built from the IDs of the oneapi entries in
+// gpuInfo. Entries for any other library are logged and left out.
+func oneapiGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
+	selected := make([]string, 0, len(gpuInfo))
+	for i := range gpuInfo {
+		if lib := gpuInfo[i].Library; lib != "oneapi" {
+			// TODO shouldn't happen if things are wired correctly...
+			slog.Debug("oneapiGetVisibleDevicesEnv skipping over non-sycl device", "library", lib)
+			continue
+		}
+		selected = append(selected, gpuInfo[i].ID)
+	}
+	selector := "level_zero:" + strings.Join(selected, ",")
+	return "ONEAPI_DEVICE_SELECTOR", selector
+}
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@ -206,6 +206,36 @@ if [ -d "${CUDA_LIB_DIR}" ]; then

 fi

+if [ -z "${ONEAPI_ROOT}" ]; then
+    # Try the default location in case it exists
+    ONEAPI_ROOT=/opt/intel/oneapi
+fi
+
+# Build the SYCL (oneAPI) runner only when a oneAPI installation is present.
+if [ -d "${ONEAPI_ROOT}" ]; then
+    echo "OneAPI libraries detected - building dynamic OneAPI library"
+    init_vars
+    # Quoted so that an ONEAPI_ROOT containing whitespace is still sourced as one path.
+    source "${ONEAPI_ROOT}/setvars.sh" --force # set up environment variables for oneAPI
+    CC=icx
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL=ON -DLLAMA_SYCL_F16=OFF"
+    BUILD_DIR="../build/linux/${ARCH}/oneapi"
+    EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
+    DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
+    build
+
+    # copy oneAPI dependencies
+    # First the sycl/mkl/tbb libraries that ldd resolves for the built server...
+    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do
+        cp "${dep}" "${BUILD_DIR}/bin/"
+    done
+    # ...then a fixed list of compiler runtime libraries copied by name.
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${BUILD_DIR}/bin/"
+    compress
+fi
+
 if [ -z "${ROCM_PATH}" ]; then
    # Try the default location in case it exists
    ROCM_PATH=/opt/rocm
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@ -289,6 +289,49 @@ function build_cuda() {
    }
 }

+# Builds the SYCL (oneAPI) runner and copies the oneAPI runtime DLLs next to it.
+# Runs only when ONEAPI_ROOT is set and OLLAMA_SKIP_ONEAPI_GENERATE is not.
+function build_oneapi() {
+  # Gate on a oneAPI-specific switch; reusing the CUDA switch made it impossible
+  # to skip one backend without also skipping the other.
+  if ((-not "${env:OLLAMA_SKIP_ONEAPI_GENERATE}") -and ("${env:ONEAPI_ROOT}"))  {
+    # Get oneAPI version
+    $script:ONEAPI_VERSION = icpx --version
+    $script:ONEAPI_VERSION = [regex]::Match($script:ONEAPI_VERSION, '(?<=oneAPI DPC\+\+/C\+\+ Compiler )(?<version>\d+\.\d+\.\d+)').Value
+    # A failed match yields an empty string rather than $null, so test for
+    # truthiness to avoid producing a bare "_v" suffix.
+    if ($script:ONEAPI_VERSION) {
+      $script:ONEAPI_VARIANT = "_v" + $script:ONEAPI_VERSION
+    }
+    init_vars
+    $script:buildDir = "../build/windows/${script:ARCH}/oneapi$script:ONEAPI_VARIANT"
+    $script:distDir ="$script:DIST_BASE\oneapi$script:ONEAPI_VARIANT"
+    $script:cmakeDefs += @(
+      "-G", "MinGW Makefiles",
+      "-DLLAMA_SYCL=ON",
+      "-DCMAKE_C_COMPILER=icx",
+      "-DCMAKE_CXX_COMPILER=icx",
+      "-DCMAKE_BUILD_TYPE=Release"
+    )
+
+    Write-Host "Building oneAPI"
+    build
+    # List the DLLs the built server depends on when dumpbin is available
+    if ($null -ne $script:DUMPBIN) {
+      & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | Select-String ".dll"
+    }
+    sign
+    install
+
+    # Copy the oneAPI compiler and MKL runtime DLLs into the dist directory
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:distDir}"
+  } else {
+    Write-Host "Skipping oneAPI generation step"
+  }
+}
+
 function build_rocm() {
    if ((-not "${env:OLLAMA_SKIP_ROCM_GENERATE}") -and ("${env:HIP_PATH}")) {
        $script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
@ -356,6 +399,7 @@ if ($($args.count) -eq 0) {
        build_cpu_avx
        build_cpu_avx2
        build_cuda
+        build_oneapi
        build_rocm
    }