Merge pull request #2279 from remy415/main

Add support for libcudart.so for CUDA devices (Adds Jetson support)
2024-03-25 12:46:28 -07:00 · 2024-03-25 12:46:28 -07:00 · 28a64e23ca
commit 28a64e23ca
parent 92d74e2f59 dfc6721b20
8 changed files with 437 additions and 82 deletions
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@ -23,7 +23,8 @@ import (
 )

 type handles struct {
-	cuda *C.cuda_handle_t
+	nvml   *C.nvml_handle_t
+	cudart *C.cudart_handle_t
 }

 var gpuMutex sync.Mutex
@ -33,7 +34,7 @@ var gpuHandles *handles = nil
 var CudaComputeMin = [2]C.int{5, 0}

 // Possible locations for the nvidia-ml library
-var CudaLinuxGlobs = []string{
+var NvmlLinuxGlobs = []string{
 	"/usr/local/cuda/lib64/libnvidia-ml.so*",
 	"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
 	"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
@ -41,49 +42,98 @@ var CudaLinuxGlobs = []string{
 	"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
 	"/opt/cuda/lib64/libnvidia-ml.so*",
 	"/usr/lib*/libnvidia-ml.so*",
-	"/usr/local/lib*/libnvidia-ml.so*",
 	"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
 	"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
+	"/usr/local/lib*/libnvidia-ml.so*",

 	// TODO: are these stubs ever valid?
 	"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
 }

-var CudaWindowsGlobs = []string{
+var NvmlWindowsGlobs = []string{
 	"c:\\Windows\\System32\\nvml.dll",
 }

+var CudartLinuxGlobs = []string{
+	"/usr/local/cuda/lib64/libcudart.so*",
+	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
+	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
+	"/usr/lib/wsl/lib/libcudart.so*",
+	"/usr/lib/wsl/drivers/*/libcudart.so*",
+	"/opt/cuda/lib64/libcudart.so*",
+	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
+	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
+	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
+	"/usr/local/cuda/lib*/libcudart.so*",
+	"/usr/lib*/libcudart.so*",
+	"/usr/local/lib*/libcudart.so*",
+}
+
+var CudartWindowsGlobs = []string{
+	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
+}
+
+// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
+// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
+var CudaTegra string = os.Getenv("JETSON_JETPACK")
+
 // Note: gpuMutex must already be held
 func initGPUHandles() {

 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing

-	gpuHandles = &handles{nil}
-	var cudaMgmtName string
-	var cudaMgmtPatterns []string
+	gpuHandles = &handles{nil, nil}
+	var nvmlMgmtName string
+	var nvmlMgmtPatterns []string
+	var cudartMgmtName string
+	var cudartMgmtPatterns []string
+
+	tmpDir, _ := PayloadsDir()
 	switch runtime.GOOS {
 	case "windows":
-		cudaMgmtName = "nvml.dll"
-		cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs))
-		copy(cudaMgmtPatterns, CudaWindowsGlobs)
+		nvmlMgmtName = "nvml.dll"
+		nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
+		copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
+		cudartMgmtName = "cudart64_*.dll"
+		localAppData := os.Getenv("LOCALAPPDATA")
+		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
+		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
 	case "linux":
-		cudaMgmtName = "libnvidia-ml.so"
-		cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs))
-		copy(cudaMgmtPatterns, CudaLinuxGlobs)
+		nvmlMgmtName = "libnvidia-ml.so"
+		nvmlMgmtPatterns = make([]string, len(NvmlLinuxGlobs))
+		copy(nvmlMgmtPatterns, NvmlLinuxGlobs)
+		cudartMgmtName = "libcudart.so*"
+		if tmpDir != "" {
+			// TODO - add "payloads" for subprocess
+			cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
+		}
+		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
 	default:
 		return
 	}

 	slog.Info("Detecting GPU type")
-	cudaLibPaths := FindGPULibs(cudaMgmtName, cudaMgmtPatterns)
-	if len(cudaLibPaths) > 0 {
-		cuda := LoadCUDAMgmt(cudaLibPaths)
-		if cuda != nil {
-			slog.Info("Nvidia GPU detected")
-			gpuHandles.cuda = cuda
+	cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
+	if len(cudartLibPaths) > 0 {
+		cudart := LoadCUDARTMgmt(cudartLibPaths)
+		if cudart != nil {
+			slog.Info("Nvidia GPU detected via cudart")
+			gpuHandles.cudart = cudart
 			return
 		}
 	}
+
+	// TODO once we build confidence, remove this and the gpu_info_nvml.[ch] files
+	nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
+	if len(nvmlLibPaths) > 0 {
+		nvml := LoadNVMLMgmt(nvmlLibPaths)
+		if nvml != nil {
+			slog.Info("Nvidia GPU detected via nvidia-ml")
+			gpuHandles.nvml = nvml
+			return
+		}
+	}
+
 }

 func GetGPUInfo() GpuInfo {
@ -103,23 +153,42 @@ func GetGPUInfo() GpuInfo {

 	var memInfo C.mem_info_t
 	resp := GpuInfo{}
-	if gpuHandles.cuda != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
-		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
+	if gpuHandles.nvml != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
+		C.nvml_check_vram(*gpuHandles.nvml, &memInfo)
 		if memInfo.err != nil {
-			slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
+			slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU memory: %s", C.GoString(memInfo.err)))
 			C.free(unsafe.Pointer(memInfo.err))
 		} else if memInfo.count > 0 {
 			// Verify minimum compute capability
-			var cc C.cuda_compute_capability_t
-			C.cuda_compute_capability(*gpuHandles.cuda, &cc)
+			var cc C.nvml_compute_capability_t
+			C.nvml_compute_capability(*gpuHandles.nvml, &cc)
 			if cc.err != nil {
-				slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
+				slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU compute capability: %s", C.GoString(cc.err)))
 				C.free(unsafe.Pointer(cc.err))
 			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
-				slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
+				slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 				resp.Library = "cuda"
 			} else {
-				slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
+				slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
+			}
+		}
+	} else if gpuHandles.cudart != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
+		C.cudart_check_vram(*gpuHandles.cudart, &memInfo)
+		if memInfo.err != nil {
+			slog.Info(fmt.Sprintf("[cudart] error looking up CUDART GPU memory: %s", C.GoString(memInfo.err)))
+			C.free(unsafe.Pointer(memInfo.err))
+		} else if memInfo.count > 0 {
+			// Verify minimum compute capability
+			var cc C.cudart_compute_capability_t
+			C.cudart_compute_capability(*gpuHandles.cudart, &cc)
+			if cc.err != nil {
+				slog.Info(fmt.Sprintf("[cudart] error looking up CUDA compute capability: %s", C.GoString(cc.err)))
+				C.free(unsafe.Pointer(cc.err))
+			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
+				slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
+				resp.Library = "cuda"
+			} else {
+				slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 			}
 		}
 	} else {
@ -176,6 +245,11 @@ func CheckVRAM() (int64, error) {
 		if overhead < gpus*1024*1024*1024 {
 			overhead = gpus * 1024 * 1024 * 1024
 		}
+		// Assigning full reported free memory for Tegras due to OS controlled caching.
+		if CudaTegra != "" {
+			// Tegra device detected: reserve no overhead
+			overhead = 0
+		}
 		avail := int64(gpuInfo.FreeMemory - overhead)
 		slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
 		return avail, nil
@ -238,15 +312,32 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
 	return gpuLibPaths
 }

-func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
-	var resp C.cuda_init_resp_t
+func LoadNVMLMgmt(nvmlLibPaths []string) *C.nvml_handle_t {
+	var resp C.nvml_init_resp_t
 	resp.ch.verbose = getVerboseState()
-	for _, libPath := range cudaLibPaths {
+	for _, libPath := range nvmlLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
-		C.cuda_init(lib, &resp)
+		C.nvml_init(lib, &resp)
 		if resp.err != nil {
-			slog.Info(fmt.Sprintf("Unable to load CUDA management library %s: %s", libPath, C.GoString(resp.err)))
+			slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			return &resp.ch
+		}
+	}
+	return nil
+}
+
+func LoadCUDARTMgmt(cudartLibPaths []string) *C.cudart_handle_t {
+	var resp C.cudart_init_resp_t
+	resp.ch.verbose = getVerboseState()
+	for _, libPath := range cudartLibPaths {
+		lib := C.CString(libPath)
+		defer C.free(unsafe.Pointer(lib))
+		C.cudart_init(lib, &resp)
+		if resp.err != nil {
+			slog.Info(fmt.Sprintf("Unable to load cudart CUDA management library %s: %s", libPath, C.GoString(resp.err)))
 			C.free(unsafe.Pointer(resp.err))
 		} else {
 			return &resp.ch
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@ -52,7 +52,8 @@ void cpu_check_ram(mem_info_t *resp);
 }
 #endif

-#include "gpu_info_cuda.h"
+#include "gpu_info_nvml.h"
+#include "gpu_info_cudart.h"

 #endif  // __GPU_INFO_H__
 #endif  // __APPLE__
--- a/gpu/gpu_info_cudart.c
+++ b/gpu/gpu_info_cudart.c
@ -0,0 +1,190 @@
+#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
+
+#include <string.h>
+#include "gpu_info_cudart.h"
+
+// cudart_init loads the CUDA runtime library at cudart_lib_path, resolves the
+// entry points named in the lookup table into resp->ch, and selects device 0
+// to confirm the runtime is usable.
+//
+// resp->ch.verbose is read (not set) here, so the caller sets it beforehand.
+// On success resp->err is NULL and resp->ch.handle holds the library handle.
+// On failure resp->err is a strdup()'d message that the caller must free, and
+// resp->ch.handle is NULL.
+void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
+  cudartReturn_t ret;
+  resp->err = NULL;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  // Symbol name -> address of the function-pointer slot in the handle.
+  // The table is terminated by a {NULL, NULL} entry.
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+      {"cudaSetDevice", (void *)&resp->ch.cudaSetDevice},
+      {"cudaDeviceSynchronize", (void *)&resp->ch.cudaDeviceSynchronize},
+      {"cudaDeviceReset", (void *)&resp->ch.cudaDeviceReset},
+      {"cudaMemGetInfo", (void *)&resp->ch.cudaMemGetInfo},
+      {"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
+      {"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
+      {"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
+      {NULL, NULL},
+  };
+
+  resp->ch.handle = LOAD_LIBRARY(cudart_lib_path, RTLD_LAZY);
+  if (!resp->ch.handle) {
+    char *msg = LOAD_ERR();
+    LOG(resp->ch.verbose, "library %s load err: %s\n", cudart_lib_path, msg);
+    snprintf(buf, buflen,
+            "Unable to load %s library to query for Nvidia GPUs: %s",
+            cudart_lib_path, msg);
+    free(msg);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->ch.verbose, "wiring cudart library functions in %s\n", cudart_lib_path);
+
+  for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+
+    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
+    // Test the resolved symbol, not l[i].p: l[i].p is the address of the slot
+    // inside resp->ch and is never NULL, so checking it would let a missing
+    // symbol through as a NULL function pointer.
+    if (!*l[i].p) {
+      char *msg = LOAD_ERR();
+      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->ch.handle);
+      resp->ch.handle = NULL;
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+              msg);
+      free(msg);
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+
+  // Selecting device 0 doubles as the "is the runtime usable" probe.
+  ret = (*resp->ch.cudaSetDevice)(0);
+  if (ret != CUDART_SUCCESS) {
+    LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    snprintf(buf, buflen, "cudart init failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  int version = 0;
+  cudartDriverVersion_t driverVersion;
+  driverVersion.major = 0;
+  driverVersion.minor = 0;
+
+  // Report driver version if we're in verbose mode, ignore errors
+  ret = (*resp->ch.cudaDriverGetVersion)(&version);
+  if (ret != CUDART_SUCCESS) {
+    LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
+  } else {
+    // The reported integer is decoded as major * 1000 + minor * 10.
+    driverVersion.major = version / 1000;
+    driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
+    LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
+  }
+}
+
+
+// cudart_check_vram queries every device reported by cudaGetDeviceCount and
+// accumulates their total and free memory into resp->total and resp->free.
+// resp->count is set to the device count.
+//
+// On failure resp->err is a strdup()'d message that the caller must free. If
+// the failure happens part-way through the device loop, resp->total and
+// resp->free hold the partial sums collected so far.
+void cudart_check_vram(cudart_handle_t h, mem_info_t *resp) {
+  resp->err = NULL;
+  cudartMemory_t memInfo = {0,0,0};
+  cudartReturn_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  if (h.handle == NULL) {
+    resp->err = strdup("cudart handle isn't initialized");
+    return;
+  }
+
+  // cudaGetDeviceCount takes int type, resp-> count is uint
+  int deviceCount;
+  ret = (*h.cudaGetDeviceCount)(&deviceCount);
+  if (ret != CUDART_SUCCESS) {
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  } else {
+    resp->count = (unsigned int)deviceCount;
+  }
+
+  resp->total = 0;
+  resp->free = 0;
+  for (i = 0; i < resp->count; i++) {
+    // cudaMemGetInfo takes no device argument, so select the device first.
+    ret = (*h.cudaSetDevice)(i);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "cudart device failed to initialize");
+      resp->err = strdup(buf);
+      return;
+    }
+    ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
+      resp->err = strdup(buf);
+      return;
+    }
+
+    // memInfo fields are size_t; cast explicitly so the format specifier is
+    // correct on platforms where unsigned long is narrower than size_t.
+    LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, (unsigned long long)memInfo.total);
+    LOG(h.verbose, "[%d] CUDA freeMem %llu\n", i, (unsigned long long)memInfo.free);
+
+    resp->total += memInfo.total;
+    resp->free += memInfo.free;
+  }
+}
+
+// cudart_compute_capability reports the lowest compute capability
+// (major.minor) found across all devices returned by cudaGetDeviceCount.
+//
+// resp->major and resp->minor start at 0 and stay 0 when no devices are found.
+// On failure resp->err is a strdup()'d message that the caller must free.
+void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *resp) {
+  resp->err = NULL;
+  resp->major = 0;
+  resp->minor = 0;
+  int major = 0;
+  int minor = 0;
+  cudartReturn_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  if (h.handle == NULL) {
+    resp->err = strdup("cudart handle not initialized");
+    return;
+  }
+
+  int devices;
+  ret = (*h.cudaGetDeviceCount)(&devices);
+  if (ret != CUDART_SUCCESS) {
+    snprintf(buf, buflen, "unable to get cudart device count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  for (i = 0; i < devices; i++) {
+    ret = (*h.cudaSetDevice)(i);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "cudart device failed to initialize");
+      resp->err = strdup(buf);
+      return;
+    }
+
+    // Major and minor are fetched as two separate attributes of device i.
+    ret = (*h.cudaDeviceGetAttribute)(&major, cudartDevAttrComputeCapabilityMajor, i);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    ret = (*h.cudaDeviceGetAttribute)(&minor, cudartDevAttrComputeCapabilityMinor, i);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+      
+    // Report the lowest major.minor we detect as that limits our compatibility
+    // (resp->major == 0 means no device has been recorded yet.)
+    if (resp->major == 0 || resp->major > major ) {
+      resp->major = major;
+      resp->minor = minor;
+    } else if ( resp->major == major && resp->minor > minor ) {
+      resp->minor = minor;
+    }
+  }
+}
+
+#endif  // __APPLE__
--- a/gpu/gpu_info_cudart.h
+++ b/gpu/gpu_info_cudart.h
@ -0,0 +1,59 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_CUDART_H__
+#define __GPU_INFO_CUDART_H__
+#include "gpu_info.h"
+
+// Just enough typedef's to dlopen/dlsym for memory information
+typedef enum cudartReturn_enum {
+  CUDART_SUCCESS = 0,
+  CUDART_UNSUPPORTED = 1,
+  // Other values omitted for now...
+} cudartReturn_t;
+
+typedef enum cudartDeviceAttr_enum {
+  cudartDevAttrComputeCapabilityMajor = 75,
+  cudartDevAttrComputeCapabilityMinor = 76,
+} cudartDeviceAttr_t;
+
+typedef void *cudartDevice_t;  // Opaque is sufficient
+typedef struct cudartMemory_st {
+  size_t total;
+  size_t free;
+  size_t used;
+} cudartMemory_t;
+
+typedef struct cudartDriverVersion {
+  int major;
+  int minor;
+} cudartDriverVersion_t;
+
+// Handle for a dynamically loaded CUDA runtime library. cudart_init stores the
+// library handle in `handle` and fills each function pointer by looking up the
+// symbol of the same name.
+typedef struct cudart_handle {
+  void *handle;  // result of LOAD_LIBRARY; NULL when the library is not loaded
+  uint16_t verbose;  // passed to LOG() as its first argument
+  cudartReturn_t (*cudaSetDevice)(int device);
+  cudartReturn_t (*cudaDeviceSynchronize)(void);
+  cudartReturn_t (*cudaDeviceReset)(void);
+  // Called in gpu_info_cudart.c as (&free, &total), in that order.
+  cudartReturn_t (*cudaMemGetInfo)(size_t *, size_t *);
+  cudartReturn_t (*cudaGetDeviceCount)(int *);
+  cudartReturn_t (*cudaDeviceGetAttribute)(int* value, cudartDeviceAttr_t attr, int device);
+  cudartReturn_t (*cudaDriverGetVersion) (int *driverVersion);
+} cudart_handle_t;
+
+typedef struct cudart_init_resp {
+  char *err;  // If err is non-null handle is invalid
+  cudart_handle_t ch;
+} cudart_init_resp_t;
+
+typedef struct cudart_compute_capability {
+  char *err;
+  int major;
+  int minor;
+} cudart_compute_capability_t;
+
+
+void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
+void cudart_check_vram(cudart_handle_t ch, mem_info_t *resp);
+void cudart_compute_capability(cudart_handle_t th, cudart_compute_capability_t *cc);
+
+#endif  // __GPU_INFO_CUDART_H__
+#endif  // __APPLE__
--- a/gpu/gpu_info_nvml.c
+++ b/gpu/gpu_info_nvml.c
@ -1,10 +1,10 @@
 #ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?

-#include "gpu_info_cuda.h"
-
 #include <string.h>

-void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
+#include "gpu_info_nvml.h"
+
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
  nvmlReturn_t ret;
  resp->err = NULL;
  const int buflen = 256;
@ -30,20 +30,20 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
      {NULL, NULL},
  };

-  resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
+  resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
  if (!resp->ch.handle) {
    char *msg = LOAD_ERR();
-    LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
+    LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
    snprintf(buf, buflen,
             "Unable to load %s library to query for Nvidia GPUs: %s",
-             cuda_lib_path, msg);
+             nvml_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }

  // TODO once we've squashed the remaining corner cases remove this log
-  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
+  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
  
  for (i = 0; l[i].s != NULL; i++) {
    // TODO once we've squashed the remaining corner cases remove this log
@ -82,7 +82,7 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
  }
 }

-void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
+void nvml_check_vram(nvml_handle_t h, mem_info_t *resp) {
  resp->err = NULL;
  nvmlDevice_t device;
  nvmlMemory_t memInfo = {0};
@ -92,7 +92,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
  int i;

  if (h.handle == NULL) {
-    resp->err = strdup("nvml handle sn't initialized");
+    resp->err = strdup("nvml handle isn't initialized");
    return;
  }

@ -155,15 +155,15 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
      }
    }

-    LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, memInfo.total);
-    LOG(h.verbose, "[%d] CUDA usedMem %llu\n", i, memInfo.used);
+    LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
+    LOG(h.verbose, "[%d] CUDA freeMem %ld\n", i, memInfo.free);

    resp->total += memInfo.total;
    resp->free += memInfo.free;
  }
 }

-void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
+void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) {
  resp->err = NULL;
  resp->major = 0;
  resp->minor = 0;
--- a/gpu/gpu_info_nvml.h
+++ b/gpu/gpu_info_nvml.h
@ -1,6 +1,6 @@
 #ifndef __APPLE__
-#ifndef __GPU_INFO_CUDA_H__
-#define __GPU_INFO_CUDA_H__
+#ifndef __GPU_INFO_NVML_H__
+#define __GPU_INFO_NVML_H__
 #include "gpu_info.h"

 // Just enough typedef's to dlopen/dlsym for memory information
@ -20,7 +20,7 @@ typedef enum nvmlBrandType_enum
    NVML_BRAND_UNKNOWN          = 0,
 } nvmlBrandType_t;

-typedef struct cuda_handle {
+typedef struct nvml_handle {
  void *handle;
  uint16_t verbose;
  nvmlReturn_t (*nvmlInit_v2)(void);
@ -35,22 +35,22 @@ typedef struct cuda_handle {
  nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int  length);
  nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int  length);
  nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
-} cuda_handle_t;
+} nvml_handle_t;

-typedef struct cuda_init_resp {
+typedef struct nvml_init_resp {
  char *err;  // If err is non-null handle is invalid
-  cuda_handle_t ch;
-} cuda_init_resp_t;
+  nvml_handle_t ch;
+} nvml_init_resp_t;

-typedef struct cuda_compute_capability {
+typedef struct nvml_compute_capability {
  char *err;
  int major;
  int minor;
-} cuda_compute_capability_t;
+} nvml_compute_capability_t;

-void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp);
-void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
-void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
+void nvml_check_vram(nvml_handle_t ch, mem_info_t *resp);
+void nvml_compute_capability(nvml_handle_t ch, nvml_compute_capability_t *cc);

-#endif  // __GPU_INFO_CUDA_H__
+#endif  // __GPU_INFO_NVML_H__
 #endif  // __APPLE__
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@ -90,30 +90,35 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
            compress_libs
        fi

-        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
+        if [ "${ARCH}" == "x86_64" ]; then
            #
-            # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
-            # Approximately 400% faster than LCD on same CPU
+            # Only build the AVX variants on x86_64: ARM chips (Apple M1/M2/M3-based Macs, NVIDIA Tegra devices) do not currently support AVX extensions.
            #
-            init_vars
-            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
-            echo "Building AVX CPU"
-            build
-            compress_libs
-        fi
+            if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
+                #
+                # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
+                # Approximately 400% faster than LCD on same CPU
+                #
+                init_vars
+                CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+                BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
+                echo "Building AVX CPU"
+                build
+                compress_libs
+            fi

-        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
-            #
-            # ~2013 CPU Dynamic library
-            # Approximately 10% faster than AVX on same CPU
-            #
-            init_vars
-            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
-            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
-            echo "Building AVX2 CPU"
-            build
-            compress_libs
+            if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
+                #
+                # ~2013 CPU Dynamic library
+                # Approximately 10% faster than AVX on same CPU
+                #
+                init_vars
+                CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+                BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
+                echo "Building AVX2 CPU"
+                build
+                compress_libs
+            fi
        fi
    fi
 else
@ -142,12 +147,21 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
    if [ -n "${CUDA_MAJOR}" ]; then
        CUDA_VARIANT=_v${CUDA_MAJOR}
    fi
-    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+    if [ "${ARCH}" == "arm64" ]; then
+        echo "ARM CPU detected - disabling unsupported AVX instructions"
+        
+        # ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
+        #
+        # CUDA compute < 6.0 lacks proper FP16 support on ARM. 
+        # Disabling has minimal performance effect while maintaining compatibility. 
+        ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
+    fi
+    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
    build

-    # Cary the CUDA libs as payloads to help reduce dependency burden on users
+    # Carry the CUDA libs as payloads to help reduce dependency burden on users
    #
    # TODO - in the future we may shift to packaging these separately and conditionally
    #        downloading them in the install script.