From d74ce6bd4f78f8a06c97bf9b24485211c48a41d8 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Sat, 6 Jan 2024 21:40:04 -0800
Subject: [PATCH] Detect very old CUDA GPUs and fall back to CPU

If we try to load the CUDA library on an old GPU, it panics and crashes
the server.  This checks the compute capability before we load the
library so we can gracefully fall back to CPU mode.
---
 gpu/gpu.go          | 16 +++++++++++++-
 gpu/gpu_info_cuda.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-
 gpu/gpu_info_cuda.h |  8 +++++++
 3 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/gpu/gpu.go b/gpu/gpu.go
index b9f6e4e0..45b55ffb 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -28,6 +28,9 @@ type handles struct {
 var gpuMutex sync.Mutex
 var gpuHandles *handles = nil
 
+// CudaComputeMajorMin is the minimum CUDA compute capability major version required to select the "cuda" library. TODO verify this is the correct min version
+const CudaComputeMajorMin = 5
+
 // Note: gpuMutex must already be held
 func initGPUHandles() {
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
@@ -73,7 +76,18 @@ func GetGPUInfo() GpuInfo {
 			log.Printf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err))
 			C.free(unsafe.Pointer(memInfo.err))
 		} else {
-			resp.Library = "cuda"
+			// Verify minimum compute capability
+			var cc C.cuda_compute_capability_t
+			C.cuda_compute_capability(*gpuHandles.cuda, &cc)
+			if cc.err != nil {
+				log.Printf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err))
+				C.free(unsafe.Pointer(cc.err))
+			} else if cc.major >= CudaComputeMajorMin {
+				log.Printf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor)
+				resp.Library = "cuda"
+			} else {
+				log.Printf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor)
+			}
 		}
 	} else if gpuHandles.rocm != nil {
 		C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
diff --git a/gpu/gpu_info_cuda.c b/gpu/gpu_info_cuda.c
index f071f909..9dc97bd9 100644
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -21,7 +21,7 @@ const char *cuda_lib_paths[] = {
 };
 #endif
 
-#define CUDA_LOOKUP_SIZE 5
+#define CUDA_LOOKUP_SIZE 6
 
 void cuda_init(cuda_init_resp_t *resp) {
   nvmlReturn_t ret;
@@ -39,6 +39,7 @@ void cuda_init(cuda_init_resp_t *resp) {
       {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
       {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
       {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
+      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
   };
 
   for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
@@ -123,4 +124,53 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
     resp->free += memInfo.free;
   }
 }
+
+// Reports the lowest CUDA compute capability (major.minor) across all devices
+// enumerated through h. On failure resp->err is a strdup()'d message that the
+// caller must free(); on success resp->err is NULL.
+void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
+  nvmlDevice_t device;
+  nvmlReturn_t ret;
+  unsigned int devices = 0;
+  unsigned int i;  // unsigned, matching devices and getHandle's index argument
+  int major = 0, minor = 0;
+  char buf[256];  // fixed-size scratch space for error messages (avoids a VLA)
+
+  resp->err = NULL;
+  resp->major = 0;
+  resp->minor = 0;
+  if (h.handle == NULL) {
+    resp->err = strdup("nvml handle not initialized");
+    return;
+  }
+  ret = (*h.getCount)(&devices);
+  if (ret != NVML_SUCCESS) {
+    snprintf(buf, sizeof(buf), "unable to get device count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+  if (devices == 0) {  // fail explicitly instead of reporting a bogus 0.0
+    resp->err = strdup("no CUDA devices found");
+    return;
+  }
+  for (i = 0; i < devices; i++) {
+    ret = (*h.getHandle)(i, &device);
+    if (ret != NVML_SUCCESS) {
+      snprintf(buf, sizeof(buf), "unable to get device handle %u: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    ret = (*h.getComputeCapability)(device, &major, &minor);
+    if (ret != NVML_SUCCESS) {
+      snprintf(buf, sizeof(buf), "device compute capability lookup failure %u: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    // Report the lowest major.minor we detect as that limits our compatibility
+    if (i == 0 || major < resp->major || (major == resp->major && minor < resp->minor)) {
+      resp->major = major;
+      resp->minor = minor;
+    }
+  }
+}
 #endif  // __APPLE__
\ No newline at end of file
diff --git a/gpu/gpu_info_cuda.h b/gpu/gpu_info_cuda.h
index 9a66a735..81995ab2 100644
--- a/gpu/gpu_info_cuda.h
+++ b/gpu/gpu_info_cuda.h
@@ -22,6 +22,7 @@ typedef struct cuda_handle {
   nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
   nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
   nvmlReturn_t (*getCount)(unsigned int *);
+  nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
 } cuda_handle_t;
 
 typedef struct cuda_init_resp {
@@ -29,8 +30,15 @@ typedef struct cuda_init_resp {
   cuda_handle_t ch;
 } cuda_init_resp_t;
 
+typedef struct cuda_compute_capability {
+  char *err;  // NULL on success; otherwise a strdup()'d message the caller must free
+  int major;  // major version of the lowest compute capability detected
+  int minor;  // minor version belonging to that lowest major
+} cuda_compute_capability_t;
+
 void cuda_init(cuda_init_resp_t *resp);
 void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
+void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);
 
 #endif  // __GPU_INFO_CUDA_H__
 #endif  // __APPLE__
\ No newline at end of file