From 013fd071395087019b4e05959d8249f28ac11930 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Wed, 24 Jan 2024 10:32:00 -0800
Subject: [PATCH] More logging for gpu management

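Fix an ordering glitch between dlerr and dlclose (the load error string is
now fetched and logged before the library is unloaded) and add more
logging to help root-cause some crashes users are hitting. This also
renames the function pointers to match the underlying library function
names instead of simplified names, for readability, and replaces the
fixed-size symbol lookup tables with NULL-terminated lists.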
---
 gpu/gpu.go          |  4 +++-
 gpu/gpu_info_cuda.c | 44 ++++++++++++++++++++++++++------------------
 gpu/gpu_info_cuda.h | 12 ++++++------
 gpu/gpu_info_rocm.c | 35 +++++++++++++++++++++--------------
 gpu/gpu_info_rocm.h | 10 +++++-----
 5 files changed, 61 insertions(+), 44 deletions(-)

diff --git a/gpu/gpu.go b/gpu/gpu.go
index f82bab85..fb120ea5 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -40,11 +40,13 @@ var CudaLinuxGlobs = []string{
 	"/usr/lib/wsl/lib/libnvidia-ml.so*",
 	"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
 	"/opt/cuda/lib64/libnvidia-ml.so*",
-	"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
 	"/usr/lib*/libnvidia-ml.so*",
 	"/usr/local/lib*/libnvidia-ml.so*",
 	"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
 	"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
+
+	// TODO: are these stubs ever valid?
+	"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
 }
 
 var CudaWindowsGlobs = []string{
diff --git a/gpu/gpu_info_cuda.c b/gpu/gpu_info_cuda.c
index 5d619436..9299b22c 100644
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -4,8 +4,6 @@
 
 #include <string.h>
 
-#define CUDA_LOOKUP_SIZE 12
-
 void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
   nvmlReturn_t ret;
   resp->err = NULL;
@@ -16,24 +14,26 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
   struct lookup {
     char *s;
     void **p;
-  } l[CUDA_LOOKUP_SIZE] = {
-      {"nvmlInit_v2", (void *)&resp->ch.initFn},
-      {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
-      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
-      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
-      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
-      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
+  } l[] = {
+      {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
+      {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
+      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
+      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
+      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.nvmlDeviceGetCount_v2},
+      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.nvmlDeviceGetCudaComputeCapability},
       {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
       {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
       {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
       {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
       {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
       {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
+      {NULL, NULL},
   };
 
   resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
   if (!resp->ch.handle) {
     char *msg = LOAD_ERR();
+    LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
     snprintf(buf, buflen,
              "Unable to load %s library to query for Nvidia GPUs: %s",
              cuda_lib_path, msg);
@@ -42,12 +42,19 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
     return;
   }
 
-  for (i = 0; i < CUDA_LOOKUP_SIZE; i++) {  // TODO - fix this to use a null terminated list
+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
+  
+  for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
     if (!l[i].p) {
-      UNLOAD_LIBRARY(resp->ch.handle);
       resp->ch.handle = NULL;
       char *msg = LOAD_ERR();
+      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->ch.handle);
       snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
                msg);
       free(msg);
@@ -56,8 +63,9 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
     }
   }
 
-  ret = (*resp->ch.initFn)();
+  ret = (*resp->ch.nvmlInit_v2)();
   if (ret != NVML_SUCCESS) {
+    LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
     UNLOAD_LIBRARY(resp->ch.handle);
     resp->ch.handle = NULL;
     snprintf(buf, buflen, "nvml vram init failure: %d", ret);
@@ -87,7 +95,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
     return;
   }
 
-  ret = (*h.getCount)(&resp->count);
+  ret = (*h.nvmlDeviceGetCount_v2)(&resp->count);
   if (ret != NVML_SUCCESS) {
     snprintf(buf, buflen, "unable to get device count: %d", ret);
     resp->err = strdup(buf);
@@ -97,14 +105,14 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
   resp->total = 0;
   resp->free = 0;
   for (i = 0; i < resp->count; i++) {
-    ret = (*h.getHandle)(i, &device);
+    ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
     if (ret != NVML_SUCCESS) {
       snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
       resp->err = strdup(buf);
       return;
     }
 
-    ret = (*h.getMemInfo)(device, &memInfo);
+    ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
     if (ret != NVML_SUCCESS) {
       snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
       resp->err = strdup(buf);
@@ -172,7 +180,7 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
   }
 
   unsigned int devices;
-  ret = (*h.getCount)(&devices);
+  ret = (*h.nvmlDeviceGetCount_v2)(&devices);
   if (ret != NVML_SUCCESS) {
     snprintf(buf, buflen, "unable to get device count: %d", ret);
     resp->err = strdup(buf);
@@ -180,14 +188,14 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
   }
 
   for (i = 0; i < devices; i++) {
-    ret = (*h.getHandle)(i, &device);
+    ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
     if (ret != NVML_SUCCESS) {
       snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
       resp->err = strdup(buf);
       return;
     }
 
-    ret = (*h.getComputeCapability)(device, &major, &minor);
+    ret = (*h.nvmlDeviceGetCudaComputeCapability)(device, &major, &minor);
     if (ret != NVML_SUCCESS) {
       snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
       resp->err = strdup(buf);
diff --git a/gpu/gpu_info_cuda.h b/gpu/gpu_info_cuda.h
index 41f1d424..5b1a27f5 100644
--- a/gpu/gpu_info_cuda.h
+++ b/gpu/gpu_info_cuda.h
@@ -23,12 +23,12 @@ typedef enum nvmlBrandType_enum
 typedef struct cuda_handle {
   void *handle;
   uint16_t verbose;
-  nvmlReturn_t (*initFn)(void);
-  nvmlReturn_t (*shutdownFn)(void);
-  nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
-  nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
-  nvmlReturn_t (*getCount)(unsigned int *);
-  nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
+  nvmlReturn_t (*nvmlInit_v2)(void);
+  nvmlReturn_t (*nvmlShutdown)(void);
+  nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
+  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
+  nvmlReturn_t (*nvmlDeviceGetCount_v2)(unsigned int *);
+  nvmlReturn_t (*nvmlDeviceGetCudaComputeCapability)(nvmlDevice_t, int* major, int* minor);
   nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int  length);
   nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int  length);
   nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int  length);
diff --git a/gpu/gpu_info_rocm.c b/gpu/gpu_info_rocm.c
index 3bb57621..59ab0817 100644
--- a/gpu/gpu_info_rocm.c
+++ b/gpu/gpu_info_rocm.c
@@ -4,8 +4,6 @@
 
 #include <string.h>
 
-#define ROCM_LOOKUP_SIZE 14
-
 void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
   rsmi_status_t ret;
   resp->err = NULL;
@@ -15,12 +13,12 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
   struct lookup {
     char *s;
     void **p;
-  } l[ROCM_LOOKUP_SIZE] = {
-      {"rsmi_init", (void *)&resp->rh.initFn},
-      {"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
-      {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
-      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
-      {"rsmi_version_get", (void *)&resp->rh.versionGetFn},
+  } l[] = {
+      {"rsmi_init", (void *)&resp->rh.rsmi_init},
+      {"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down},
+      {"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get},
+      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get},
+      {"rsmi_version_get", (void *)&resp->rh.rsmi_version_get},
       {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
       {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
       {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
@@ -30,6 +28,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
       {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
       {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
       {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
+      {NULL, NULL},
   };
 
   resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
@@ -43,12 +42,19 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
     return;
   }
 
-  for (i = 0; i < ROCM_LOOKUP_SIZE; i++) {
+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path);
+
+  for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s);
+
     *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
     if (!l[i].p) {
-      UNLOAD_LIBRARY(resp->rh.handle);
       resp->rh.handle = NULL;
       char *msg = LOAD_ERR();
+      LOG(resp->rh.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->rh.handle);
       snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
                msg);
       free(msg);
@@ -57,8 +63,9 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
     }
   }
 
-  ret = (*resp->rh.initFn)(0);
+  ret = (*resp->rh.rsmi_init)(0);
   if (ret != RSMI_STATUS_SUCCESS) {
+    LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret);
     UNLOAD_LIBRARY(resp->rh.handle);
     resp->rh.handle = NULL;
     snprintf(buf, buflen, "rocm vram init failure: %d", ret);
@@ -141,13 +148,13 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
     }
 
     // Get total memory - used memory for available memory
-    ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
+    ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
     if (ret != RSMI_STATUS_SUCCESS) {
       snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
       resp->err = strdup(buf);
       return;
     }
-    ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
+    ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
     if (ret != RSMI_STATUS_SUCCESS) {
       snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
       resp->err = strdup(buf);
@@ -170,7 +177,7 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
   }
   rsmi_version_t ver;
   rsmi_status_t ret;
-  ret = h.versionGetFn(&ver);
+  ret = h.rsmi_version_get(&ver);
   if (ret != RSMI_STATUS_SUCCESS) {
     snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
     resp->status = 1;
diff --git a/gpu/gpu_info_rocm.h b/gpu/gpu_info_rocm.h
index f2a5b782..0a8d50c0 100644
--- a/gpu/gpu_info_rocm.h
+++ b/gpu/gpu_info_rocm.h
@@ -25,11 +25,11 @@ typedef enum rsmi_memory_type {
 typedef struct rocm_handle {
   void *handle;
   uint16_t verbose;
-  rsmi_status_t (*initFn)(uint64_t);
-  rsmi_status_t (*shutdownFn)(void);
-  rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
-  rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
-  rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
+  rsmi_status_t (*rsmi_init)(uint64_t);
+  rsmi_status_t (*rsmi_shut_down)(void);
+  rsmi_status_t (*rsmi_dev_memory_total_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
+  rsmi_status_t (*rsmi_dev_memory_usage_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
+  rsmi_status_t (*rsmi_version_get) (rsmi_version_t *version);
   rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
   rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
   rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);