From 987c16b2f755c0ac7f3c3b03d4b875dffc96a551 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Mon, 22 Jan 2024 16:03:32 -0800
Subject: [PATCH] Report more information about GPUs in verbose mode

This adds additional calls to both CUDA and ROCm management libraries to
discover additional attributes about the GPU(s) detected in the system, and
wires up runtime verbosity selection.  When users hit problems with GPUs we can
ask them to run with `OLLAMA_DEBUG=1 ollama serve` and share the results.
---
 gpu/gpu.go          |   9 ++++
 gpu/gpu_info.h      |   7 +++
 gpu/gpu_info_cuda.c |  54 +++++++++++++++++++++-
 gpu/gpu_info_cuda.h |  12 +++++
 gpu/gpu_info_rocm.c | 108 +++++++++++++++++++++++++++++++++-----------
 gpu/gpu_info_rocm.h |  11 ++++-
 6 files changed, 171 insertions(+), 30 deletions(-)

diff --git a/gpu/gpu.go b/gpu/gpu.go
index b4124f35..f82bab85 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -259,6 +259,7 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
 
 func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
 	var resp C.cuda_init_resp_t
+	resp.ch.verbose = getVerboseState()
 	for _, libPath := range cudaLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
@@ -275,6 +276,7 @@ func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
 
 func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
 	var resp C.rocm_init_resp_t
+	resp.rh.verbose = getVerboseState()
 	for _, libPath := range rocmLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
@@ -288,3 +290,10 @@ func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
 	}
 	return nil
 }
+
+func getVerboseState() C.uint16_t {
+	if os.Getenv("OLLAMA_DEBUG") == "" {
+		return C.uint16_t(0) // unset or empty: verbose GPU reporting stays off
+	}
+	return C.uint16_t(1) // any non-empty value switches it on
+}
diff --git a/gpu/gpu_info.h b/gpu/gpu_info.h
index 5ba19271..f32efa8e 100644
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -27,6 +27,13 @@
 
 #endif
 
+// Print a printf-style message to stderr, but only when `verbose` is
+// non-zero; the do/while(0) wrapper makes the macro act as one statement.
+#define LOG(verbose, ...) \
+  do { \
+    if (verbose) { fprintf(stderr, __VA_ARGS__); } \
+  } while (0)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/gpu/gpu_info_cuda.c b/gpu/gpu_info_cuda.c
index 4ee0d8f9..5d619436 100644
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -4,7 +4,7 @@
 
 #include <string.h>
 
-#define CUDA_LOOKUP_SIZE 6
+#define CUDA_LOOKUP_SIZE 12
 
 void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
   nvmlReturn_t ret;
@@ -23,6 +23,12 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
       {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
       {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
       {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
+      {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
+      {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
+      {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
+      {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
+      {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
+      {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
   };
 
   resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
@@ -58,7 +64,13 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
     resp->err = strdup(buf);
   }
 
-  return;
+  // Best-effort driver version query: LOG prints only in verbose mode, and a failure here never sets resp->err
+  ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
+  if (ret != NVML_SUCCESS) {
+    LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
+  } else {
+    LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
+  }
 }
 
 void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
@@ -98,6 +110,44 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
       resp->err = strdup(buf);
       return;
     }
+    if (h.verbose) {
+      nvmlBrandType_t brand = 0;
+      // When in verbose mode, report more information about the card we
+      // discover, but don't fail on error (results are checked against NVML_SUCCESS)
+      ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetBrand)(device, &brand);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
+      }
+    }
+    // Casts keep the arguments in step with %llu on every platform; usedMem reports memInfo.used
+    LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, (unsigned long long)memInfo.total);
+    LOG(h.verbose, "[%d] CUDA usedMem %llu\n", i, (unsigned long long)memInfo.used);
 
     resp->total += memInfo.total;
     resp->free += memInfo.free;
diff --git a/gpu/gpu_info_cuda.h b/gpu/gpu_info_cuda.h
index 61c48012..41f1d424 100644
--- a/gpu/gpu_info_cuda.h
+++ b/gpu/gpu_info_cuda.h
@@ -15,14 +15,26 @@ typedef struct nvmlMemory_st {
   unsigned long long used;
 } nvmlMemory_t;
 
+typedef enum nvmlBrandType_enum
+{
+    NVML_BRAND_UNKNOWN          = 0, // the only enumerator declared in this local copy of the enum
+} nvmlBrandType_t;
+
 typedef struct cuda_handle {
   void *handle;
+  uint16_t verbose; // non-zero enables LOG() diagnostics; assigned by LoadCUDAMgmt in gpu.go
   nvmlReturn_t (*initFn)(void);
   nvmlReturn_t (*shutdownFn)(void);
   nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
   nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
   nvmlReturn_t (*getCount)(unsigned int *);
   nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
+  nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int  length); // called from cuda_init to log the driver version
+  nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int  length); // this and the entries below are only called by cuda_check_vram when verbose is set
+  nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type); // result is logged as a raw integer
 } cuda_handle_t;
 
 typedef struct cuda_init_resp {
diff --git a/gpu/gpu_info_rocm.c b/gpu/gpu_info_rocm.c
index 845274e1..3bb57621 100644
--- a/gpu/gpu_info_rocm.c
+++ b/gpu/gpu_info_rocm.c
@@ -4,7 +4,7 @@
 
 #include <string.h>
 
-#define ROCM_LOOKUP_SIZE 5
+#define ROCM_LOOKUP_SIZE 14
 
 void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
   rsmi_status_t ret;
@@ -21,7 +21,15 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
       {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
       {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
       {"rsmi_version_get", (void *)&resp->rh.versionGetFn},
-      // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
+      {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
+      {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
+      {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
+      {"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
+      {"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
+      {"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
+      {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
+      {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
+      {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
   };
 
   resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
@@ -62,8 +70,6 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
 
 void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
   resp->err = NULL;
-  // uint32_t num_devices;
-  // uint16_t device;
   uint64_t totalMem = 0;
   uint64_t usedMem = 0;
   rsmi_status_t ret;
@@ -76,34 +82,82 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
     return;
   }
 
-  // TODO - iterate through devices...  ret =
-  // rsmi_num_monitor_devices(&num_devices);
-
-  // ret = (*h.getHandle)(0, &device);
-  // if (ret != RSMI_STATUS_SUCCESS) {
-  //     printf("rocm vram device lookup failure: %d\n", ret);
-  //     return -1;
-  // }
-
-  // Get total memory - used memory for available memory
-  ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
+  ret = (*h.rsmi_num_monitor_devices)(&resp->count);
   if (ret != RSMI_STATUS_SUCCESS) {
-    snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-  ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
-  if (ret != RSMI_STATUS_SUCCESS) {
-    snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
     resp->err = strdup(buf);
     return;
   }
+  LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);
 
-  // TODO: set this to the actual number of devices
-  resp->count = 1;
-  resp->total = totalMem;
-  resp->free = totalMem - usedMem;
-  return;
+  resp->total = 0;
+  resp->free = 0;
+  for (i = 0; i < resp->count; i++) {  // one pass per device counted by rsmi_num_monitor_devices
+    if (h.verbose) {
+      // When in verbose mode, report more information about
+      // the card we discover, but don't fail on error
+      ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
+      }
+    }
+
+    // Get total memory - used memory for available memory
+    ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
+    if (ret != RSMI_STATUS_SUCCESS) {
+      snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
+      resp->err = strdup(buf);
+      return;  // unlike the verbose queries above, a memory lookup failure is fatal
+    }
+    ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
+    if (ret != RSMI_STATUS_SUCCESS) {
+      snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    LOG(h.verbose, "[%d] ROCm totalMem %llu\n", i, (unsigned long long)totalMem);  // uint64_t: cast to match %llu
+    LOG(h.verbose, "[%d] ROCm usedMem %llu\n", i, (unsigned long long)usedMem);
+    resp->total += totalMem;
+    resp->free += totalMem - usedMem;
+  }
 }
 
 void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
diff --git a/gpu/gpu_info_rocm.h b/gpu/gpu_info_rocm.h
index 90d9a09f..f2a5b782 100644
--- a/gpu/gpu_info_rocm.h
+++ b/gpu/gpu_info_rocm.h
@@ -24,12 +24,21 @@ typedef enum rsmi_memory_type {
 
 typedef struct rocm_handle {
   void *handle;
+  uint16_t verbose;  // non-zero enables LOG() diagnostics; assigned by LoadROCMMgmt in gpu.go
   rsmi_status_t (*initFn)(uint64_t);
   rsmi_status_t (*shutdownFn)(void);
   rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
   rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
   rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
-  // rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
+  rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);  // device count that drives the rocm_check_vram loop
+  rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
+  rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);
+  rsmi_status_t (*rsmi_dev_brand_get) (uint32_t, char *, uint32_t);
+  rsmi_status_t (*rsmi_dev_vendor_name_get) (uint32_t, char *, size_t);  // length is size_t in rocm_smi.h
+  rsmi_status_t (*rsmi_dev_vram_vendor_get) (uint32_t, char *, uint32_t);
+  rsmi_status_t (*rsmi_dev_serial_number_get) (uint32_t, char *, uint32_t);
+  rsmi_status_t (*rsmi_dev_subsystem_name_get) (uint32_t, char *, size_t);  // length is size_t in rocm_smi.h
+  rsmi_status_t (*rsmi_dev_vbios_version_get) (uint32_t, char *, uint32_t);
 } rocm_handle_t;
 
 typedef struct rocm_init_resp {