diff --git a/gpu/gpu.go b/gpu/gpu.go
index b4124f35..f82bab85 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -259,6 +259,7 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
 
 func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
 	var resp C.cuda_init_resp_t
+	resp.ch.verbose = getVerboseState()
 	for _, libPath := range cudaLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
@@ -275,6 +276,7 @@ func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
 
 func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
 	var resp C.rocm_init_resp_t
+	resp.rh.verbose = getVerboseState()
 	for _, libPath := range rocmLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
@@ -288,3 +290,12 @@ func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
 	}
 	return nil
 }
+
+// getVerboseState returns 1 when the OLLAMA_DEBUG environment variable is set
+// to a non-empty value and 0 otherwise, as the C.uint16_t the C handles store.
+func getVerboseState() C.uint16_t {
+	if os.Getenv("OLLAMA_DEBUG") != "" {
+		return C.uint16_t(1)
+	}
+	return C.uint16_t(0)
+}
diff --git a/gpu/gpu_info.h b/gpu/gpu_info.h
index 5ba19271..f32efa8e 100644
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -27,6 +27,15 @@
 
 #endif
 
+// LOG writes a printf-style message to stderr only when `verbose` is non-zero.
+// The do/while(0) wrapper makes the macro usable as a single statement.
+#define LOG(verbose, ...) \
+  do { \
+    if (verbose) { \
+      fprintf(stderr, __VA_ARGS__); \
+    } \
+  } while (0)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/gpu/gpu_info_cuda.c b/gpu/gpu_info_cuda.c
index 4ee0d8f9..5d619436 100644
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -4,7 +4,7 @@
 
 #include <string.h>
 
-#define CUDA_LOOKUP_SIZE 6
+#define CUDA_LOOKUP_SIZE 12
 
 void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
   nvmlReturn_t ret;
@@ -23,6 +23,12 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
       {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
       {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
       {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
+      {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
+      {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
+      {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
+      {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
+      {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
+      {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
   };
 
   resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
@@ -58,7 +64,14 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
     resp->err = strdup(buf);
+    return;  // keep the previous early exit: no further library calls once an error is recorded
   }
 
-  return;
+  // Report driver version if we're in verbose mode, ignore errors
+  ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
+  if (ret != NVML_SUCCESS) {
+    LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
+  } else {
+    LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
+  }
 }
 
 void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
@@ -98,6 +111,44 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
       resp->err = strdup(buf);
       return;
     }
+    if (h.verbose) {
+      nvmlBrandType_t brand = 0;
+      // When in verbose mode, report more information about
+      // the card we discover, but don't fail on error
+      ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetBrand)(device, &brand);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
+      }
+    }
+
+    LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, memInfo.total);
+    LOG(h.verbose, "[%d] CUDA freeMem %llu\n", i, memInfo.free);
 
     resp->total += memInfo.total;
     resp->free += memInfo.free;
diff --git a/gpu/gpu_info_cuda.h b/gpu/gpu_info_cuda.h
index 61c48012..41f1d424 100644
--- a/gpu/gpu_info_cuda.h
+++ b/gpu/gpu_info_cuda.h
@@ -15,14 +15,26 @@ typedef struct nvmlMemory_st {
   unsigned long long used;
 } nvmlMemory_t;
 
+typedef enum nvmlBrandType_enum
+{
+  NVML_BRAND_UNKNOWN = 0,
+} nvmlBrandType_t;
+
 typedef struct cuda_handle {
   void *handle;
+  uint16_t verbose;
   nvmlReturn_t (*initFn)(void);
   nvmlReturn_t (*shutdownFn)(void);
   nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
   nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
   nvmlReturn_t (*getCount)(unsigned int *);
   nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
+  nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int length);
+  nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int length);
+  nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int length);
+  nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int length);
+  nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int length);
+  nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
 } cuda_handle_t;
 
 typedef struct cuda_init_resp {
diff --git a/gpu/gpu_info_rocm.c b/gpu/gpu_info_rocm.c
index 845274e1..3bb57621 100644
--- a/gpu/gpu_info_rocm.c
+++ b/gpu/gpu_info_rocm.c
@@ -4,7 +4,7 @@
 
 #include <string.h>
 
-#define ROCM_LOOKUP_SIZE 5
+#define ROCM_LOOKUP_SIZE 14
 
 void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
   rsmi_status_t ret;
@@ -21,7 +21,15 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
       {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
       {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
       {"rsmi_version_get", (void *)&resp->rh.versionGetFn},
-      // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
+      {"rsmi_num_monitor_devices", (void *)&resp->rh.rsmi_num_monitor_devices},
+      {"rsmi_dev_id_get", (void *)&resp->rh.rsmi_dev_id_get},
+      {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
+      {"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
+      {"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
+      {"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
+      {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
+      {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
+      {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
   };
 
   resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
@@ -62,8 +70,6 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
 
 void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
   resp->err = NULL;
-  // uint32_t num_devices;
-  // uint16_t device;
   uint64_t totalMem = 0;
   uint64_t usedMem = 0;
   rsmi_status_t ret;
@@ -76,34 +82,82 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
     return;
   }
 
-  // TODO - iterate through devices... ret =
-  // rsmi_num_monitor_devices(&num_devices);
-
-  // ret = (*h.getHandle)(0, &device);
-  // if (ret != RSMI_STATUS_SUCCESS) {
-  //   printf("rocm vram device lookup failure: %d\n", ret);
-  //   return -1;
-  // }
-
-  // Get total memory - used memory for available memory
-  ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
+  ret = (*h.rsmi_num_monitor_devices)(&resp->count);
   if (ret != RSMI_STATUS_SUCCESS) {
-    snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-  ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
-  if (ret != RSMI_STATUS_SUCCESS) {
-    snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
     resp->err = strdup(buf);
     return;
   }
+  LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);
 
-  // TODO: set this to the actual number of devices
-  resp->count = 1;
-  resp->total = totalMem;
-  resp->free = totalMem - usedMem;
-  return;
+  resp->total = 0;
+  resp->free = 0;
+  for (i = 0; i < resp->count; i++) {
+    if (h.verbose) {
+      // When in verbose mode, report more information about
+      // the card we discover, but don't fail on error
+      ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
+      }
+    }
+
+    // Get total memory - used memory for available memory
+    ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
+    if (ret != RSMI_STATUS_SUCCESS) {
+      snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
+    if (ret != RSMI_STATUS_SUCCESS) {
+      snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    LOG(h.verbose, "[%d] ROCm totalMem %llu\n", i, (unsigned long long)totalMem);
+    LOG(h.verbose, "[%d] ROCm usedMem %llu\n", i, (unsigned long long)usedMem);
+    resp->total += totalMem;
+    resp->free += totalMem - usedMem;
+  }
 }
 
 void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
diff --git a/gpu/gpu_info_rocm.h b/gpu/gpu_info_rocm.h
index 90d9a09f..f2a5b782 100644
--- a/gpu/gpu_info_rocm.h
+++ b/gpu/gpu_info_rocm.h
@@ -24,12 +24,21 @@ typedef enum rsmi_memory_type {
 
 typedef struct rocm_handle {
   void *handle;
+  uint16_t verbose;
   rsmi_status_t (*initFn)(uint64_t);
   rsmi_status_t (*shutdownFn)(void);
   rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
   rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
   rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
-  // rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
+  rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
+  rsmi_status_t (*rsmi_dev_id_get) (uint32_t, uint16_t *);
+  rsmi_status_t (*rsmi_dev_name_get) (uint32_t, char *, size_t);
+  rsmi_status_t (*rsmi_dev_brand_get) (uint32_t, char *, uint32_t);
+  rsmi_status_t (*rsmi_dev_vendor_name_get) (uint32_t, char *, size_t);
+  rsmi_status_t (*rsmi_dev_vram_vendor_get) (uint32_t, char *, uint32_t);
+  rsmi_status_t (*rsmi_dev_serial_number_get) (uint32_t, char *, uint32_t);
+  rsmi_status_t (*rsmi_dev_subsystem_name_get) (uint32_t, char *, size_t);
+  rsmi_status_t (*rsmi_dev_vbios_version_get) (uint32_t, char *, uint32_t);
 } rocm_handle_t;
 
 typedef struct rocm_init_resp {