diff --git a/gpu/gpu.go b/gpu/gpu.go index f82bab85..fb120ea5 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -40,11 +40,13 @@ var CudaLinuxGlobs = []string{ "/usr/lib/wsl/lib/libnvidia-ml.so*", "/usr/lib/wsl/drivers/*/libnvidia-ml.so*", "/opt/cuda/lib64/libnvidia-ml.so*", - "/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*", "/usr/lib*/libnvidia-ml.so*", "/usr/local/lib*/libnvidia-ml.so*", "/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*", "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*", + + // TODO: are these stubs ever valid? + "/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*", } var CudaWindowsGlobs = []string{ diff --git a/gpu/gpu_info_cuda.c b/gpu/gpu_info_cuda.c index 5d619436..9299b22c 100644 --- a/gpu/gpu_info_cuda.c +++ b/gpu/gpu_info_cuda.c @@ -4,8 +4,6 @@ #include -#define CUDA_LOOKUP_SIZE 12 - void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) { nvmlReturn_t ret; resp->err = NULL; @@ -16,24 +14,26 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) { struct lookup { char *s; void **p; - } l[CUDA_LOOKUP_SIZE] = { - {"nvmlInit_v2", (void *)&resp->ch.initFn}, - {"nvmlShutdown", (void *)&resp->ch.shutdownFn}, - {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle}, - {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo}, - {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount}, - {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability}, + } l[] = { + {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2}, + {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown}, + {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex}, + {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo}, + {"nvmlDeviceGetCount_v2", (void *)&resp->ch.nvmlDeviceGetCount_v2}, + {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.nvmlDeviceGetCudaComputeCapability}, {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion}, {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName}, {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial}, {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion}, {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber}, {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand}, + {NULL, NULL}, }; resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY); if (!resp->ch.handle) { char *msg = LOAD_ERR(); + LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg); snprintf(buf, buflen, "Unable to load %s library to query for Nvidia GPUs: %s", cuda_lib_path, msg); @@ -42,12 +42,19 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) { return; } - for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list + // TODO once we've squashed the remaining corner cases remove this log + LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path); + + for (i = 0; l[i].s != NULL; i++) { + // TODO once we've squashed the remaining corner cases remove this log + LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s); + *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); if (!l[i].p) { - UNLOAD_LIBRARY(resp->ch.handle); resp->ch.handle = NULL; char *msg = LOAD_ERR(); + LOG(resp->ch.verbose, "dlerr: %s\n", msg); + UNLOAD_LIBRARY(resp->ch.handle); snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg); free(msg); @@ -56,8 +63,9 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) { } } - ret = (*resp->ch.initFn)(); + ret = (*resp->ch.nvmlInit_v2)(); if (ret != NVML_SUCCESS) { + LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret); UNLOAD_LIBRARY(resp->ch.handle); resp->ch.handle = NULL; snprintf(buf, buflen, "nvml vram init failure: %d", ret); @@ -87,7 +95,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) { return; } - ret = (*h.getCount)(&resp->count); + ret = (*h.nvmlDeviceGetCount_v2)(&resp->count); if (ret != NVML_SUCCESS) { snprintf(buf, buflen, "unable to get device count: %d", ret); resp->err = strdup(buf); @@ -97,14 +105,14 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) { resp->total = 0; resp->free = 0; for (i = 0; i < resp->count; i++) { - ret = (*h.getHandle)(i, &device); + ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device); if (ret != NVML_SUCCESS) { snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret); resp->err = strdup(buf); return; } - ret = (*h.getMemInfo)(device, &memInfo); + ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo); if (ret != NVML_SUCCESS) { snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret); resp->err = strdup(buf); @@ -172,7 +180,7 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) { } unsigned int devices; - ret = (*h.getCount)(&devices); + ret = (*h.nvmlDeviceGetCount_v2)(&devices); if (ret != NVML_SUCCESS) { snprintf(buf, buflen, "unable to get device count: %d", ret); resp->err = strdup(buf); @@ -180,14 +188,14 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) { } for (i = 0; i < devices; i++) { - ret = (*h.getHandle)(i, &device); + ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device); if (ret != NVML_SUCCESS) { snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret); resp->err = strdup(buf); return; } - ret = (*h.getComputeCapability)(device, &major, &minor); + ret = (*h.nvmlDeviceGetCudaComputeCapability)(device, &major, &minor); if (ret != NVML_SUCCESS) { snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret); resp->err = strdup(buf); diff --git a/gpu/gpu_info_cuda.h b/gpu/gpu_info_cuda.h index 41f1d424..5b1a27f5 100644 --- a/gpu/gpu_info_cuda.h +++ b/gpu/gpu_info_cuda.h @@ -23,12 +23,12 @@ typedef enum nvmlBrandType_enum typedef struct cuda_handle { void *handle; uint16_t verbose; - nvmlReturn_t (*initFn)(void); - nvmlReturn_t (*shutdownFn)(void); - nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *); - nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *); - nvmlReturn_t (*getCount)(unsigned int *); - nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor); + nvmlReturn_t (*nvmlInit_v2)(void); + nvmlReturn_t (*nvmlShutdown)(void); + nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *); + nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *); + nvmlReturn_t (*nvmlDeviceGetCount_v2)(unsigned int *); + nvmlReturn_t (*nvmlDeviceGetCudaComputeCapability)(nvmlDevice_t, int* major, int* minor); nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int length); nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int length); nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int length); diff --git a/gpu/gpu_info_rocm.c b/gpu/gpu_info_rocm.c index 3bb57621..59ab0817 100644 --- a/gpu/gpu_info_rocm.c +++ b/gpu/gpu_info_rocm.c @@ -4,8 +4,6 @@ #include -#define ROCM_LOOKUP_SIZE 14 - void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { rsmi_status_t ret; resp->err = NULL; @@ -15,12 +13,12 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { struct lookup { char *s; void **p; - } l[ROCM_LOOKUP_SIZE] = { - {"rsmi_init", (void *)&resp->rh.initFn}, - {"rsmi_shut_down", (void *)&resp->rh.shutdownFn}, - {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn}, - {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn}, - {"rsmi_version_get", (void *)&resp->rh.versionGetFn}, + } l[] = { + {"rsmi_init", (void *)&resp->rh.rsmi_init}, + {"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down}, + {"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get}, + {"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get}, + {"rsmi_version_get", (void *)&resp->rh.rsmi_version_get}, {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices}, {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get}, {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get}, @@ -30,6 +28,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get}, {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get}, {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get}, + {NULL, NULL}, }; resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY); @@ -43,12 +42,19 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { return; } - for (i = 0; i < ROCM_LOOKUP_SIZE; i++) { + // TODO once we've squashed the remaining corner cases remove this log + LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path); + + for (i = 0; l[i].s != NULL; i++) { + // TODO once we've squashed the remaining corner cases remove this log + LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s); + *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s); if (!l[i].p) { - UNLOAD_LIBRARY(resp->rh.handle); resp->rh.handle = NULL; char *msg = LOAD_ERR(); + LOG(resp->rh.verbose, "dlerr: %s\n", msg); + UNLOAD_LIBRARY(resp->rh.handle); snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg); free(msg); @@ -57,8 +63,9 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { } } - ret = (*resp->rh.initFn)(0); + ret = (*resp->rh.rsmi_init)(0); if (ret != RSMI_STATUS_SUCCESS) { + LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret); UNLOAD_LIBRARY(resp->rh.handle); resp->rh.handle = NULL; snprintf(buf, buflen, "rocm vram init failure: %d", ret); @@ -141,13 +148,13 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) { } // Get total memory - used memory for available memory - ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem); + ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem); if (ret != RSMI_STATUS_SUCCESS) { snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret); resp->err = strdup(buf); return; } - ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem); + ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem); if (ret != RSMI_STATUS_SUCCESS) { snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret); resp->err = strdup(buf); @@ -170,7 +177,7 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) { } rsmi_version_t ver; rsmi_status_t ret; - ret = h.versionGetFn(&ver); + ret = h.rsmi_version_get(&ver); if (ret != RSMI_STATUS_SUCCESS) { snprintf(buf, buflen, "unexpected response on version lookup %d", ret); resp->status = 1; diff --git a/gpu/gpu_info_rocm.h b/gpu/gpu_info_rocm.h index f2a5b782..0a8d50c0 100644 --- a/gpu/gpu_info_rocm.h +++ b/gpu/gpu_info_rocm.h @@ -25,11 +25,11 @@ typedef enum rsmi_memory_type { typedef struct rocm_handle { void *handle; uint16_t verbose; - rsmi_status_t (*initFn)(uint64_t); - rsmi_status_t (*shutdownFn)(void); - rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); - rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); - rsmi_status_t (*versionGetFn) (rsmi_version_t *version); + rsmi_status_t (*rsmi_init)(uint64_t); + rsmi_status_t (*rsmi_shut_down)(void); + rsmi_status_t (*rsmi_dev_memory_total_get)(uint32_t, rsmi_memory_type_t, uint64_t *); + rsmi_status_t (*rsmi_dev_memory_usage_get)(uint32_t, rsmi_memory_type_t, uint64_t *); + rsmi_status_t (*rsmi_version_get) (rsmi_version_t *version); rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *); rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *); rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);