From 29ab9fa7d730b9173418c6f6bb3ac4f1d7a6b014 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sat, 2 Nov 2024 16:35:41 -0700 Subject: [PATCH] nvidia libs have inconsistent ordering (#7473) The runtime and management libraries may not always have identical ordering, so use the device UUID to correlate instead of the device index. --- discover/gpu.go | 8 ++++++-- discover/gpu_info_nvml.c | 10 +++++----- discover/gpu_info_nvml.h | 4 ++-- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/discover/gpu.go b/discover/gpu.go index 4f2e0884..808c807b 100644 --- a/discover/gpu.go +++ b/discover/gpu.go @@ -316,7 +316,9 @@ func GetGPUInfo() GpuInfoList { // query the management library as well so we can record any skew between the two // which represents overhead on the GPU we must set aside on subsequent updates if cHandles.nvml != nil { - C.nvml_get_free(*cHandles.nvml, C.int(gpuInfo.index), &memInfo.free, &memInfo.total, &memInfo.used) + uuid := C.CString(gpuInfo.ID) + defer C.free(unsafe.Pointer(uuid)) + C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used) if memInfo.err != nil { slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err)) C.free(unsafe.Pointer(memInfo.err)) @@ -417,7 +419,9 @@ func GetGPUInfo() GpuInfoList { } for i, gpu := range cudaGPUs { if cHandles.nvml != nil { - C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used) + uuid := C.CString(gpu.ID) + defer C.free(unsafe.Pointer(uuid)) + C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used) } else if cHandles.cudart != nil { C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo) } else if cHandles.nvcuda != nil { diff --git a/discover/gpu_info_nvml.c b/discover/gpu_info_nvml.c index 11293e44..342a3aa4 100644 --- a/discover/gpu_info_nvml.c +++ b/discover/gpu_info_nvml.c @@ -17,7 +17,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) { } l[] = { 
{"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2}, {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown}, - {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex}, + {"nvmlDeviceGetHandleByUUID", (void *)&resp->ch.nvmlDeviceGetHandleByUUID}, {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo}, {NULL, NULL}, }; @@ -67,20 +67,20 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) { } -void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) { +void nvml_get_free(nvml_handle_t h, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used) { nvmlDevice_t device; nvmlMemory_t memInfo = {0}; nvmlReturn_t ret; - ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device); + ret = (*h.nvmlDeviceGetHandleByUUID)((const char *)(uuid), &device); if (ret != NVML_SUCCESS) { - LOG(1, "unable to get device handle %d: %d", device_id, ret); + LOG(1, "unable to get device handle %s: %d", uuid, ret); *free = 0; return; } ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo); if (ret != NVML_SUCCESS) { - LOG(1, "device memory info lookup failure %d: %d", device_id, ret); + LOG(1, "device memory info lookup failure %s: %d", uuid, ret); *free = 0; return; } diff --git a/discover/gpu_info_nvml.h b/discover/gpu_info_nvml.h index a661f723..90880233 100644 --- a/discover/gpu_info_nvml.h +++ b/discover/gpu_info_nvml.h @@ -25,7 +25,7 @@ typedef struct nvml_handle { uint16_t verbose; nvmlReturn_t (*nvmlInit_v2)(void); nvmlReturn_t (*nvmlShutdown)(void); - nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *); + nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *); nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *); } nvml_handle_t; @@ -41,7 +41,7 @@ typedef struct nvml_compute_capability { } nvml_compute_capability_t; void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp); -void nvml_get_free(nvml_handle_t ch, int device_id, uint64_t *free, uint64_t 
*total, uint64_t *used); +void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used); void nvml_release(nvml_handle_t ch); #endif // __GPU_INFO_NVML_H__