nvidia libs have inconsistent ordering (#7473)

The runtime and management libraries may not always have
identical ordering, so use the device UUID to correlate instead of ID.
This commit is contained in:
Daniel Hiltgen 2024-11-02 16:35:41 -07:00 committed by GitHub
parent b8d5036e33
commit 29ab9fa7d7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 13 additions and 9 deletions

View file

@ -316,7 +316,9 @@ func GetGPUInfo() GpuInfoList {
// query the management library as well so we can record any skew between the two // query the management library as well so we can record any skew between the two
// which represents overhead on the GPU we must set aside on subsequent updates // which represents overhead on the GPU we must set aside on subsequent updates
if cHandles.nvml != nil { if cHandles.nvml != nil {
C.nvml_get_free(*cHandles.nvml, C.int(gpuInfo.index), &memInfo.free, &memInfo.total, &memInfo.used) uuid := C.CString(gpuInfo.ID)
defer C.free(unsafe.Pointer(uuid))
C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
if memInfo.err != nil { if memInfo.err != nil {
slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err)) slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err)) C.free(unsafe.Pointer(memInfo.err))
@ -417,7 +419,9 @@ func GetGPUInfo() GpuInfoList {
} }
for i, gpu := range cudaGPUs { for i, gpu := range cudaGPUs {
if cHandles.nvml != nil { if cHandles.nvml != nil {
C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used) uuid := C.CString(gpu.ID)
defer C.free(unsafe.Pointer(uuid))
C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
} else if cHandles.cudart != nil { } else if cHandles.cudart != nil {
C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo) C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
} else if cHandles.nvcuda != nil { } else if cHandles.nvcuda != nil {

View file

@ -17,7 +17,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
} l[] = { } l[] = {
{"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2}, {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
{"nvmlShutdown", (void *)&resp->ch.nvmlShutdown}, {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex}, {"nvmlDeviceGetHandleByUUID", (void *)&resp->ch.nvmlDeviceGetHandleByUUID},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo}, {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
{NULL, NULL}, {NULL, NULL},
}; };
@ -67,20 +67,20 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
} }
void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) { void nvml_get_free(nvml_handle_t h, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used) {
nvmlDevice_t device; nvmlDevice_t device;
nvmlMemory_t memInfo = {0}; nvmlMemory_t memInfo = {0};
nvmlReturn_t ret; nvmlReturn_t ret;
ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device); ret = (*h.nvmlDeviceGetHandleByUUID)((const char *)(uuid), &device);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
LOG(1, "unable to get device handle %d: %d", device_id, ret); LOG(1, "unable to get device handle %s: %d", uuid, ret);
*free = 0; *free = 0;
return; return;
} }
ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo); ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
LOG(1, "device memory info lookup failure %d: %d", device_id, ret); LOG(1, "device memory info lookup failure %s: %d", uuid, ret);
*free = 0; *free = 0;
return; return;
} }

View file

@ -25,7 +25,7 @@ typedef struct nvml_handle {
uint16_t verbose; uint16_t verbose;
nvmlReturn_t (*nvmlInit_v2)(void); nvmlReturn_t (*nvmlInit_v2)(void);
nvmlReturn_t (*nvmlShutdown)(void); nvmlReturn_t (*nvmlShutdown)(void);
nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *); nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *); nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
} nvml_handle_t; } nvml_handle_t;
@ -41,7 +41,7 @@ typedef struct nvml_compute_capability {
} nvml_compute_capability_t; } nvml_compute_capability_t;
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp); void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
void nvml_get_free(nvml_handle_t ch, int device_id, uint64_t *free, uint64_t *total, uint64_t *used); void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used);
void nvml_release(nvml_handle_t ch); void nvml_release(nvml_handle_t ch);
#endif // __GPU_INFO_NVML_H__ #endif // __GPU_INFO_NVML_H__