nvidia libs have inconsistent ordering (#7473)
The runtime and management (NVML) libraries may not always enumerate devices in the same order, so use the device UUID to correlate them instead of the device index.
This commit is contained in:
parent
b8d5036e33
commit
29ab9fa7d7
3 changed files with 13 additions and 9 deletions
|
@ -316,7 +316,9 @@ func GetGPUInfo() GpuInfoList {
|
||||||
// query the management library as well so we can record any skew between the two
|
// query the management library as well so we can record any skew between the two
|
||||||
// which represents overhead on the GPU we must set aside on subsequent updates
|
// which represents overhead on the GPU we must set aside on subsequent updates
|
||||||
if cHandles.nvml != nil {
|
if cHandles.nvml != nil {
|
||||||
C.nvml_get_free(*cHandles.nvml, C.int(gpuInfo.index), &memInfo.free, &memInfo.total, &memInfo.used)
|
uuid := C.CString(gpuInfo.ID)
|
||||||
|
defer C.free(unsafe.Pointer(uuid))
|
||||||
|
C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
|
||||||
if memInfo.err != nil {
|
if memInfo.err != nil {
|
||||||
slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
|
slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
|
||||||
C.free(unsafe.Pointer(memInfo.err))
|
C.free(unsafe.Pointer(memInfo.err))
|
||||||
|
@ -417,7 +419,9 @@ func GetGPUInfo() GpuInfoList {
|
||||||
}
|
}
|
||||||
for i, gpu := range cudaGPUs {
|
for i, gpu := range cudaGPUs {
|
||||||
if cHandles.nvml != nil {
|
if cHandles.nvml != nil {
|
||||||
C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
|
uuid := C.CString(gpu.ID)
|
||||||
|
defer C.free(unsafe.Pointer(uuid))
|
||||||
|
C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
|
||||||
} else if cHandles.cudart != nil {
|
} else if cHandles.cudart != nil {
|
||||||
C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
|
C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
|
||||||
} else if cHandles.nvcuda != nil {
|
} else if cHandles.nvcuda != nil {
|
||||||
|
|
|
@ -17,7 +17,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
|
||||||
} l[] = {
|
} l[] = {
|
||||||
{"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
|
{"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
|
||||||
{"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
|
{"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
|
||||||
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
|
{"nvmlDeviceGetHandleByUUID", (void *)&resp->ch.nvmlDeviceGetHandleByUUID},
|
||||||
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
|
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
|
||||||
{NULL, NULL},
|
{NULL, NULL},
|
||||||
};
|
};
|
||||||
|
@ -67,20 +67,20 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
|
void nvml_get_free(nvml_handle_t h, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used) {
|
||||||
nvmlDevice_t device;
|
nvmlDevice_t device;
|
||||||
nvmlMemory_t memInfo = {0};
|
nvmlMemory_t memInfo = {0};
|
||||||
nvmlReturn_t ret;
|
nvmlReturn_t ret;
|
||||||
ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
|
ret = (*h.nvmlDeviceGetHandleByUUID)((const char *)(uuid), &device);
|
||||||
if (ret != NVML_SUCCESS) {
|
if (ret != NVML_SUCCESS) {
|
||||||
LOG(1, "unable to get device handle %d: %d", device_id, ret);
|
LOG(1, "unable to get device handle %s: %d", uuid, ret);
|
||||||
*free = 0;
|
*free = 0;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
|
ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
|
||||||
if (ret != NVML_SUCCESS) {
|
if (ret != NVML_SUCCESS) {
|
||||||
LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
|
LOG(1, "device memory info lookup failure %s: %d", uuid, ret);
|
||||||
*free = 0;
|
*free = 0;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,7 +25,7 @@ typedef struct nvml_handle {
|
||||||
uint16_t verbose;
|
uint16_t verbose;
|
||||||
nvmlReturn_t (*nvmlInit_v2)(void);
|
nvmlReturn_t (*nvmlInit_v2)(void);
|
||||||
nvmlReturn_t (*nvmlShutdown)(void);
|
nvmlReturn_t (*nvmlShutdown)(void);
|
||||||
nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
|
nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
|
||||||
nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
|
nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
|
||||||
} nvml_handle_t;
|
} nvml_handle_t;
|
||||||
|
|
||||||
|
@ -41,7 +41,7 @@ typedef struct nvml_compute_capability {
|
||||||
} nvml_compute_capability_t;
|
} nvml_compute_capability_t;
|
||||||
|
|
||||||
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
|
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
|
||||||
void nvml_get_free(nvml_handle_t ch, int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
|
void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used);
|
||||||
void nvml_release(nvml_handle_t ch);
|
void nvml_release(nvml_handle_t ch);
|
||||||
|
|
||||||
#endif // __GPU_INFO_NVML_H__
|
#endif // __GPU_INFO_NVML_H__
|
||||||
|
|
Loading…
Reference in a new issue