gpu: read memory info from all cuda devices (#1802)

* gpu: read memory info from all cuda devices

* add `LOOKUP_SIZE` constant

* better constant name

* address comments
This commit is contained in:
Jeffrey Morgan 2024-01-05 11:25:58 -05:00 committed by GitHub
parent 3367b5f3df
commit df32537312
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 29 additions and 13 deletions

View file

@ -20,6 +20,8 @@ const char *cuda_lib_paths[] = {
};
#endif
#define CUDA_LOOKUP_SIZE 5
void cuda_init(cuda_init_resp_t *resp) {
nvmlReturn_t ret;
resp->err = NULL;
@ -30,11 +32,12 @@ void cuda_init(cuda_init_resp_t *resp) {
struct lookup {
char *s;
void **p;
} l[4] = {
} l[CUDA_LOOKUP_SIZE] = {
{"nvmlInit_v2", (void *)&resp->ch.initFn},
{"nvmlShutdown", (void *)&resp->ch.shutdownFn},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
};
for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
@ -52,7 +55,7 @@ void cuda_init(cuda_init_resp_t *resp) {
return;
}
for (i = 0; i < 4; i++) { // TODO - fix this to use a null terminated list
for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(resp->ch.handle);
@ -89,22 +92,34 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
return;
}
// TODO - handle multiple GPUs
ret = (*h.getHandle)(0, &device);
unsigned int devices;
ret = (*h.getCount)(&devices);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle: %d", ret);
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
return;
}
resp->total = 0;
resp->free = 0;
for (i = 0; i < devices; i++) {
ret = (*h.getHandle)(i, &device);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
ret = (*h.getMemInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
resp->total = memInfo.total;
resp->free = memInfo.free;
return;
resp->total += memInfo.total;
resp->free += memInfo.free;
}
}
#endif // __APPLE__

View file

@ -21,6 +21,7 @@ typedef struct cuda_handle {
nvmlReturn_t (*shutdownFn)(void);
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
nvmlReturn_t (*getCount)(unsigned int *);
} cuda_handle_t;
typedef struct cuda_init_resp {