gpu: read memory info from all cuda devices (#1802)
* gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments
This commit is contained in:
parent
3367b5f3df
commit
df32537312
2 changed files with 29 additions and 13 deletions
|
@ -20,6 +20,8 @@ const char *cuda_lib_paths[] = {
|
|||
};
|
||||
#endif
|
||||
|
||||
#define CUDA_LOOKUP_SIZE 5
|
||||
|
||||
void cuda_init(cuda_init_resp_t *resp) {
|
||||
nvmlReturn_t ret;
|
||||
resp->err = NULL;
|
||||
|
@ -30,11 +32,12 @@ void cuda_init(cuda_init_resp_t *resp) {
|
|||
struct lookup {
|
||||
char *s;
|
||||
void **p;
|
||||
} l[4] = {
|
||||
} l[CUDA_LOOKUP_SIZE] = {
|
||||
{"nvmlInit_v2", (void *)&resp->ch.initFn},
|
||||
{"nvmlShutdown", (void *)&resp->ch.shutdownFn},
|
||||
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
|
||||
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
|
||||
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
|
||||
};
|
||||
|
||||
for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
|
||||
|
@ -52,7 +55,7 @@ void cuda_init(cuda_init_resp_t *resp) {
|
|||
return;
|
||||
}
|
||||
|
||||
for (i = 0; i < 4; i++) { // TODO - fix this to use a null terminated list
|
||||
for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list
|
||||
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
|
||||
if (!l[i].p) {
|
||||
UNLOAD_LIBRARY(resp->ch.handle);
|
||||
|
@ -89,22 +92,34 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
|
|||
return;
|
||||
}
|
||||
|
||||
// TODO - handle multiple GPUs
|
||||
ret = (*h.getHandle)(0, &device);
|
||||
unsigned int devices;
|
||||
ret = (*h.getCount)(&devices);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
snprintf(buf, buflen, "unable to get device handle: %d", ret);
|
||||
snprintf(buf, buflen, "unable to get device count: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
resp->total = 0;
|
||||
resp->free = 0;
|
||||
|
||||
for (i = 0; i < devices; i++) {
|
||||
ret = (*h.getHandle)(i, &device);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
ret = (*h.getMemInfo)(device, &memInfo);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
|
||||
snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
resp->total = memInfo.total;
|
||||
resp->free = memInfo.free;
|
||||
return;
|
||||
|
||||
resp->total += memInfo.total;
|
||||
resp->free += memInfo.free;
|
||||
}
|
||||
}
|
||||
#endif // __APPLE__
|
|
@ -21,6 +21,7 @@ typedef struct cuda_handle {
|
|||
nvmlReturn_t (*shutdownFn)(void);
|
||||
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
|
||||
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
|
||||
nvmlReturn_t (*getCount)(unsigned int *);
|
||||
} cuda_handle_t;
|
||||
|
||||
typedef struct cuda_init_resp {
|
||||
|
|
Loading…
Reference in a new issue