Merge pull request #2174 from dhiltgen/rocm_real_gpus

More logging for gpu management
This commit is contained in:
Daniel Hiltgen 2024-01-24 11:09:17 -08:00 committed by GitHub
commit a170888dd4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 61 additions and 44 deletions

View file

@ -40,11 +40,13 @@ var CudaLinuxGlobs = []string{
"/usr/lib/wsl/lib/libnvidia-ml.so*", "/usr/lib/wsl/lib/libnvidia-ml.so*",
"/usr/lib/wsl/drivers/*/libnvidia-ml.so*", "/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
"/opt/cuda/lib64/libnvidia-ml.so*", "/opt/cuda/lib64/libnvidia-ml.so*",
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
"/usr/lib*/libnvidia-ml.so*", "/usr/lib*/libnvidia-ml.so*",
"/usr/local/lib*/libnvidia-ml.so*", "/usr/local/lib*/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*", "/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*", "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
// TODO: are these stubs ever valid?
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
} }
var CudaWindowsGlobs = []string{ var CudaWindowsGlobs = []string{

View file

@ -4,8 +4,6 @@
#include <string.h> #include <string.h>
#define CUDA_LOOKUP_SIZE 12
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) { void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
nvmlReturn_t ret; nvmlReturn_t ret;
resp->err = NULL; resp->err = NULL;
@ -16,24 +14,26 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
struct lookup { struct lookup {
char *s; char *s;
void **p; void **p;
} l[CUDA_LOOKUP_SIZE] = { } l[] = {
{"nvmlInit_v2", (void *)&resp->ch.initFn}, {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
{"nvmlShutdown", (void *)&resp->ch.shutdownFn}, {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle}, {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo}, {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount}, {"nvmlDeviceGetCount_v2", (void *)&resp->ch.nvmlDeviceGetCount_v2},
{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability}, {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.nvmlDeviceGetCudaComputeCapability},
{"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion}, {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
{"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName}, {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
{"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial}, {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
{"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion}, {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
{"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber}, {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
{"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand}, {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
{NULL, NULL},
}; };
resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY); resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
if (!resp->ch.handle) { if (!resp->ch.handle) {
char *msg = LOAD_ERR(); char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
snprintf(buf, buflen, snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s", "Unable to load %s library to query for Nvidia GPUs: %s",
cuda_lib_path, msg); cuda_lib_path, msg);
@ -42,12 +42,19 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
return; return;
} }
for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list // TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) { if (!l[i].p) {
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL; resp->ch.handle = NULL;
char *msg = LOAD_ERR(); char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg); msg);
free(msg); free(msg);
@ -56,8 +63,9 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
} }
} }
ret = (*resp->ch.initFn)(); ret = (*resp->ch.nvmlInit_v2)();
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle); UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL; resp->ch.handle = NULL;
snprintf(buf, buflen, "nvml vram init failure: %d", ret); snprintf(buf, buflen, "nvml vram init failure: %d", ret);
@ -87,7 +95,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
return; return;
} }
ret = (*h.getCount)(&resp->count); ret = (*h.nvmlDeviceGetCount_v2)(&resp->count);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret); snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
@ -97,14 +105,14 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
resp->total = 0; resp->total = 0;
resp->free = 0; resp->free = 0;
for (i = 0; i < resp->count; i++) { for (i = 0; i < resp->count; i++) {
ret = (*h.getHandle)(i, &device); ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret); snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
ret = (*h.getMemInfo)(device, &memInfo); ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret); snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
resp->err = strdup(buf); resp->err = strdup(buf);
@ -172,7 +180,7 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
} }
unsigned int devices; unsigned int devices;
ret = (*h.getCount)(&devices); ret = (*h.nvmlDeviceGetCount_v2)(&devices);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret); snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
@ -180,14 +188,14 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
} }
for (i = 0; i < devices; i++) { for (i = 0; i < devices; i++) {
ret = (*h.getHandle)(i, &device); ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret); snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
ret = (*h.getComputeCapability)(device, &major, &minor); ret = (*h.nvmlDeviceGetCudaComputeCapability)(device, &major, &minor);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret); snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
resp->err = strdup(buf); resp->err = strdup(buf);

View file

@ -23,12 +23,12 @@ typedef enum nvmlBrandType_enum
typedef struct cuda_handle { typedef struct cuda_handle {
void *handle; void *handle;
uint16_t verbose; uint16_t verbose;
nvmlReturn_t (*initFn)(void); nvmlReturn_t (*nvmlInit_v2)(void);
nvmlReturn_t (*shutdownFn)(void); nvmlReturn_t (*nvmlShutdown)(void);
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *); nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *); nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
nvmlReturn_t (*getCount)(unsigned int *); nvmlReturn_t (*nvmlDeviceGetCount_v2)(unsigned int *);
nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor); nvmlReturn_t (*nvmlDeviceGetCudaComputeCapability)(nvmlDevice_t, int* major, int* minor);
nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int length); nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int length); nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int length); nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int length);

View file

@ -4,8 +4,6 @@
#include <string.h> #include <string.h>
#define ROCM_LOOKUP_SIZE 14
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
rsmi_status_t ret; rsmi_status_t ret;
resp->err = NULL; resp->err = NULL;
@ -15,12 +13,12 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
struct lookup { struct lookup {
char *s; char *s;
void **p; void **p;
} l[ROCM_LOOKUP_SIZE] = { } l[] = {
{"rsmi_init", (void *)&resp->rh.initFn}, {"rsmi_init", (void *)&resp->rh.rsmi_init},
{"rsmi_shut_down", (void *)&resp->rh.shutdownFn}, {"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down},
{"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn}, {"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get},
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn}, {"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get},
{"rsmi_version_get", (void *)&resp->rh.versionGetFn}, {"rsmi_version_get", (void *)&resp->rh.rsmi_version_get},
{"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices}, {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
{"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get}, {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
{"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get}, {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
@ -30,6 +28,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
{"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get}, {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
{"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get}, {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
{"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get}, {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
{NULL, NULL},
}; };
resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY); resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
@ -43,12 +42,19 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
return; return;
} }
for (i = 0; i < ROCM_LOOKUP_SIZE; i++) { // TODO once we've squashed the remaining corner cases remove this log
LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s); *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
if (!l[i].p) { if (!l[i].p) {
UNLOAD_LIBRARY(resp->rh.handle);
resp->rh.handle = NULL; resp->rh.handle = NULL;
char *msg = LOAD_ERR(); char *msg = LOAD_ERR();
LOG(resp->rh.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->rh.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg); msg);
free(msg); free(msg);
@ -57,8 +63,9 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
} }
} }
ret = (*resp->rh.initFn)(0); ret = (*resp->rh.rsmi_init)(0);
if (ret != RSMI_STATUS_SUCCESS) { if (ret != RSMI_STATUS_SUCCESS) {
LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret);
UNLOAD_LIBRARY(resp->rh.handle); UNLOAD_LIBRARY(resp->rh.handle);
resp->rh.handle = NULL; resp->rh.handle = NULL;
snprintf(buf, buflen, "rocm vram init failure: %d", ret); snprintf(buf, buflen, "rocm vram init failure: %d", ret);
@ -141,13 +148,13 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
} }
// Get total memory - used memory for available memory // Get total memory - used memory for available memory
ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem); ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
if (ret != RSMI_STATUS_SUCCESS) { if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret); snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem); ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
if (ret != RSMI_STATUS_SUCCESS) { if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret); snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
@ -170,7 +177,7 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
} }
rsmi_version_t ver; rsmi_version_t ver;
rsmi_status_t ret; rsmi_status_t ret;
ret = h.versionGetFn(&ver); ret = h.rsmi_version_get(&ver);
if (ret != RSMI_STATUS_SUCCESS) { if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "unexpected response on version lookup %d", ret); snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
resp->status = 1; resp->status = 1;

View file

@ -25,11 +25,11 @@ typedef enum rsmi_memory_type {
typedef struct rocm_handle { typedef struct rocm_handle {
void *handle; void *handle;
uint16_t verbose; uint16_t verbose;
rsmi_status_t (*initFn)(uint64_t); rsmi_status_t (*rsmi_init)(uint64_t);
rsmi_status_t (*shutdownFn)(void); rsmi_status_t (*rsmi_shut_down)(void);
rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); rsmi_status_t (*rsmi_dev_memory_total_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); rsmi_status_t (*rsmi_dev_memory_usage_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*versionGetFn) (rsmi_version_t *version); rsmi_status_t (*rsmi_version_get) (rsmi_version_t *version);
rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *); rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *); rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t); rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);