More logging for GPU management

Fix the ordering of dlerror/dlclose on symbol lookup failure (read the
error before unloading the library) and add more logging to help
root-cause some crashes users are hitting. This also renames the
function pointers to match the underlying library function names
instead of simplified names, for readability.
This commit is contained in:
Daniel Hiltgen 2024-01-24 10:32:00 -08:00
parent f63dc2db5c
commit 013fd07139
5 changed files with 61 additions and 44 deletions

View file

@ -40,11 +40,13 @@ var CudaLinuxGlobs = []string{
"/usr/lib/wsl/lib/libnvidia-ml.so*", "/usr/lib/wsl/lib/libnvidia-ml.so*",
"/usr/lib/wsl/drivers/*/libnvidia-ml.so*", "/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
"/opt/cuda/lib64/libnvidia-ml.so*", "/opt/cuda/lib64/libnvidia-ml.so*",
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
"/usr/lib*/libnvidia-ml.so*", "/usr/lib*/libnvidia-ml.so*",
"/usr/local/lib*/libnvidia-ml.so*", "/usr/local/lib*/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*", "/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*", "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
// TODO: are these stubs ever valid?
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
} }
var CudaWindowsGlobs = []string{ var CudaWindowsGlobs = []string{

View file

@ -4,8 +4,6 @@
#include <string.h> #include <string.h>
#define CUDA_LOOKUP_SIZE 12
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) { void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
nvmlReturn_t ret; nvmlReturn_t ret;
resp->err = NULL; resp->err = NULL;
@ -16,24 +14,26 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
struct lookup { struct lookup {
char *s; char *s;
void **p; void **p;
} l[CUDA_LOOKUP_SIZE] = { } l[] = {
{"nvmlInit_v2", (void *)&resp->ch.initFn}, {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
{"nvmlShutdown", (void *)&resp->ch.shutdownFn}, {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle}, {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo}, {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount}, {"nvmlDeviceGetCount_v2", (void *)&resp->ch.nvmlDeviceGetCount_v2},
{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability}, {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.nvmlDeviceGetCudaComputeCapability},
{"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion}, {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
{"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName}, {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
{"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial}, {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
{"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion}, {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
{"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber}, {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
{"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand}, {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
{NULL, NULL},
}; };
resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY); resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
if (!resp->ch.handle) { if (!resp->ch.handle) {
char *msg = LOAD_ERR(); char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
snprintf(buf, buflen, snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s", "Unable to load %s library to query for Nvidia GPUs: %s",
cuda_lib_path, msg); cuda_lib_path, msg);
@ -42,12 +42,19 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
return; return;
} }
for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list // TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) { if (!l[i].p) {
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL; resp->ch.handle = NULL;
char *msg = LOAD_ERR(); char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg); msg);
free(msg); free(msg);
@ -56,8 +63,9 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
} }
} }
ret = (*resp->ch.initFn)(); ret = (*resp->ch.nvmlInit_v2)();
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle); UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL; resp->ch.handle = NULL;
snprintf(buf, buflen, "nvml vram init failure: %d", ret); snprintf(buf, buflen, "nvml vram init failure: %d", ret);
@ -87,7 +95,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
return; return;
} }
ret = (*h.getCount)(&resp->count); ret = (*h.nvmlDeviceGetCount_v2)(&resp->count);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret); snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
@ -97,14 +105,14 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
resp->total = 0; resp->total = 0;
resp->free = 0; resp->free = 0;
for (i = 0; i < resp->count; i++) { for (i = 0; i < resp->count; i++) {
ret = (*h.getHandle)(i, &device); ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret); snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
ret = (*h.getMemInfo)(device, &memInfo); ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret); snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
resp->err = strdup(buf); resp->err = strdup(buf);
@ -172,7 +180,7 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
} }
unsigned int devices; unsigned int devices;
ret = (*h.getCount)(&devices); ret = (*h.nvmlDeviceGetCount_v2)(&devices);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret); snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
@ -180,14 +188,14 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
} }
for (i = 0; i < devices; i++) { for (i = 0; i < devices; i++) {
ret = (*h.getHandle)(i, &device); ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret); snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
ret = (*h.getComputeCapability)(device, &major, &minor); ret = (*h.nvmlDeviceGetCudaComputeCapability)(device, &major, &minor);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret); snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
resp->err = strdup(buf); resp->err = strdup(buf);

View file

@ -23,12 +23,12 @@ typedef enum nvmlBrandType_enum
typedef struct cuda_handle { typedef struct cuda_handle {
void *handle; void *handle;
uint16_t verbose; uint16_t verbose;
nvmlReturn_t (*initFn)(void); nvmlReturn_t (*nvmlInit_v2)(void);
nvmlReturn_t (*shutdownFn)(void); nvmlReturn_t (*nvmlShutdown)(void);
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *); nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *); nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
nvmlReturn_t (*getCount)(unsigned int *); nvmlReturn_t (*nvmlDeviceGetCount_v2)(unsigned int *);
nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor); nvmlReturn_t (*nvmlDeviceGetCudaComputeCapability)(nvmlDevice_t, int* major, int* minor);
nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int length); nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int length); nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int length);
nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int length); nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int length);

View file

@ -4,8 +4,6 @@
#include <string.h> #include <string.h>
#define ROCM_LOOKUP_SIZE 14
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
rsmi_status_t ret; rsmi_status_t ret;
resp->err = NULL; resp->err = NULL;
@ -15,12 +13,12 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
struct lookup { struct lookup {
char *s; char *s;
void **p; void **p;
} l[ROCM_LOOKUP_SIZE] = { } l[] = {
{"rsmi_init", (void *)&resp->rh.initFn}, {"rsmi_init", (void *)&resp->rh.rsmi_init},
{"rsmi_shut_down", (void *)&resp->rh.shutdownFn}, {"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down},
{"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn}, {"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get},
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn}, {"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get},
{"rsmi_version_get", (void *)&resp->rh.versionGetFn}, {"rsmi_version_get", (void *)&resp->rh.rsmi_version_get},
{"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices}, {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
{"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get}, {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
{"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get}, {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
@ -30,6 +28,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
{"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get}, {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
{"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get}, {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
{"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get}, {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
{NULL, NULL},
}; };
resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY); resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
@ -43,12 +42,19 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
return; return;
} }
for (i = 0; i < ROCM_LOOKUP_SIZE; i++) { // TODO once we've squashed the remaining corner cases remove this log
LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s); *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
if (!l[i].p) { if (!l[i].p) {
UNLOAD_LIBRARY(resp->rh.handle);
resp->rh.handle = NULL; resp->rh.handle = NULL;
char *msg = LOAD_ERR(); char *msg = LOAD_ERR();
LOG(resp->rh.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->rh.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg); msg);
free(msg); free(msg);
@ -57,8 +63,9 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
} }
} }
ret = (*resp->rh.initFn)(0); ret = (*resp->rh.rsmi_init)(0);
if (ret != RSMI_STATUS_SUCCESS) { if (ret != RSMI_STATUS_SUCCESS) {
LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret);
UNLOAD_LIBRARY(resp->rh.handle); UNLOAD_LIBRARY(resp->rh.handle);
resp->rh.handle = NULL; resp->rh.handle = NULL;
snprintf(buf, buflen, "rocm vram init failure: %d", ret); snprintf(buf, buflen, "rocm vram init failure: %d", ret);
@ -141,13 +148,13 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
} }
// Get total memory - used memory for available memory // Get total memory - used memory for available memory
ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem); ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
if (ret != RSMI_STATUS_SUCCESS) { if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret); snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem); ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
if (ret != RSMI_STATUS_SUCCESS) { if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret); snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
@ -170,7 +177,7 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
} }
rsmi_version_t ver; rsmi_version_t ver;
rsmi_status_t ret; rsmi_status_t ret;
ret = h.versionGetFn(&ver); ret = h.rsmi_version_get(&ver);
if (ret != RSMI_STATUS_SUCCESS) { if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "unexpected response on version lookup %d", ret); snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
resp->status = 1; resp->status = 1;

View file

@ -25,11 +25,11 @@ typedef enum rsmi_memory_type {
typedef struct rocm_handle { typedef struct rocm_handle {
void *handle; void *handle;
uint16_t verbose; uint16_t verbose;
rsmi_status_t (*initFn)(uint64_t); rsmi_status_t (*rsmi_init)(uint64_t);
rsmi_status_t (*shutdownFn)(void); rsmi_status_t (*rsmi_shut_down)(void);
rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); rsmi_status_t (*rsmi_dev_memory_total_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); rsmi_status_t (*rsmi_dev_memory_usage_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*versionGetFn) (rsmi_version_t *version); rsmi_status_t (*rsmi_version_get) (rsmi_version_t *version);
rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *); rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *); rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t); rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);