Debug logging for nvcuda init (#7532)

Some users are reporting crashes during nvcuda.dll initialization
on Windows. This adds debug logging around each initialization step to help narrow down where things are going wrong.
This commit is contained in:
Daniel Hiltgen 2024-11-07 14:25:53 -08:00 committed by GitHub
parent 9e83e550e1
commit b111aa5a91
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -4,6 +4,7 @@
#include "gpu_info_nvcuda.h" #include "gpu_info_nvcuda.h"
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
LOG(resp->ch.verbose, "initializing %s\n", nvcuda_lib_path);
CUresult ret; CUresult ret;
resp->err = NULL; resp->err = NULL;
resp->num_devices = 0; resp->num_devices = 0;
@ -57,8 +58,10 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
resp->cudaErr = -1; resp->cudaErr = -1;
return; return;
} }
LOG(resp->ch.verbose, "dlsym: %s - %p\n", l[i].s, *l[i].p);
} }
LOG(resp->ch.verbose, "calling cuInit\n");
ret = (*resp->ch.cuInit)(0); ret = (*resp->ch.cuInit)(0);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
LOG(resp->ch.verbose, "cuInit err: %d\n", ret); LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
@ -75,15 +78,18 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
resp->ch.driver_minor = 0; resp->ch.driver_minor = 0;
// Report driver version if we're in verbose mode, ignore errors // Report driver version if we're in verbose mode, ignore errors
LOG(resp->ch.verbose, "calling cuDriverGetVersion\n");
ret = (*resp->ch.cuDriverGetVersion)(&version); ret = (*resp->ch.cuDriverGetVersion)(&version);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret); LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
} else { } else {
LOG(resp->ch.verbose, "raw version 0x%x\n", version);
resp->ch.driver_major = version / 1000; resp->ch.driver_major = version / 1000;
resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10; resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor); LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
} }
LOG(resp->ch.verbose, "calling cuDeviceGetCount\n");
ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices); ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret); LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
@ -94,6 +100,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
resp->cudaErr = ret; resp->cudaErr = ret;
return; return;
} }
LOG(resp->ch.verbose, "device count %d\n", resp->num_devices);
} }
const int buflen = 256; const int buflen = 256;