From b111aa5a91769e5af0edf7259773b20514f9883f Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 7 Nov 2024 14:25:53 -0800 Subject: [PATCH] Debug logging for nvcuda init (#7532) Some users are reporting crashes during nvcuda.dll initialization on Windows. This should help narrow down where things are going wrong. --- discover/gpu_info_nvcuda.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/discover/gpu_info_nvcuda.c b/discover/gpu_info_nvcuda.c index a1a38bfc..466e1ac2 100644 --- a/discover/gpu_info_nvcuda.c +++ b/discover/gpu_info_nvcuda.c @@ -4,6 +4,7 @@ #include "gpu_info_nvcuda.h" void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { + LOG(resp->ch.verbose, "initializing %s\n", nvcuda_lib_path); CUresult ret; resp->err = NULL; resp->num_devices = 0; @@ -57,8 +58,10 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { resp->cudaErr = -1; return; } + LOG(resp->ch.verbose, "dlsym: %s - %p\n", l[i].s, *l[i].p); } + LOG(resp->ch.verbose, "calling cuInit\n"); ret = (*resp->ch.cuInit)(0); if (ret != CUDA_SUCCESS) { LOG(resp->ch.verbose, "cuInit err: %d\n", ret); @@ -75,15 +78,18 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { resp->ch.driver_minor = 0; // Report driver version if we're in verbose mode, ignore errors + LOG(resp->ch.verbose, "calling cuDriverGetVersion\n"); ret = (*resp->ch.cuDriverGetVersion)(&version); if (ret != CUDA_SUCCESS) { LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret); } else { + LOG(resp->ch.verbose, "raw version 0x%x\n", version); resp->ch.driver_major = version / 1000; resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10; LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor); } + LOG(resp->ch.verbose, "calling cuDeviceGetCount\n"); ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices); if (ret != CUDA_SUCCESS) { LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret); @@ -94,6 +100,7 @@ void 
nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { resp->cudaErr = ret; return; } + LOG(resp->ch.verbose, "device count %d\n", resp->num_devices); } const int buflen = 256;