Better nvidia GPU discovery logging

Refine the way we log GPU discovery to improve the non-debug
output, and report more actionable log messages when possible
to help users troubleshoot on their own.
This commit is contained in:
Daniel Hiltgen 2024-07-03 10:30:07 -07:00
parent e5352297d9
commit ef757da2c9
4 changed files with 51 additions and 23 deletions

View file

@ -70,14 +70,18 @@ curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh
If your system is configured with the "noexec" flag where Ollama stores its temporary executable files, you can specify an alternate location by setting OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example OLLAMA_TMPDIR=/usr/share/ollama/ If your system is configured with the "noexec" flag where Ollama stores its temporary executable files, you can specify an alternate location by setting OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example OLLAMA_TMPDIR=/usr/share/ollama/
## Container fails to run on NVIDIA GPU ## NVIDIA GPU Discovery
Make sure you've set up the container runtime first as described in [docker.md](./docker.md) When Ollama starts up, it takes inventory of the GPUs present in the system to determine compatibility and how much VRAM is available. Sometimes this discovery can fail to find your GPUs. In general, running the latest driver will yield the best results.
Sometimes the container runtime can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem ### Linux NVIDIA Troubleshooting
- Is the container runtime working? Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama wont be able to see your NVIDIA GPU. If you are using a container to run Ollama, make sure you've set up the container runtime first as described in [docker.md](./docker.md)
- Is the uvm driver not loaded? `sudo nvidia-modprobe -u`
Sometimes Ollama can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem:
- If you are using a container, is the container runtime working? Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama won't be able to see your NVIDIA GPU.
- Is the uvm driver loaded? If not, load it with `sudo nvidia-modprobe -u`
- Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm` - Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm`
- Try rebooting - Try rebooting
- Make sure you're running the latest nvidia drivers - Make sure you're running the latest nvidia drivers

View file

@ -202,7 +202,7 @@ func GetGPUInfo() GpuInfoList {
}() }()
if !bootstrapped { if !bootstrapped {
slog.Debug("Detecting GPUs") slog.Info("looking for compatible GPUs")
needRefresh = false needRefresh = false
cpuCapability = GetCPUCapability() cpuCapability = GetCPUCapability()
var memInfo C.mem_info_t var memInfo C.mem_info_t
@ -320,6 +320,9 @@ func GetGPUInfo() GpuInfoList {
rocmGPUs = AMDGetGPUInfo() rocmGPUs = AMDGetGPUInfo()
bootstrapped = true bootstrapped = true
if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 {
slog.Info("no compatible GPUs were discovered")
}
} }
// For detected GPUs, load library if not loaded // For detected GPUs, load library if not loaded
@ -514,7 +517,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
defer C.free(unsafe.Pointer(lib)) defer C.free(unsafe.Pointer(lib))
C.nvcuda_init(lib, &resp) C.nvcuda_init(lib, &resp)
if resp.err != nil { if resp.err != nil {
slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err)) // Decide what log level based on the type of error message to help users understand why
msg := C.GoString(resp.err)
switch resp.cudaErr {
case C.CUDA_ERROR_INSUFFICIENT_DRIVER, C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH:
slog.Warn("version mismatch between driver and cuda driver library - reboot or upgrade may be required", "library", libPath, "error", msg)
case C.CUDA_ERROR_NO_DEVICE:
slog.Info("no nvidia devices detected", "library", libPath)
case C.CUDA_ERROR_UNKNOWN:
slog.Warn("unknown error initializing cuda driver library", "library", libPath, "error", msg)
slog.Warn("see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information")
default:
if strings.Contains(msg, "wrong ELF class") {
slog.Debug("skipping 32bit library", "library", libPath)
} else {
slog.Info("unable to load cuda driver library", "library", libPath, "error", msg)
}
}
C.free(unsafe.Pointer(resp.err)) C.free(unsafe.Pointer(resp.err))
} else { } else {
return int(resp.num_devices), &resp.ch, libPath return int(resp.num_devices), &resp.ch, libPath

View file

@ -7,6 +7,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
CUresult ret; CUresult ret;
resp->err = NULL; resp->err = NULL;
resp->num_devices = 0; resp->num_devices = 0;
resp->cudaErr = CUDA_SUCCESS;
const int buflen = 256; const int buflen = 256;
char buf[buflen + 1]; char buf[buflen + 1];
int i; int i;
@ -38,6 +39,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
nvcuda_lib_path, msg); nvcuda_lib_path, msg);
free(msg); free(msg);
resp->err = strdup(buf); resp->err = strdup(buf);
resp->cudaErr = -1;
return; return;
} }
@ -52,6 +54,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
msg); msg);
free(msg); free(msg);
resp->err = strdup(buf); resp->err = strdup(buf);
resp->cudaErr = -1;
return; return;
} }
} }
@ -61,12 +64,9 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
LOG(resp->ch.verbose, "cuInit err: %d\n", ret); LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle); UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL; resp->ch.handle = NULL;
if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) { snprintf(buf, buflen, "cuda driver library init failure: %d", ret);
resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
return;
}
snprintf(buf, buflen, "nvcuda init failure: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
resp->cudaErr = ret;
return; return;
} }
@ -91,6 +91,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
resp->ch.handle = NULL; resp->ch.handle = NULL;
snprintf(buf, buflen, "unable to get device count: %d", ret); snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
resp->cudaErr = ret;
return; return;
} }
} }
@ -106,13 +107,13 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
if (h.handle == NULL) { if (h.handle == NULL) {
resp->err = strdup("nvcuda handle isn't initialized"); resp->err = strdup("cuda driver library handle isn't initialized");
return; return;
} }
ret = (*h.cuDeviceGet)(&device, i); ret = (*h.cuDeviceGet)(&device, i);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda device failed to initialize"); snprintf(buf, buflen, "cuda driver library device failed to initialize");
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
@ -168,14 +169,14 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
// To get memory we have to set (and release) a context // To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device); ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda failed to get device context %d", ret); snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total); ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret); snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret);
resp->err = strdup(buf); resp->err = strdup(buf);
// Best effort on failure... // Best effort on failure...
(*h.cuCtxDestroy)(ctx); (*h.cuCtxDestroy)(ctx);
@ -193,7 +194,7 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
ret = (*h.cuCtxDestroy)(ctx); ret = (*h.cuCtxDestroy)(ctx);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda failed to release device context %d", ret); LOG(1, "cuda driver library failed to release device context %d", ret);
} }
} }
@ -206,7 +207,7 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total)
ret = (*h.cuDeviceGet)(&device, i); ret = (*h.cuDeviceGet)(&device, i);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda device failed to initialize"); LOG(1, "cuda driver library device failed to initialize");
return; return;
} }
@ -214,13 +215,13 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total)
// To get memory we have to set (and release) a context // To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device); ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda failed to get device context %d", ret); LOG(1, "cuda driver library failed to get device context %d", ret);
return; return;
} }
ret = (*h.cuMemGetInfo_v2)(free, total); ret = (*h.cuMemGetInfo_v2)(free, total);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda device memory info lookup failure %d", ret); LOG(1, "cuda driver library device memory info lookup failure %d", ret);
// Best effort on failure... // Best effort on failure...
(*h.cuCtxDestroy)(ctx); (*h.cuCtxDestroy)(ctx);
return; return;
@ -228,12 +229,12 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total)
ret = (*h.cuCtxDestroy)(ctx); ret = (*h.cuCtxDestroy)(ctx);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda failed to release device context %d", ret); LOG(1, "cuda driver library failed to release device context %d", ret);
} }
} }
void nvcuda_release(nvcuda_handle_t h) { void nvcuda_release(nvcuda_handle_t h) {
LOG(h.verbose, "releasing nvcuda library\n"); LOG(h.verbose, "releasing cuda driver library\n");
UNLOAD_LIBRARY(h.handle); UNLOAD_LIBRARY(h.handle);
// TODO and other context release logic? // TODO and other context release logic?
h.handle = NULL; h.handle = NULL;

View file

@ -7,9 +7,12 @@
typedef enum cudaError_enum { typedef enum cudaError_enum {
CUDA_SUCCESS = 0, CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1, CUDA_ERROR_INVALID_VALUE = 1,
CUDA_ERROR_MEMORY_ALLOCATION = 2, CUDA_ERROR_OUT_OF_MEMORY = 2,
CUDA_ERROR_NOT_INITIALIZED = 3, CUDA_ERROR_NOT_INITIALIZED = 3,
CUDA_ERROR_INSUFFICIENT_DRIVER = 35, CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
CUDA_ERROR_NO_DEVICE = 100,
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803,
CUDA_ERROR_UNKNOWN = 999,
// Other values omitted for now... // Other values omitted for now...
} CUresult; } CUresult;
@ -64,6 +67,7 @@ typedef struct nvcuda_init_resp {
char *err; // If err is non-null handle is invalid char *err; // If err is non-null handle is invalid
nvcuda_handle_t ch; nvcuda_handle_t ch;
int num_devices; int num_devices;
CUresult cudaErr;
} nvcuda_init_resp_t; } nvcuda_init_resp_t;
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp); void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);