Add CUDA Driver API for GPU discovery
We're seeing some corner cases with cudart which might be resolved by switching to the driver API which comes bundled with the driver package
This commit is contained in:
parent
b9f74ff3d6
commit
089daaeabc
5 changed files with 342 additions and 5 deletions
66
gpu/gpu.go
66
gpu/gpu.go
|
@ -26,6 +26,7 @@ import (
|
|||
type handles struct {
|
||||
deviceCount int
|
||||
cudart *C.cudart_handle_t
|
||||
nvcuda *C.nvcuda_handle_t
|
||||
}
|
||||
|
||||
const (
|
||||
|
@ -62,6 +63,22 @@ var CudartWindowsGlobs = []string{
|
|||
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
|
||||
}
|
||||
|
||||
var NvcudaLinuxGlobs = []string{
|
||||
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
|
||||
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
|
||||
"/usr/lib/*-linux-gnu/libcuda.so*",
|
||||
"/usr/lib/wsl/lib/libcuda.so*",
|
||||
"/usr/lib/wsl/drivers/*/libcuda.so*",
|
||||
"/opt/cuda/lib*/libcuda.so*",
|
||||
"/usr/local/cuda/lib*/libcuda.so*",
|
||||
"/usr/lib*/libcuda.so*",
|
||||
"/usr/local/lib*/libcuda.so*",
|
||||
}
|
||||
|
||||
var NvcudaWindowsGlobs = []string{
|
||||
"c:\\windows\\system*\\nvcuda.dll",
|
||||
}
|
||||
|
||||
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
|
||||
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
|
||||
var CudaTegra string = os.Getenv("JETSON_JETPACK")
|
||||
|
@ -74,6 +91,8 @@ func initGPUHandles() *handles {
|
|||
gpuHandles := &handles{}
|
||||
var cudartMgmtName string
|
||||
var cudartMgmtPatterns []string
|
||||
var nvcudaMgmtName string
|
||||
var nvcudaMgmtPatterns []string
|
||||
|
||||
tmpDir, _ := PayloadsDir()
|
||||
switch runtime.GOOS {
|
||||
|
@ -82,6 +101,9 @@ func initGPUHandles() *handles {
|
|||
localAppData := os.Getenv("LOCALAPPDATA")
|
||||
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
|
||||
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
|
||||
// Aligned with driver, we can't carry as payloads
|
||||
nvcudaMgmtName = "nvcuda.dll"
|
||||
nvcudaMgmtPatterns = NvcudaWindowsGlobs
|
||||
case "linux":
|
||||
cudartMgmtName = "libcudart.so*"
|
||||
if tmpDir != "" {
|
||||
|
@ -89,11 +111,25 @@ func initGPUHandles() *handles {
|
|||
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
|
||||
}
|
||||
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
|
||||
// Aligned with driver, we can't carry as payloads
|
||||
nvcudaMgmtName = "libcuda.so*"
|
||||
nvcudaMgmtPatterns = NvcudaLinuxGlobs
|
||||
default:
|
||||
return gpuHandles
|
||||
}
|
||||
|
||||
slog.Info("Detecting GPUs")
|
||||
nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
|
||||
if len(nvcudaLibPaths) > 0 {
|
||||
deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
|
||||
if nvcuda != nil {
|
||||
slog.Info("detected GPUs", "count", deviceCount, "library", libPath)
|
||||
gpuHandles.nvcuda = nvcuda
|
||||
gpuHandles.deviceCount = deviceCount
|
||||
return gpuHandles
|
||||
}
|
||||
}
|
||||
|
||||
cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
|
||||
if len(cudartLibPaths) > 0 {
|
||||
deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
|
||||
|
@ -118,6 +154,9 @@ func GetGPUInfo() GpuInfoList {
|
|||
if gpuHandles.cudart != nil {
|
||||
C.cudart_release(*gpuHandles.cudart)
|
||||
}
|
||||
if gpuHandles.nvcuda != nil {
|
||||
C.nvcuda_release(*gpuHandles.nvcuda)
|
||||
}
|
||||
}()
|
||||
|
||||
// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
|
||||
|
@ -138,7 +177,11 @@ func GetGPUInfo() GpuInfoList {
|
|||
gpuInfo := GpuInfo{
|
||||
Library: "cuda",
|
||||
}
|
||||
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
|
||||
if gpuHandles.cudart != nil {
|
||||
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
|
||||
} else {
|
||||
C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
|
||||
}
|
||||
if memInfo.err != nil {
|
||||
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
|
||||
C.free(unsafe.Pointer(memInfo.err))
|
||||
|
@ -196,9 +239,10 @@ func GetCPUMem() (memInfo, error) {
|
|||
return ret, nil
|
||||
}
|
||||
|
||||
func FindGPULibs(baseLibName string, patterns []string) []string {
|
||||
func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
|
||||
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
|
||||
var ldPaths []string
|
||||
var patterns []string
|
||||
gpuLibPaths := []string{}
|
||||
slog.Debug("Searching for GPU library", "name", baseLibName)
|
||||
|
||||
|
@ -218,6 +262,7 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
|
|||
}
|
||||
patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
|
||||
}
|
||||
patterns = append(patterns, defaultPatterns...)
|
||||
slog.Debug("gpu library search", "globs", patterns)
|
||||
for _, pattern := range patterns {
|
||||
// Ignore glob discovery errors
|
||||
|
@ -267,6 +312,23 @@ func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
|
|||
return 0, nil, ""
|
||||
}
|
||||
|
||||
func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
|
||||
var resp C.nvcuda_init_resp_t
|
||||
resp.ch.verbose = getVerboseState()
|
||||
for _, libPath := range nvcudaLibPaths {
|
||||
lib := C.CString(libPath)
|
||||
defer C.free(unsafe.Pointer(lib))
|
||||
C.nvcuda_init(lib, &resp)
|
||||
if resp.err != nil {
|
||||
slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err))
|
||||
C.free(unsafe.Pointer(resp.err))
|
||||
} else {
|
||||
return int(resp.num_devices), &resp.ch, libPath
|
||||
}
|
||||
}
|
||||
return 0, nil, ""
|
||||
}
|
||||
|
||||
func getVerboseState() C.uint16_t {
|
||||
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
|
||||
return C.uint16_t(1)
|
||||
|
|
|
@ -58,6 +58,7 @@ void cpu_check_ram(mem_info_t *resp);
|
|||
#endif
|
||||
|
||||
#include "gpu_info_cudart.h"
|
||||
#include "gpu_info_nvcuda.h"
|
||||
|
||||
#endif // __GPU_INFO_H__
|
||||
#endif // __APPLE__
|
|
@ -6,9 +6,9 @@
|
|||
// Just enough typedef's to dlopen/dlsym for memory information
|
||||
typedef enum cudartReturn_enum {
|
||||
CUDART_SUCCESS = 0,
|
||||
CUDA_ERROR_INVALID_VALUE = 1,
|
||||
CUDA_ERROR_MEMORY_ALLOCATION = 2,
|
||||
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
|
||||
CUDART_ERROR_INVALID_VALUE = 1,
|
||||
CUDART_ERROR_MEMORY_ALLOCATION = 2,
|
||||
CUDART_ERROR_INSUFFICIENT_DRIVER = 35,
|
||||
// Other values omitted for now...
|
||||
} cudartReturn_t;
|
||||
|
||||
|
|
203
gpu/gpu_info_nvcuda.c
Normal file
203
gpu/gpu_info_nvcuda.c
Normal file
|
@ -0,0 +1,203 @@
|
|||
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
|
||||
|
||||
#include <string.h>
|
||||
#include "gpu_info_nvcuda.h"
|
||||
|
||||
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
|
||||
CUresult ret;
|
||||
resp->err = NULL;
|
||||
resp->num_devices = 0;
|
||||
const int buflen = 256;
|
||||
char buf[buflen + 1];
|
||||
int i;
|
||||
|
||||
struct lookup {
|
||||
char *s;
|
||||
void **p;
|
||||
} l[] = {
|
||||
|
||||
{"cuInit", (void *)&resp->ch.cuInit},
|
||||
{"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
|
||||
{"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
|
||||
{"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
|
||||
{"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
|
||||
{"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
|
||||
{"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
|
||||
{"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
|
||||
{"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
|
||||
{NULL, NULL},
|
||||
};
|
||||
|
||||
resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
|
||||
if (!resp->ch.handle) {
|
||||
char *msg = LOAD_ERR();
|
||||
LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
|
||||
snprintf(buf, buflen,
|
||||
"Unable to load %s library to query for Nvidia GPUs: %s",
|
||||
nvcuda_lib_path, msg);
|
||||
free(msg);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
for (i = 0; l[i].s != NULL; i++) {
|
||||
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
|
||||
if (!*l[i].p) {
|
||||
char *msg = LOAD_ERR();
|
||||
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
|
||||
UNLOAD_LIBRARY(resp->ch.handle);
|
||||
resp->ch.handle = NULL;
|
||||
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
|
||||
msg);
|
||||
free(msg);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
ret = (*resp->ch.cuInit)(0);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
|
||||
UNLOAD_LIBRARY(resp->ch.handle);
|
||||
resp->ch.handle = NULL;
|
||||
if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
|
||||
resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
|
||||
return;
|
||||
}
|
||||
snprintf(buf, buflen, "nvcuda init failure: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
int version = 0;
|
||||
nvcudaDriverVersion_t driverVersion;
|
||||
driverVersion.major = 0;
|
||||
driverVersion.minor = 0;
|
||||
|
||||
// Report driver version if we're in verbose mode, ignore errors
|
||||
ret = (*resp->ch.cuDriverGetVersion)(&version);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
|
||||
} else {
|
||||
driverVersion.major = version / 1000;
|
||||
driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
|
||||
LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
|
||||
}
|
||||
|
||||
ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
|
||||
UNLOAD_LIBRARY(resp->ch.handle);
|
||||
resp->ch.handle = NULL;
|
||||
snprintf(buf, buflen, "unable to get device count: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
const int buflen = 256;
|
||||
void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
|
||||
resp->err = NULL;
|
||||
nvcudaMemory_t memInfo = {0,0};
|
||||
CUresult ret;
|
||||
CUdevice device = -1;
|
||||
CUcontext ctx = NULL;
|
||||
char buf[buflen + 1];
|
||||
CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
|
||||
|
||||
if (h.handle == NULL) {
|
||||
resp->err = strdup("nvcuda handle isn't initialized");
|
||||
return;
|
||||
}
|
||||
|
||||
ret = (*h.cuDeviceGet)(&device, i);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
snprintf(buf, buflen, "nvcuda device failed to initialize");
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
resp->major = 0;
|
||||
resp->minor = 0;
|
||||
int major = 0;
|
||||
int minor = 0;
|
||||
ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
|
||||
} else {
|
||||
ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
|
||||
} else {
|
||||
resp->minor = minor;
|
||||
resp->major = major;
|
||||
}
|
||||
}
|
||||
|
||||
ret = (*h.cuDeviceGetUuid)(&uuid, device);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
|
||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
|
||||
} else {
|
||||
// GPU-d110a105-ac29-1d54-7b49-9c90440f215b
|
||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN,
|
||||
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
uuid.bytes[0],
|
||||
uuid.bytes[1],
|
||||
uuid.bytes[2],
|
||||
uuid.bytes[3],
|
||||
uuid.bytes[4],
|
||||
uuid.bytes[5],
|
||||
uuid.bytes[6],
|
||||
uuid.bytes[7],
|
||||
uuid.bytes[8],
|
||||
uuid.bytes[9],
|
||||
uuid.bytes[10],
|
||||
uuid.bytes[11],
|
||||
uuid.bytes[12],
|
||||
uuid.bytes[13],
|
||||
uuid.bytes[14],
|
||||
uuid.bytes[15]
|
||||
);
|
||||
}
|
||||
|
||||
// To get memory we have to set (and release) a context
|
||||
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
// Best effort on failure...
|
||||
(*h.cuCtxDestroy)(ctx);
|
||||
return;
|
||||
}
|
||||
|
||||
resp->total = memInfo.total;
|
||||
resp->free = memInfo.free;
|
||||
|
||||
LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
|
||||
LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
|
||||
LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
|
||||
|
||||
|
||||
|
||||
ret = (*h.cuCtxDestroy)(ctx);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(1, "nvcuda failed to release primary device context %d", ret);
|
||||
}
|
||||
}
|
||||
|
||||
void nvcuda_release(nvcuda_handle_t h) {
|
||||
LOG(h.verbose, "releasing nvcuda library\n");
|
||||
UNLOAD_LIBRARY(h.handle);
|
||||
// TODO and other context release logic?
|
||||
h.handle = NULL;
|
||||
}
|
||||
|
||||
#endif // __APPLE__
|
71
gpu/gpu_info_nvcuda.h
Normal file
71
gpu/gpu_info_nvcuda.h
Normal file
|
@ -0,0 +1,71 @@
|
|||
#ifndef __APPLE__
|
||||
#ifndef __GPU_INFO_NVCUDA_H__
|
||||
#define __GPU_INFO_NVCUDA_H__
|
||||
#include "gpu_info.h"
|
||||
|
||||
// Just enough typedef's to dlopen/dlsym for memory information
|
||||
typedef enum cudaError_enum {
|
||||
CUDA_SUCCESS = 0,
|
||||
CUDA_ERROR_INVALID_VALUE = 1,
|
||||
CUDA_ERROR_MEMORY_ALLOCATION = 2,
|
||||
CUDA_ERROR_NOT_INITIALIZED = 3,
|
||||
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
|
||||
// Other values omitted for now...
|
||||
} CUresult;
|
||||
|
||||
typedef enum CUdevice_attribute_enum {
|
||||
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
|
||||
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
|
||||
|
||||
// TODO - not yet wired up but may be useful for Jetson or other
|
||||
// integrated GPU scenarios with shared memory
|
||||
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
|
||||
|
||||
} CUdevice_attribute;
|
||||
|
||||
typedef void *nvcudaDevice_t; // Opaque is sufficient
|
||||
typedef struct nvcudaMemory_st {
|
||||
uint64_t total;
|
||||
uint64_t free;
|
||||
} nvcudaMemory_t;
|
||||
|
||||
typedef struct nvcudaDriverVersion {
|
||||
int major;
|
||||
int minor;
|
||||
} nvcudaDriverVersion_t;
|
||||
|
||||
typedef struct CUuuid_st {
|
||||
unsigned char bytes[16];
|
||||
} CUuuid;
|
||||
|
||||
typedef int CUdevice;
|
||||
typedef void* CUcontext;
|
||||
|
||||
typedef struct nvcuda_handle {
|
||||
void *handle;
|
||||
uint16_t verbose;
|
||||
CUresult (*cuInit)(unsigned int Flags);
|
||||
CUresult (*cuDriverGetVersion)(int *driverVersion);
|
||||
CUresult (*cuDeviceGetCount)(int *);
|
||||
CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
|
||||
CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
|
||||
CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
|
||||
|
||||
// Context specific aspects
|
||||
CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);
|
||||
CUresult (*cuMemGetInfo_v2)(uint64_t* free, uint64_t* total);
|
||||
CUresult (*cuCtxDestroy)(CUcontext ctx);
|
||||
} nvcuda_handle_t;
|
||||
|
||||
typedef struct nvcuda_init_resp {
|
||||
char *err; // If err is non-null handle is invalid
|
||||
nvcuda_handle_t ch;
|
||||
int num_devices;
|
||||
} nvcuda_init_resp_t;
|
||||
|
||||
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
|
||||
void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
|
||||
void nvcuda_release(nvcuda_handle_t ch);
|
||||
|
||||
#endif // __GPU_INFO_NVCUDA_H__
|
||||
#endif // __APPLE__
|
Loading…
Reference in a new issue