Merge pull request #4067 from dhiltgen/cudart

Add CUDA Driver API for GPU discovery
This commit is contained in:
Daniel Hiltgen 2024-05-06 13:30:27 -07:00 committed by GitHub
commit 06093fd396
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 342 additions and 5 deletions

View file

@ -27,6 +27,7 @@ import (
type handles struct { type handles struct {
deviceCount int deviceCount int
cudart *C.cudart_handle_t cudart *C.cudart_handle_t
nvcuda *C.nvcuda_handle_t
} }
const ( const (
@ -63,6 +64,22 @@ var CudartWindowsGlobs = []string{
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll", "c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
} }
var NvcudaLinuxGlobs = []string{
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
"/usr/lib/*-linux-gnu/libcuda.so*",
"/usr/lib/wsl/lib/libcuda.so*",
"/usr/lib/wsl/drivers/*/libcuda.so*",
"/opt/cuda/lib*/libcuda.so*",
"/usr/local/cuda/lib*/libcuda.so*",
"/usr/lib*/libcuda.so*",
"/usr/local/lib*/libcuda.so*",
}
var NvcudaWindowsGlobs = []string{
"c:\\windows\\system*\\nvcuda.dll",
}
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK") var CudaTegra string = os.Getenv("JETSON_JETPACK")
@ -75,6 +92,8 @@ func initGPUHandles() *handles {
gpuHandles := &handles{} gpuHandles := &handles{}
var cudartMgmtName string var cudartMgmtName string
var cudartMgmtPatterns []string var cudartMgmtPatterns []string
var nvcudaMgmtName string
var nvcudaMgmtPatterns []string
tmpDir, _ := PayloadsDir() tmpDir, _ := PayloadsDir()
switch runtime.GOOS { switch runtime.GOOS {
@ -83,6 +102,9 @@ func initGPUHandles() *handles {
localAppData := os.Getenv("LOCALAPPDATA") localAppData := os.Getenv("LOCALAPPDATA")
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)} cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...) cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
// Aligned with driver, we can't carry as payloads
nvcudaMgmtName = "nvcuda.dll"
nvcudaMgmtPatterns = NvcudaWindowsGlobs
case "linux": case "linux":
cudartMgmtName = "libcudart.so*" cudartMgmtName = "libcudart.so*"
if tmpDir != "" { if tmpDir != "" {
@ -90,11 +112,25 @@ func initGPUHandles() *handles {
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)} cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
} }
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...) cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
// Aligned with driver, we can't carry as payloads
nvcudaMgmtName = "libcuda.so*"
nvcudaMgmtPatterns = NvcudaLinuxGlobs
default: default:
return gpuHandles return gpuHandles
} }
slog.Info("Detecting GPUs") slog.Info("Detecting GPUs")
nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
if len(nvcudaLibPaths) > 0 {
deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
if nvcuda != nil {
slog.Info("detected GPUs", "count", deviceCount, "library", libPath)
gpuHandles.nvcuda = nvcuda
gpuHandles.deviceCount = deviceCount
return gpuHandles
}
}
cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns) cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
if len(cudartLibPaths) > 0 { if len(cudartLibPaths) > 0 {
deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths) deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
@ -119,6 +155,9 @@ func GetGPUInfo() GpuInfoList {
if gpuHandles.cudart != nil { if gpuHandles.cudart != nil {
C.cudart_release(*gpuHandles.cudart) C.cudart_release(*gpuHandles.cudart)
} }
if gpuHandles.nvcuda != nil {
C.nvcuda_release(*gpuHandles.nvcuda)
}
}() }()
// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX // All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
@ -139,7 +178,11 @@ func GetGPUInfo() GpuInfoList {
gpuInfo := GpuInfo{ gpuInfo := GpuInfo{
Library: "cuda", Library: "cuda",
} }
if gpuHandles.cudart != nil {
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo) C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
} else {
C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
}
if memInfo.err != nil { if memInfo.err != nil {
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err)) slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err)) C.free(unsafe.Pointer(memInfo.err))
@ -197,9 +240,10 @@ func GetCPUMem() (memInfo, error) {
return ret, nil return ret, nil
} }
func FindGPULibs(baseLibName string, patterns []string) []string { func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them // Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
var ldPaths []string var ldPaths []string
var patterns []string
gpuLibPaths := []string{} gpuLibPaths := []string{}
slog.Debug("Searching for GPU library", "name", baseLibName) slog.Debug("Searching for GPU library", "name", baseLibName)
@ -219,6 +263,7 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
} }
patterns = append(patterns, filepath.Join(d, baseLibName+"*")) patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
} }
patterns = append(patterns, defaultPatterns...)
slog.Debug("gpu library search", "globs", patterns) slog.Debug("gpu library search", "globs", patterns)
for _, pattern := range patterns { for _, pattern := range patterns {
// Ignore glob discovery errors // Ignore glob discovery errors
@ -268,6 +313,23 @@ func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
return 0, nil, "" return 0, nil, ""
} }
func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
var resp C.nvcuda_init_resp_t
resp.ch.verbose = getVerboseState()
for _, libPath := range nvcudaLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.nvcuda_init(lib, &resp)
if resp.err != nil {
slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))
} else {
return int(resp.num_devices), &resp.ch, libPath
}
}
return 0, nil, ""
}
func getVerboseState() C.uint16_t { func getVerboseState() C.uint16_t {
if envconfig.Debug { if envconfig.Debug {
return C.uint16_t(1) return C.uint16_t(1)

View file

@ -58,6 +58,7 @@ void cpu_check_ram(mem_info_t *resp);
#endif #endif
#include "gpu_info_cudart.h" #include "gpu_info_cudart.h"
#include "gpu_info_nvcuda.h"
#endif // __GPU_INFO_H__ #endif // __GPU_INFO_H__
#endif // __APPLE__ #endif // __APPLE__

View file

@ -6,9 +6,9 @@
// Just enough typedef's to dlopen/dlsym for memory information // Just enough typedef's to dlopen/dlsym for memory information
typedef enum cudartReturn_enum { typedef enum cudartReturn_enum {
CUDART_SUCCESS = 0, CUDART_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1, CUDART_ERROR_INVALID_VALUE = 1,
CUDA_ERROR_MEMORY_ALLOCATION = 2, CUDART_ERROR_MEMORY_ALLOCATION = 2,
CUDA_ERROR_INSUFFICIENT_DRIVER = 35, CUDART_ERROR_INSUFFICIENT_DRIVER = 35,
// Other values omitted for now... // Other values omitted for now...
} cudartReturn_t; } cudartReturn_t;

203
gpu/gpu_info_nvcuda.c Normal file
View file

@ -0,0 +1,203 @@
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include <string.h>
#include "gpu_info_nvcuda.h"
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
CUresult ret;
resp->err = NULL;
resp->num_devices = 0;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[] = {
{"cuInit", (void *)&resp->ch.cuInit},
{"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
{"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
{"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
{"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
{"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
{"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
{"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
{"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
{NULL, NULL},
};
resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
nvcuda_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!*l[i].p) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
ret = (*resp->ch.cuInit)(0);
if (ret != CUDA_SUCCESS) {
LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
return;
}
snprintf(buf, buflen, "nvcuda init failure: %d", ret);
resp->err = strdup(buf);
return;
}
int version = 0;
nvcudaDriverVersion_t driverVersion;
driverVersion.major = 0;
driverVersion.minor = 0;
// Report driver version if we're in verbose mode, ignore errors
ret = (*resp->ch.cuDriverGetVersion)(&version);
if (ret != CUDA_SUCCESS) {
LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
} else {
driverVersion.major = version / 1000;
driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
}
ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
if (ret != CUDA_SUCCESS) {
LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
return;
}
}
const int buflen = 256;
void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
resp->err = NULL;
nvcudaMemory_t memInfo = {0,0};
CUresult ret;
CUdevice device = -1;
CUcontext ctx = NULL;
char buf[buflen + 1];
CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
if (h.handle == NULL) {
resp->err = strdup("nvcuda handle isn't initialized");
return;
}
ret = (*h.cuDeviceGet)(&device, i);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda device failed to initialize");
resp->err = strdup(buf);
return;
}
resp->major = 0;
resp->minor = 0;
int major = 0;
int minor = 0;
ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
if (ret != CUDA_SUCCESS) {
LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
} else {
ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
if (ret != CUDA_SUCCESS) {
LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
} else {
resp->minor = minor;
resp->major = major;
}
}
ret = (*h.cuDeviceGetUuid)(&uuid, device);
if (ret != CUDA_SUCCESS) {
LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
} else {
// GPU-d110a105-ac29-1d54-7b49-9c90440f215b
snprintf(&resp->gpu_id[0], GPU_ID_LEN,
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
uuid.bytes[0],
uuid.bytes[1],
uuid.bytes[2],
uuid.bytes[3],
uuid.bytes[4],
uuid.bytes[5],
uuid.bytes[6],
uuid.bytes[7],
uuid.bytes[8],
uuid.bytes[9],
uuid.bytes[10],
uuid.bytes[11],
uuid.bytes[12],
uuid.bytes[13],
uuid.bytes[14],
uuid.bytes[15]
);
}
// To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret);
resp->err = strdup(buf);
// Best effort on failure...
(*h.cuCtxDestroy)(ctx);
return;
}
resp->total = memInfo.total;
resp->free = memInfo.free;
LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
ret = (*h.cuCtxDestroy)(ctx);
if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda failed to release primary device context %d", ret);
}
}
void nvcuda_release(nvcuda_handle_t h) {
LOG(h.verbose, "releasing nvcuda library\n");
UNLOAD_LIBRARY(h.handle);
// TODO and other context release logic?
h.handle = NULL;
}
#endif // __APPLE__

71
gpu/gpu_info_nvcuda.h Normal file
View file

@ -0,0 +1,71 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_NVCUDA_H__
#define __GPU_INFO_NVCUDA_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum cudaError_enum {
CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1,
CUDA_ERROR_MEMORY_ALLOCATION = 2,
CUDA_ERROR_NOT_INITIALIZED = 3,
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
// Other values omitted for now...
} CUresult;
typedef enum CUdevice_attribute_enum {
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
// TODO - not yet wired up but may be useful for Jetson or other
// integrated GPU scenarios with shared memory
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
} CUdevice_attribute;
typedef void *nvcudaDevice_t; // Opaque is sufficient
typedef struct nvcudaMemory_st {
uint64_t total;
uint64_t free;
} nvcudaMemory_t;
typedef struct nvcudaDriverVersion {
int major;
int minor;
} nvcudaDriverVersion_t;
typedef struct CUuuid_st {
unsigned char bytes[16];
} CUuuid;
typedef int CUdevice;
typedef void* CUcontext;
typedef struct nvcuda_handle {
void *handle;
uint16_t verbose;
CUresult (*cuInit)(unsigned int Flags);
CUresult (*cuDriverGetVersion)(int *driverVersion);
CUresult (*cuDeviceGetCount)(int *);
CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
// Context specific aspects
CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);
CUresult (*cuMemGetInfo_v2)(uint64_t* free, uint64_t* total);
CUresult (*cuCtxDestroy)(CUcontext ctx);
} nvcuda_handle_t;
typedef struct nvcuda_init_resp {
char *err; // If err is non-null handle is invalid
nvcuda_handle_t ch;
int num_devices;
} nvcuda_init_resp_t;
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
void nvcuda_release(nvcuda_handle_t ch);
#endif // __GPU_INFO_NVCUDA_H__
#endif // __APPLE__