From dfc6721b203fdf2a91f022f61170d26306dbae63 Mon Sep 17 00:00:00 2001
From: Jeremy
Date: Mon, 25 Mar 2024 11:07:44 -0400
Subject: [PATCH] add support for libcudart.so for CUDA devices (adds Jetson support)

---
 gpu/gpu.go                               | 155 ++++++++++++++----
 gpu/gpu_info.h                           |   3 +-
 gpu/gpu_info_cudart.c                    | 190 +++++++++++++++++++++++
 gpu/gpu_info_cudart.h                    |  59 +++++++
 gpu/{gpu_info_cuda.c => gpu_info_nvml.c} |  24 +--
 gpu/{gpu_info_cuda.h => gpu_info_nvml.h} |  26 ++--
 llm/generate/gen_common.sh               |   2 +-
 llm/generate/gen_linux.sh                |  60 ++++---
 8 files changed, 437 insertions(+), 82 deletions(-)
 create mode 100644 gpu/gpu_info_cudart.c
 create mode 100644 gpu/gpu_info_cudart.h
 rename gpu/{gpu_info_cuda.c => gpu_info_nvml.c} (91%)
 rename gpu/{gpu_info_cuda.h => gpu_info_nvml.h} (77%)

diff --git a/gpu/gpu.go b/gpu/gpu.go
index e0c18e26..d09e94e4 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -23,7 +23,8 @@ import (
 )
 
 type handles struct {
-	cuda *C.cuda_handle_t
+	nvml   *C.nvml_handle_t
+	cudart *C.cudart_handle_t
 }
 
 var gpuMutex sync.Mutex
@@ -33,7 +34,7 @@ var gpuHandles *handles = nil
 var CudaComputeMin = [2]C.int{5, 0}
 
 // Possible locations for the nvidia-ml library
-var CudaLinuxGlobs = []string{
+var NvmlLinuxGlobs = []string{
 	"/usr/local/cuda/lib64/libnvidia-ml.so*",
 	"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
 	"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
@@ -41,49 +42,98 @@ var CudaLinuxGlobs = []string{
 	"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
 	"/opt/cuda/lib64/libnvidia-ml.so*",
 	"/usr/lib*/libnvidia-ml.so*",
-	"/usr/local/lib*/libnvidia-ml.so*",
 	"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
 	"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
+	"/usr/local/lib*/libnvidia-ml.so*",
 
 	// TODO: are these stubs ever valid?
 	"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
 }
 
-var CudaWindowsGlobs = []string{
+var NvmlWindowsGlobs = []string{
 	"c:\\Windows\\System32\\nvml.dll",
 }
 
+var CudartLinuxGlobs = []string{
+	"/usr/local/cuda/lib64/libcudart.so*",
+	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
+	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
+	"/usr/lib/wsl/lib/libcudart.so*",
+	"/usr/lib/wsl/drivers/*/libcudart.so*",
+	"/opt/cuda/lib64/libcudart.so*",
+	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
+	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
+	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
+	"/usr/local/cuda/lib*/libcudart.so*",
+	"/usr/lib*/libcudart.so*",
+	"/usr/local/lib*/libcudart.so*",
+}
+
+var CudartWindowsGlobs = []string{
+	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
+}
+
+// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
+// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
+var CudaTegra string = os.Getenv("JETSON_JETPACK")
+
 // Note: gpuMutex must already be held
 func initGPUHandles() {
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
-	gpuHandles = &handles{nil}
-	var cudaMgmtName string
-	var cudaMgmtPatterns []string
+	gpuHandles = &handles{nil, nil}
+	var nvmlMgmtName string
+	var nvmlMgmtPatterns []string
+	var cudartMgmtName string
+	var cudartMgmtPatterns []string
+
+	tmpDir, _ := PayloadsDir()
 	switch runtime.GOOS {
 	case "windows":
-		cudaMgmtName = "nvml.dll"
-		cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs))
-		copy(cudaMgmtPatterns, CudaWindowsGlobs)
+		nvmlMgmtName = "nvml.dll"
+		nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
+		copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
+		cudartMgmtName = "cudart64_*.dll"
+		localAppData := os.Getenv("LOCALAPPDATA")
+		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
+		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
 	case "linux":
-		cudaMgmtName = "libnvidia-ml.so"
-		cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs))
-		copy(cudaMgmtPatterns, CudaLinuxGlobs)
+		nvmlMgmtName = "libnvidia-ml.so"
+		nvmlMgmtPatterns = make([]string, len(NvmlLinuxGlobs))
+		copy(nvmlMgmtPatterns, NvmlLinuxGlobs)
+		cudartMgmtName = "libcudart.so*"
+		if tmpDir != "" {
+			// TODO - add "payloads" for subprocess
+			cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
+		}
+		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
 	default:
 		return
 	}
 
 	slog.Info("Detecting GPU type")
-	cudaLibPaths := FindGPULibs(cudaMgmtName, cudaMgmtPatterns)
-	if len(cudaLibPaths) > 0 {
-		cuda := LoadCUDAMgmt(cudaLibPaths)
-		if cuda != nil {
-			slog.Info("Nvidia GPU detected")
-			gpuHandles.cuda = cuda
+	cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
+	if len(cudartLibPaths) > 0 {
+		cudart := LoadCUDARTMgmt(cudartLibPaths)
+		if cudart != nil {
+			slog.Info("Nvidia GPU detected via cudart")
+			gpuHandles.cudart = cudart
 			return
 		}
 	}
+
+	// TODO once we build confidence, remove this and the gpu_info_nvml.[ch] files
+	nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
+	if len(nvmlLibPaths) > 0 {
+		nvml := LoadNVMLMgmt(nvmlLibPaths)
+		if nvml != nil {
+			slog.Info("Nvidia GPU detected via nvidia-ml")
+			gpuHandles.nvml = nvml
+			return
+		}
+	}
+
 }
 
 func GetGPUInfo() GpuInfo {
@@ -103,23 +153,42 @@ func GetGPUInfo() GpuInfo {
 	var memInfo C.mem_info_t
 	resp := GpuInfo{}
-	if gpuHandles.cuda != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
-		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
+	if gpuHandles.nvml != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
+		C.nvml_check_vram(*gpuHandles.nvml, &memInfo)
 		if memInfo.err != nil {
-			slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
+			slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU memory: %s", C.GoString(memInfo.err)))
 			C.free(unsafe.Pointer(memInfo.err))
 		} else if memInfo.count > 0 {
 			// Verify minimum compute capability
-			var cc C.cuda_compute_capability_t
-			C.cuda_compute_capability(*gpuHandles.cuda, &cc)
+			var cc C.nvml_compute_capability_t
+			C.nvml_compute_capability(*gpuHandles.nvml, &cc)
 			if cc.err != nil {
-				slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
+				slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU compute capability: %s", C.GoString(cc.err)))
 				C.free(unsafe.Pointer(cc.err))
 			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
-				slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
+				slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 				resp.Library = "cuda"
 			} else {
-				slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
+				slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
+			}
+		}
+	} else if gpuHandles.cudart != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
+		C.cudart_check_vram(*gpuHandles.cudart, &memInfo)
+		if memInfo.err != nil {
+			slog.Info(fmt.Sprintf("[cudart] error looking up CUDART GPU memory: %s", C.GoString(memInfo.err)))
+			C.free(unsafe.Pointer(memInfo.err))
+		} else if memInfo.count > 0 {
+			// Verify minimum compute capability
+			var cc C.cudart_compute_capability_t
+			C.cudart_compute_capability(*gpuHandles.cudart, &cc)
+			if cc.err != nil {
+				slog.Info(fmt.Sprintf("[cudart] error looking up CUDA compute capability: %s", C.GoString(cc.err)))
+				C.free(unsafe.Pointer(cc.err))
+			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
+				slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
+				resp.Library = "cuda"
+			} else {
+				slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 			}
 		}
 	} else {
@@ -176,6 +245,11 @@ func CheckVRAM() (int64, error) {
 		if overhead < gpus*1024*1024*1024 {
 			overhead = gpus * 1024 * 1024 * 1024
 		}
+		// Assigning full reported free memory for Tegras due to OS controlled caching.
+		if CudaTegra != "" {
+			// Tegra devices share memory with the OS, so use the full reported free memory
+			overhead = 0
+		}
 		avail := int64(gpuInfo.FreeMemory - overhead)
 		slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
 		return avail, nil
@@ -238,15 +312,32 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
 	return gpuLibPaths
 }
 
-func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
-	var resp C.cuda_init_resp_t
+func LoadNVMLMgmt(nvmlLibPaths []string) *C.nvml_handle_t {
+	var resp C.nvml_init_resp_t
 	resp.ch.verbose = getVerboseState()
-	for _, libPath := range cudaLibPaths {
+	for _, libPath := range nvmlLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
-		C.cuda_init(lib, &resp)
+		C.nvml_init(lib, &resp)
 		if resp.err != nil {
-			slog.Info(fmt.Sprintf("Unable to load CUDA management library %s: %s", libPath, C.GoString(resp.err)))
+			slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			return &resp.ch
+		}
+	}
+	return nil
+}
+
+func LoadCUDARTMgmt(cudartLibPaths []string) *C.cudart_handle_t {
+	var resp C.cudart_init_resp_t
+	resp.ch.verbose = getVerboseState()
+	for _, libPath := range cudartLibPaths {
+		lib := C.CString(libPath)
+		defer C.free(unsafe.Pointer(lib))
+		C.cudart_init(lib, &resp)
+		if resp.err != nil {
+			slog.Info(fmt.Sprintf("Unable to load cudart CUDA runtime library %s: %s", libPath, C.GoString(resp.err)))
 			C.free(unsafe.Pointer(resp.err))
 		} else {
 			return &resp.ch
diff --git a/gpu/gpu_info.h b/gpu/gpu_info.h
index 8186a3f0..4c449a60 100644
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -52,7 +52,8 @@ void cpu_check_ram(mem_info_t *resp);
 }
 #endif
 
-#include "gpu_info_cuda.h"
+#include "gpu_info_nvml.h"
+#include "gpu_info_cudart.h"
 
 #endif  // __GPU_INFO_H__
 #endif  // __APPLE__
\ No newline at end of file
diff --git a/gpu/gpu_info_cudart.c b/gpu/gpu_info_cudart.c
new file mode 100644
index 00000000..9f69f845
--- /dev/null
+++ b/gpu/gpu_info_cudart.c
@@ -0,0 +1,190 @@
+#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
+
+#include <string.h>
+#include "gpu_info_cudart.h"
+
+void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
+  cudartReturn_t ret;
+  resp->err = NULL;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+      {"cudaSetDevice", (void *)&resp->ch.cudaSetDevice},
+      {"cudaDeviceSynchronize", (void *)&resp->ch.cudaDeviceSynchronize},
+      {"cudaDeviceReset", (void *)&resp->ch.cudaDeviceReset},
+      {"cudaMemGetInfo", (void *)&resp->ch.cudaMemGetInfo},
+      {"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
+      {"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
+      {"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
+      {NULL, NULL},
+  };
+
+  resp->ch.handle = LOAD_LIBRARY(cudart_lib_path, RTLD_LAZY);
+  if (!resp->ch.handle) {
+    char *msg = LOAD_ERR();
+    LOG(resp->ch.verbose, "library %s load err: %s\n", cudart_lib_path, msg);
+    snprintf(buf, buflen,
+             "Unable to load %s library to query for Nvidia GPUs: %s",
+             cudart_lib_path, msg);
+    free(msg);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->ch.verbose, "wiring cudart library functions in %s\n", cudart_lib_path);
+
+  for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+
+    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
+    if (!*l[i].p) {
+      char *msg = LOAD_ERR();
+      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->ch.handle);
+      resp->ch.handle = NULL;
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+               msg);
+      free(msg);
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+
+  ret = (*resp->ch.cudaSetDevice)(0);
+  if (ret != CUDART_SUCCESS) {
+    LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    snprintf(buf, buflen, "cudart init failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  int version = 0;
+  cudartDriverVersion_t driverVersion;
+  driverVersion.major = 0;
+  driverVersion.minor = 0;
+
+  // Report driver version if we're in verbose mode, ignore errors
+  ret = (*resp->ch.cudaDriverGetVersion)(&version);
+  if (ret != CUDART_SUCCESS) {
+    LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
+  } else {
+    driverVersion.major = version / 1000;
+    driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
+    LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
+  }
+}
+
+
+void cudart_check_vram(cudart_handle_t h, mem_info_t *resp) {
+  resp->err = NULL;
+  cudartMemory_t memInfo = {0, 0, 0};
+  cudartReturn_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  if (h.handle == NULL) {
+    resp->err = strdup("cudart handle isn't initialized");
+    return;
+  }
+
+  // cudaGetDeviceCount takes int type, resp->count is uint
+  int deviceCount;
+  ret = (*h.cudaGetDeviceCount)(&deviceCount);
+  if (ret != CUDART_SUCCESS) {
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  } else {
+    resp->count = (unsigned int)deviceCount;
+  }
+
+  resp->total = 0;
+  resp->free = 0;
+  for (i = 0; i < resp->count; i++) {
+    ret = (*h.cudaSetDevice)(i);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "cudart device failed to initialize");
+      resp->err = strdup(buf);
+      return;
+    }
+    ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
+      resp->err = strdup(buf);
+      return;
+    }
+
+    LOG(h.verbose, "[%d] CUDA totalMem %lu\n", i, memInfo.total);
+    LOG(h.verbose, "[%d] CUDA freeMem %lu\n", i, memInfo.free);
+
+    resp->total += memInfo.total;
+    resp->free += memInfo.free;
+  }
+}
+
+void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *resp) {
+  resp->err = NULL;
+  resp->major = 0;
+  resp->minor = 0;
+  int major = 0;
+  int minor = 0;
+  cudartReturn_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  if (h.handle == NULL) {
+    resp->err = strdup("cudart handle isn't initialized");
+    return;
+  }
+
+  int devices;
+  ret = (*h.cudaGetDeviceCount)(&devices);
+  if (ret != CUDART_SUCCESS) {
+    snprintf(buf, buflen, "unable to get cudart device count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  for (i = 0; i < devices; i++) {
+    ret = (*h.cudaSetDevice)(i);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "cudart device failed to initialize");
+      resp->err = strdup(buf);
+      return;
+    }
+
+    ret = (*h.cudaDeviceGetAttribute)(&major, cudartDevAttrComputeCapabilityMajor, i);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    ret = (*h.cudaDeviceGetAttribute)(&minor, cudartDevAttrComputeCapabilityMinor, i);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+
+    // Report the lowest major.minor we detect as that limits our compatibility
+    if (resp->major == 0 || resp->major > major) {
+      resp->major = major;
+      resp->minor = minor;
+    } else if (resp->major == major && resp->minor > minor) {
+      resp->minor = minor;
+    }
+  }
+}
+
+#endif  // __APPLE__
\ No newline at end of file
diff --git a/gpu/gpu_info_cudart.h b/gpu/gpu_info_cudart.h
new file mode 100644
index 00000000..476e7555
--- /dev/null
+++ b/gpu/gpu_info_cudart.h
@@ -0,0 +1,59 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_CUDART_H__
+#define __GPU_INFO_CUDART_H__
+#include "gpu_info.h"
+
+// Just enough typedefs to dlopen/dlsym for memory information
+typedef enum cudartReturn_enum {
+  CUDART_SUCCESS = 0,
+  CUDART_UNSUPPORTED = 1,
+  // Other values omitted for now...
+} cudartReturn_t;
+
+typedef enum cudartDeviceAttr_enum {
+  cudartDevAttrComputeCapabilityMajor = 75,
+  cudartDevAttrComputeCapabilityMinor = 76,
+} cudartDeviceAttr_t;
+
+typedef void *cudartDevice_t;  // Opaque is sufficient
+typedef struct cudartMemory_st {
+  size_t total;
+  size_t free;
+  size_t used;
+} cudartMemory_t;
+
+typedef struct cudartDriverVersion {
+  int major;
+  int minor;
+} cudartDriverVersion_t;
+
+typedef struct cudart_handle {
+  void *handle;
+  uint16_t verbose;
+  cudartReturn_t (*cudaSetDevice)(int device);
+  cudartReturn_t (*cudaDeviceSynchronize)(void);
+  cudartReturn_t (*cudaDeviceReset)(void);
+  cudartReturn_t (*cudaMemGetInfo)(size_t *, size_t *);
+  cudartReturn_t (*cudaGetDeviceCount)(int *);
+  cudartReturn_t (*cudaDeviceGetAttribute)(int* value, cudartDeviceAttr_t attr, int device);
+  cudartReturn_t (*cudaDriverGetVersion) (int *driverVersion);
+} cudart_handle_t;
+
+typedef struct cudart_init_resp {
+  char *err;  // If err is non-null handle is invalid
+  cudart_handle_t ch;
+} cudart_init_resp_t;
+
+typedef struct cudart_compute_capability {
+  char *err;
+  int major;
+  int minor;
+} cudart_compute_capability_t;
+
+
+void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
+void cudart_check_vram(cudart_handle_t ch, mem_info_t *resp);
+void cudart_compute_capability(cudart_handle_t ch, cudart_compute_capability_t *cc);
+
+#endif  // __GPU_INFO_CUDART_H__
+#endif  // __APPLE__
diff --git a/gpu/gpu_info_cuda.c b/gpu/gpu_info_nvml.c
similarity index 91%
rename from gpu/gpu_info_cuda.c
rename to gpu/gpu_info_nvml.c
index 36743d7c..aacf0410 100644
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_nvml.c
@@ -1,10 +1,10 @@
 #ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
 
-#include "gpu_info_cuda.h"
-
 #include <string.h>
 
-void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
+#include "gpu_info_nvml.h"
+
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
   nvmlReturn_t ret;
   resp->err = NULL;
   const int buflen = 256;
@@ -30,20 +30,20 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
     {NULL, NULL},
   };
 
-  resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
+  resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
   if (!resp->ch.handle) {
     char *msg = LOAD_ERR();
-    LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
+    LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
     snprintf(buf, buflen,
              "Unable to load %s library to query for Nvidia GPUs: %s",
-             cuda_lib_path, msg);
+             nvml_lib_path, msg);
     free(msg);
     resp->err = strdup(buf);
     return;
   }
 
   // TODO once we've squashed the remaining corner cases remove this log
-  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
+  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
 
   for (i = 0; l[i].s != NULL; i++) {
     // TODO once we've squashed the remaining corner cases remove this log
@@ -82,7 +82,7 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
   }
 }
 
-void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
+void nvml_check_vram(nvml_handle_t h, mem_info_t *resp) {
   resp->err = NULL;
   nvmlDevice_t device;
   nvmlMemory_t memInfo = {0};
@@ -92,7 +92,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
   int i;
 
   if (h.handle == NULL) {
-    resp->err = strdup("nvml handle sn't initialized");
+    resp->err = strdup("nvml handle isn't initialized");
     return;
   }
 
@@ -155,15 +155,15 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
       }
     }
 
-    LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, memInfo.total);
-    LOG(h.verbose, "[%d] CUDA usedMem %llu\n", i, memInfo.used);
+    LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
+    LOG(h.verbose, "[%d] CUDA freeMem %ld\n", i, memInfo.free);
 
     resp->total += memInfo.total;
     resp->free += memInfo.free;
   }
 }
 
-void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
+void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) {
   resp->err = NULL;
   resp->major = 0;
   resp->minor = 0;
diff --git a/gpu/gpu_info_cuda.h b/gpu/gpu_info_nvml.h
similarity index 77%
rename from gpu/gpu_info_cuda.h
rename to gpu/gpu_info_nvml.h
index 5b1a27f5..819e41fd 100644
--- a/gpu/gpu_info_cuda.h
+++ b/gpu/gpu_info_nvml.h
@@ -1,6 +1,6 @@
 #ifndef __APPLE__
-#ifndef __GPU_INFO_CUDA_H__
-#define __GPU_INFO_CUDA_H__
+#ifndef __GPU_INFO_NVML_H__
+#define __GPU_INFO_NVML_H__
 #include "gpu_info.h"
 
 // Just enough typedef's to dlopen/dlsym for memory information
@@ -20,7 +20,7 @@ typedef enum nvmlBrandType_enum
   NVML_BRAND_UNKNOWN = 0,
 } nvmlBrandType_t;
 
-typedef struct cuda_handle {
+typedef struct nvml_handle {
   void *handle;
   uint16_t verbose;
   nvmlReturn_t (*nvmlInit_v2)(void);
@@ -35,22 +35,22 @@ typedef struct cuda_handle {
   nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int length);
   nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int length);
   nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
-} cuda_handle_t;
+} nvml_handle_t;
 
-typedef struct cuda_init_resp {
+typedef struct nvml_init_resp {
   char *err;  // If err is non-null handle is invalid
-  cuda_handle_t ch;
-} cuda_init_resp_t;
+  nvml_handle_t ch;
+} nvml_init_resp_t;
 
-typedef struct cuda_compute_capability {
+typedef struct nvml_compute_capability {
   char *err;
   int major;
   int minor;
-} cuda_compute_capability_t;
+} nvml_compute_capability_t;
 
-void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp);
-void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
-void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
+void nvml_check_vram(nvml_handle_t ch, mem_info_t *resp);
+void nvml_compute_capability(nvml_handle_t ch, nvml_compute_capability_t *cc);
 
-#endif  // __GPU_INFO_CUDA_H__
+#endif  // __GPU_INFO_NVML_H__
 #endif  // __APPLE__
\ No newline at end of file
diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh
index 7638dc02..1186a06b 100644
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@@ -39,7 +39,7 @@ init_vars() {
     *)
         ;;
     esac
-    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
+    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
         CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
     fi
 }
diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index 591fc803..67f1d6e6 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -90,30 +90,35 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
         compress_libs
     fi
 
-    if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
+    if [ "${ARCH}" == "x86_64" ]; then
         #
-        # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
-        # Approximately 400% faster than LCD on same CPU
+        # ARM chips in M1/M2/M3-based Macs and Nvidia Tegra devices do not currently support AVX extensions.
         #
-        init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
-        echo "Building AVX CPU"
-        build
-        compress_libs
-    fi
+        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
+            #
+            # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
+            # Approximately 400% faster than LCD on same CPU
+            #
+            init_vars
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
+            echo "Building AVX CPU"
+            build
+            compress_libs
+        fi
 
-    if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
-        #
-        # ~2013 CPU Dynamic library
-        # Approximately 10% faster than AVX on same CPU
-        #
-        init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
-        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
-        echo "Building AVX2 CPU"
-        build
-        compress_libs
+        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
+            #
+            # ~2013 CPU Dynamic library
+            # Approximately 10% faster than AVX on same CPU
+            #
+            init_vars
+            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
+            echo "Building AVX2 CPU"
+            build
+            compress_libs
+        fi
     fi
 fi
 else
@@ -142,12 +147,21 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
     if [ -n "${CUDA_MAJOR}" ]; then
         CUDA_VARIANT=_v${CUDA_MAJOR}
     fi
-    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+    if [ "${ARCH}" == "arm64" ]; then
+        echo "ARM CPU detected - disabling unsupported AVX instructions"
+
+        # ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
+        #
+        # CUDA compute < 6.0 lacks proper FP16 support on ARM.
+        # Disabling has minimal performance effect while maintaining compatibility.
+        ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
+    fi
+    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
     BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
     build
 
-    # Cary the CUDA libs as payloads to help reduce dependency burden on users
+    # Carry the CUDA libs as payloads to help reduce dependency burden on users
     #
     # TODO - in the future we may shift to packaging these separately and conditionally
     # downloading them in the install script.
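
The driver-version arithmetic in cudart_init decodes the integer reported by
cudaDriverGetVersion, which CUDA packs as 1000*major + 10*minor. A minimal
standalone sketch of the same decoding, in Go for brevity (the input value
11040 is illustrative only, not taken from this patch):

    package main

    import "fmt"

    // decodeDriverVersion mirrors cudart_init: cudaDriverGetVersion packs
    // the CUDA version as 1000*major + 10*minor (e.g. 11040 for CUDA 11.4).
    func decodeDriverVersion(v int) (major, minor int) {
        major = v / 1000
        minor = (v - major*1000) / 10
        return major, minor
    }

    func main() {
        major, minor := decodeDriverVersion(11040) // illustrative input
        fmt.Printf("CUDA driver version: %d.%d\n", major, minor)
    }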
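
cudart_compute_capability reports the lowest major.minor found across all
devices, since the oldest GPU bounds which compiled kernels can be loaded on
every device. A small Go sketch of that selection rule (the device values are
hypothetical, not from this patch):

    package main

    import "fmt"

    type computeCapability struct{ major, minor int }

    // lowestCC mirrors the selection loop in cudart_compute_capability:
    // keep the smallest major.minor seen so far.
    func lowestCC(devices []computeCapability) computeCapability {
        out := computeCapability{}
        for _, d := range devices {
            if out.major == 0 || out.major > d.major {
                out = d
            } else if out.major == d.major && out.minor > d.minor {
                out.minor = d.minor
            }
        }
        return out
    }

    func main() {
        // A machine with an 8.6 and a 5.2 device reports 5.2, which
        // GetGPUInfo then compares against CudaComputeMin.
        fmt.Println(lowestCC([]computeCapability{{8, 6}, {5, 2}}))
    }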
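
FindGPULibs itself is unchanged by this patch and not shown above; the glob
tables (CudartLinuxGlobs, NvmlLinuxGlobs, and so on) feed a pattern-expansion
step conceptually like the following simplified Go sketch (expandGlobs is a
hypothetical helper; the real function also deduplicates matches and checks
the base library name):

    package main

    import (
        "fmt"
        "path/filepath"
    )

    // expandGlobs expands shell-style patterns into candidate library paths.
    func expandGlobs(patterns []string) []string {
        var paths []string
        for _, pattern := range patterns {
            matches, err := filepath.Glob(pattern)
            if err != nil {
                continue // filepath.ErrBadPattern; skip malformed patterns
            }
            paths = append(paths, matches...)
        }
        return paths
    }

    func main() {
        fmt.Println(expandGlobs([]string{"/usr/local/cuda/lib64/libcudart.so*"}))
    }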