add support for libcudart.so for CUDA devices (adds Jetson support)
This commit is contained in:
parent
acfa2b9422
commit
dfc6721b20
8 changed files with 437 additions and 82 deletions
155
gpu/gpu.go
155
gpu/gpu.go
|
@ -23,7 +23,8 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
type handles struct {
|
type handles struct {
|
||||||
cuda *C.cuda_handle_t
|
nvml *C.nvml_handle_t
|
||||||
|
cudart *C.cudart_handle_t
|
||||||
}
|
}
|
||||||
|
|
||||||
var gpuMutex sync.Mutex
|
var gpuMutex sync.Mutex
|
||||||
|
@ -33,7 +34,7 @@ var gpuHandles *handles = nil
|
||||||
var CudaComputeMin = [2]C.int{5, 0}
|
var CudaComputeMin = [2]C.int{5, 0}
|
||||||
|
|
||||||
// Possible locations for the nvidia-ml library
|
// Possible locations for the nvidia-ml library
|
||||||
var CudaLinuxGlobs = []string{
|
var NvmlLinuxGlobs = []string{
|
||||||
"/usr/local/cuda/lib64/libnvidia-ml.so*",
|
"/usr/local/cuda/lib64/libnvidia-ml.so*",
|
||||||
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
|
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
|
||||||
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
|
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
|
||||||
|
@ -41,49 +42,98 @@ var CudaLinuxGlobs = []string{
|
||||||
"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
|
"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
|
||||||
"/opt/cuda/lib64/libnvidia-ml.so*",
|
"/opt/cuda/lib64/libnvidia-ml.so*",
|
||||||
"/usr/lib*/libnvidia-ml.so*",
|
"/usr/lib*/libnvidia-ml.so*",
|
||||||
"/usr/local/lib*/libnvidia-ml.so*",
|
|
||||||
"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
|
"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
|
||||||
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
|
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
|
||||||
|
"/usr/local/lib*/libnvidia-ml.so*",
|
||||||
|
|
||||||
// TODO: are these stubs ever valid?
|
// TODO: are these stubs ever valid?
|
||||||
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
|
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
|
||||||
}
|
}
|
||||||
|
|
||||||
var CudaWindowsGlobs = []string{
|
var NvmlWindowsGlobs = []string{
|
||||||
"c:\\Windows\\System32\\nvml.dll",
|
"c:\\Windows\\System32\\nvml.dll",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var CudartLinuxGlobs = []string{
|
||||||
|
"/usr/local/cuda/lib64/libcudart.so*",
|
||||||
|
"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
|
||||||
|
"/usr/lib/x86_64-linux-gnu/libcudart.so*",
|
||||||
|
"/usr/lib/wsl/lib/libcudart.so*",
|
||||||
|
"/usr/lib/wsl/drivers/*/libcudart.so*",
|
||||||
|
"/opt/cuda/lib64/libcudart.so*",
|
||||||
|
"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
|
||||||
|
"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
|
||||||
|
"/usr/lib/aarch64-linux-gnu/libcudart.so*",
|
||||||
|
"/usr/local/cuda/lib*/libcudart.so*",
|
||||||
|
"/usr/lib*/libcudart.so*",
|
||||||
|
"/usr/local/lib*/libcudart.so*",
|
||||||
|
}
|
||||||
|
|
||||||
|
var CudartWindowsGlobs = []string{
|
||||||
|
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
|
||||||
|
}
|
||||||
|
|
||||||
|
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
|
||||||
|
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
|
||||||
|
var CudaTegra string = os.Getenv("JETSON_JETPACK")
|
||||||
|
|
||||||
// Note: gpuMutex must already be held
|
// Note: gpuMutex must already be held
|
||||||
func initGPUHandles() {
|
func initGPUHandles() {
|
||||||
|
|
||||||
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
|
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
|
||||||
|
|
||||||
gpuHandles = &handles{nil}
|
gpuHandles = &handles{nil, nil}
|
||||||
var cudaMgmtName string
|
var nvmlMgmtName string
|
||||||
var cudaMgmtPatterns []string
|
var nvmlMgmtPatterns []string
|
||||||
|
var cudartMgmtName string
|
||||||
|
var cudartMgmtPatterns []string
|
||||||
|
|
||||||
|
tmpDir, _ := PayloadsDir()
|
||||||
switch runtime.GOOS {
|
switch runtime.GOOS {
|
||||||
case "windows":
|
case "windows":
|
||||||
cudaMgmtName = "nvml.dll"
|
nvmlMgmtName = "nvml.dll"
|
||||||
cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs))
|
nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
|
||||||
copy(cudaMgmtPatterns, CudaWindowsGlobs)
|
copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
|
||||||
|
cudartMgmtName = "cudart64_*.dll"
|
||||||
|
localAppData := os.Getenv("LOCALAPPDATA")
|
||||||
|
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
|
||||||
|
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
|
||||||
case "linux":
|
case "linux":
|
||||||
cudaMgmtName = "libnvidia-ml.so"
|
nvmlMgmtName = "libnvidia-ml.so"
|
||||||
cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs))
|
nvmlMgmtPatterns = make([]string, len(NvmlLinuxGlobs))
|
||||||
copy(cudaMgmtPatterns, CudaLinuxGlobs)
|
copy(nvmlMgmtPatterns, NvmlLinuxGlobs)
|
||||||
|
cudartMgmtName = "libcudart.so*"
|
||||||
|
if tmpDir != "" {
|
||||||
|
// TODO - add "payloads" for subprocess
|
||||||
|
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
|
||||||
|
}
|
||||||
|
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
|
||||||
default:
|
default:
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Info("Detecting GPU type")
|
slog.Info("Detecting GPU type")
|
||||||
cudaLibPaths := FindGPULibs(cudaMgmtName, cudaMgmtPatterns)
|
cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
|
||||||
if len(cudaLibPaths) > 0 {
|
if len(cudartLibPaths) > 0 {
|
||||||
cuda := LoadCUDAMgmt(cudaLibPaths)
|
cudart := LoadCUDARTMgmt(cudartLibPaths)
|
||||||
if cuda != nil {
|
if cudart != nil {
|
||||||
slog.Info("Nvidia GPU detected")
|
slog.Info("Nvidia GPU detected via cudart")
|
||||||
gpuHandles.cuda = cuda
|
gpuHandles.cudart = cudart
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO once we build confidence, remove this and the gpu_info_nvml.[ch] files
|
||||||
|
nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
|
||||||
|
if len(nvmlLibPaths) > 0 {
|
||||||
|
nvml := LoadNVMLMgmt(nvmlLibPaths)
|
||||||
|
if nvml != nil {
|
||||||
|
slog.Info("Nvidia GPU detected via nvidia-ml")
|
||||||
|
gpuHandles.nvml = nvml
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetGPUInfo() GpuInfo {
|
func GetGPUInfo() GpuInfo {
|
||||||
|
@ -103,23 +153,42 @@ func GetGPUInfo() GpuInfo {
|
||||||
|
|
||||||
var memInfo C.mem_info_t
|
var memInfo C.mem_info_t
|
||||||
resp := GpuInfo{}
|
resp := GpuInfo{}
|
||||||
if gpuHandles.cuda != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
|
if gpuHandles.nvml != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
|
||||||
C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
|
C.nvml_check_vram(*gpuHandles.nvml, &memInfo)
|
||||||
if memInfo.err != nil {
|
if memInfo.err != nil {
|
||||||
slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
|
slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU memory: %s", C.GoString(memInfo.err)))
|
||||||
C.free(unsafe.Pointer(memInfo.err))
|
C.free(unsafe.Pointer(memInfo.err))
|
||||||
} else if memInfo.count > 0 {
|
} else if memInfo.count > 0 {
|
||||||
// Verify minimum compute capability
|
// Verify minimum compute capability
|
||||||
var cc C.cuda_compute_capability_t
|
var cc C.nvml_compute_capability_t
|
||||||
C.cuda_compute_capability(*gpuHandles.cuda, &cc)
|
C.nvml_compute_capability(*gpuHandles.nvml, &cc)
|
||||||
if cc.err != nil {
|
if cc.err != nil {
|
||||||
slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
|
slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU compute capability: %s", C.GoString(cc.err)))
|
||||||
C.free(unsafe.Pointer(cc.err))
|
C.free(unsafe.Pointer(cc.err))
|
||||||
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
|
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
|
||||||
slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
|
slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
|
||||||
resp.Library = "cuda"
|
resp.Library = "cuda"
|
||||||
} else {
|
} else {
|
||||||
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
|
slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if gpuHandles.cudart != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
|
||||||
|
C.cudart_check_vram(*gpuHandles.cudart, &memInfo)
|
||||||
|
if memInfo.err != nil {
|
||||||
|
slog.Info(fmt.Sprintf("[cudart] error looking up CUDART GPU memory: %s", C.GoString(memInfo.err)))
|
||||||
|
C.free(unsafe.Pointer(memInfo.err))
|
||||||
|
} else if memInfo.count > 0 {
|
||||||
|
// Verify minimum compute capability
|
||||||
|
var cc C.cudart_compute_capability_t
|
||||||
|
C.cudart_compute_capability(*gpuHandles.cudart, &cc)
|
||||||
|
if cc.err != nil {
|
||||||
|
slog.Info(fmt.Sprintf("[cudart] error looking up CUDA compute capability: %s", C.GoString(cc.err)))
|
||||||
|
C.free(unsafe.Pointer(cc.err))
|
||||||
|
} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
|
||||||
|
slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
|
||||||
|
resp.Library = "cuda"
|
||||||
|
} else {
|
||||||
|
slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -176,6 +245,11 @@ func CheckVRAM() (int64, error) {
|
||||||
if overhead < gpus*1024*1024*1024 {
|
if overhead < gpus*1024*1024*1024 {
|
||||||
overhead = gpus * 1024 * 1024 * 1024
|
overhead = gpus * 1024 * 1024 * 1024
|
||||||
}
|
}
|
||||||
|
// Assigning full reported free memory for Tegras due to OS controlled caching.
|
||||||
|
if CudaTegra != "" {
|
||||||
|
// Tegra devices reserve no extra overhead; the full reported free memory is used
|
||||||
|
overhead = 0
|
||||||
|
}
|
||||||
avail := int64(gpuInfo.FreeMemory - overhead)
|
avail := int64(gpuInfo.FreeMemory - overhead)
|
||||||
slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
|
slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
|
||||||
return avail, nil
|
return avail, nil
|
||||||
|
@ -238,15 +312,32 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
|
||||||
return gpuLibPaths
|
return gpuLibPaths
|
||||||
}
|
}
|
||||||
|
|
||||||
func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
|
func LoadNVMLMgmt(nvmlLibPaths []string) *C.nvml_handle_t {
|
||||||
var resp C.cuda_init_resp_t
|
var resp C.nvml_init_resp_t
|
||||||
resp.ch.verbose = getVerboseState()
|
resp.ch.verbose = getVerboseState()
|
||||||
for _, libPath := range cudaLibPaths {
|
for _, libPath := range nvmlLibPaths {
|
||||||
lib := C.CString(libPath)
|
lib := C.CString(libPath)
|
||||||
defer C.free(unsafe.Pointer(lib))
|
defer C.free(unsafe.Pointer(lib))
|
||||||
C.cuda_init(lib, &resp)
|
C.nvml_init(lib, &resp)
|
||||||
if resp.err != nil {
|
if resp.err != nil {
|
||||||
slog.Info(fmt.Sprintf("Unable to load CUDA management library %s: %s", libPath, C.GoString(resp.err)))
|
slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
|
||||||
|
C.free(unsafe.Pointer(resp.err))
|
||||||
|
} else {
|
||||||
|
return &resp.ch
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadCUDARTMgmt(cudartLibPaths []string) *C.cudart_handle_t {
|
||||||
|
var resp C.cudart_init_resp_t
|
||||||
|
resp.ch.verbose = getVerboseState()
|
||||||
|
for _, libPath := range cudartLibPaths {
|
||||||
|
lib := C.CString(libPath)
|
||||||
|
defer C.free(unsafe.Pointer(lib))
|
||||||
|
C.cudart_init(lib, &resp)
|
||||||
|
if resp.err != nil {
|
||||||
|
slog.Info(fmt.Sprintf("Unable to load cudart CUDA management library %s: %s", libPath, C.GoString(resp.err)))
|
||||||
C.free(unsafe.Pointer(resp.err))
|
C.free(unsafe.Pointer(resp.err))
|
||||||
} else {
|
} else {
|
||||||
return &resp.ch
|
return &resp.ch
|
||||||
|
|
|
@ -52,7 +52,8 @@ void cpu_check_ram(mem_info_t *resp);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "gpu_info_cuda.h"
|
#include "gpu_info_nvml.h"
|
||||||
|
#include "gpu_info_cudart.h"
|
||||||
|
|
||||||
#endif // __GPU_INFO_H__
|
#endif // __GPU_INFO_H__
|
||||||
#endif // __APPLE__
|
#endif // __APPLE__
|
190
gpu/gpu_info_cudart.c
Normal file
190
gpu/gpu_info_cudart.c
Normal file
|
@ -0,0 +1,190 @@
|
||||||
|
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
|
||||||
|
|
||||||
|
#include <string.h>
|
||||||
|
#include "gpu_info_cudart.h"
|
||||||
|
|
||||||
|
// Loads the CUDA runtime library at cudart_lib_path and resolves the entry
// points used for GPU discovery into resp->ch.
//
// On success resp->err is NULL and resp->ch.handle refers to the loaded
// library. On failure resp->err is set to a heap-allocated message (the
// caller is expected to free it) and resp->ch.handle is NULL.
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
  cudartReturn_t ret;
  resp->err = NULL;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  // Symbol name -> address of the function-pointer slot that receives it.
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"cudaSetDevice", (void *)&resp->ch.cudaSetDevice},
      {"cudaDeviceSynchronize", (void *)&resp->ch.cudaDeviceSynchronize},
      {"cudaDeviceReset", (void *)&resp->ch.cudaDeviceReset},
      {"cudaMemGetInfo", (void *)&resp->ch.cudaMemGetInfo},
      {"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
      {"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
      {"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
      {NULL, NULL},
  };

  resp->ch.handle = LOAD_LIBRARY(cudart_lib_path, RTLD_LAZY);
  if (!resp->ch.handle) {
    char *msg = LOAD_ERR();
    LOG(resp->ch.verbose, "library %s load err: %s\n", cudart_lib_path, msg);
    snprintf(buf, buflen,
             "Unable to load %s library to query for Nvidia GPUs: %s",
             cudart_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }

  // TODO once we've squashed the remaining corner cases remove this log
  LOG(resp->ch.verbose, "wiring cudart library functions in %s\n", cudart_lib_path);

  for (i = 0; l[i].s != NULL; i++) {
    // TODO once we've squashed the remaining corner cases remove this log
    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);

    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    // Test the resolved symbol, not l[i].p: the latter is the address of the
    // slot inside resp->ch and is never NULL, so a missing symbol would go
    // unnoticed and be called through a NULL pointer later.
    if (!*l[i].p) {
      char *msg = LOAD_ERR();
      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
               msg);
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }

  // Selecting device 0 doubles as the "is the runtime usable" probe.
  ret = (*resp->ch.cudaSetDevice)(0);
  if (ret != CUDART_SUCCESS) {
    LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, buflen, "cudart init failure: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  int version = 0;
  cudartDriverVersion_t driverVersion;
  driverVersion.major = 0;
  driverVersion.minor = 0;

  // Report driver version if we're in verbose mode, ignore errors
  ret = (*resp->ch.cudaDriverGetVersion)(&version);
  if (ret != CUDART_SUCCESS) {
    LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
  } else {
    // The version integer is decoded as 1000 * major + 10 * minor.
    driverVersion.major = version / 1000;
    driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
    LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
  }
}
|
||||||
|
|
||||||
|
|
||||||
|
// Sums total and free device memory across every device reported by the
// CUDA runtime.
//
// On success resp->err is NULL and resp->count, resp->total and resp->free
// are populated. On failure resp->err is set to a heap-allocated message
// (the caller is expected to free it) and the remaining fields must not be
// relied upon.
void cudart_check_vram(cudart_handle_t h, mem_info_t *resp) {
  resp->err = NULL;
  cudartMemory_t memInfo = {0,0,0};
  cudartReturn_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  if (h.handle == NULL) {
    resp->err = strdup("cudart handle isn't initialized");
    return;
  }

  // cudaGetDeviceCount takes int type, resp->count is uint
  int deviceCount;
  ret = (*h.cudaGetDeviceCount)(&deviceCount);
  if (ret != CUDART_SUCCESS) {
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }
  resp->count = (unsigned int)deviceCount;

  resp->total = 0;
  resp->free = 0;
  // Iterate over the signed count so the loop index and bound share a type.
  for (i = 0; i < deviceCount; i++) {
    // cudaMemGetInfo takes no device argument, so select the device first.
    ret = (*h.cudaSetDevice)(i);
    if (ret != CUDART_SUCCESS) {
      snprintf(buf, buflen, "cudart device failed to initialize");
      resp->err = strdup(buf);
      return;
    }
    ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
    if (ret != CUDART_SUCCESS) {
      snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
      resp->err = strdup(buf);
      return;
    }

    // size_t is not unsigned long on every target (e.g. 64-bit Windows), so
    // widen explicitly to match the format specifier.
    LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, (unsigned long long)memInfo.total);
    LOG(h.verbose, "[%d] CUDA freeMem %llu\n", i, (unsigned long long)memInfo.free);

    resp->total += memInfo.total;
    resp->free += memInfo.free;
  }
}
|
||||||
|
|
||||||
|
// Determines the lowest CUDA compute capability among all devices visible
// to the runtime, since the weakest device limits compatibility.
//
// On success resp->err is NULL and resp->major / resp->minor hold the lowest
// capability seen (both stay 0 when no devices are reported). On failure
// resp->err is set to a heap-allocated message.
void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *resp) {
  enum { MSG_LEN = 256 };
  char msg[MSG_LEN + 1];
  cudartReturn_t status;
  int deviceTotal;
  int devMajor = 0;
  int devMinor = 0;
  int dev;

  resp->err = NULL;
  resp->major = 0;
  resp->minor = 0;

  if (h.handle == NULL) {
    resp->err = strdup("cudart handle not initialized");
    return;
  }

  status = (*h.cudaGetDeviceCount)(&deviceTotal);
  if (status != CUDART_SUCCESS) {
    snprintf(msg, MSG_LEN, "unable to get cudart device count: %d", status);
    resp->err = strdup(msg);
    return;
  }

  for (dev = 0; dev < deviceTotal; dev++) {
    status = (*h.cudaSetDevice)(dev);
    if (status != CUDART_SUCCESS) {
      resp->err = strdup("cudart device failed to initialize");
      return;
    }

    // Query major first, then minor; either failure aborts the whole scan.
    status = (*h.cudaDeviceGetAttribute)(&devMajor, cudartDevAttrComputeCapabilityMajor, dev);
    if (status == CUDART_SUCCESS) {
      status = (*h.cudaDeviceGetAttribute)(&devMinor, cudartDevAttrComputeCapabilityMinor, dev);
    }
    if (status != CUDART_SUCCESS) {
      snprintf(msg, MSG_LEN, "device compute capability lookup failure %d: %d", dev, status);
      resp->err = strdup(msg);
      return;
    }

    // Keep the smallest major.minor pair; a major of 0 means nothing has
    // been recorded yet.
    int lowerMajor = (resp->major == 0) || (devMajor < resp->major);
    int lowerMinor = (devMajor == resp->major) && (devMinor < resp->minor);
    if (lowerMajor) {
      resp->major = devMajor;
      resp->minor = devMinor;
    } else if (lowerMinor) {
      resp->minor = devMinor;
    }
  }
}
|
||||||
|
|
||||||
|
#endif // __APPLE__
|
59
gpu/gpu_info_cudart.h
Normal file
59
gpu/gpu_info_cudart.h
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
#ifndef __APPLE__
|
||||||
|
#ifndef __GPU_INFO_CUDART_H__
|
||||||
|
#define __GPU_INFO_CUDART_H__
|
||||||
|
#include "gpu_info.h"
|
||||||
|
|
||||||
|
// Just enough typedefs to dlopen/dlsym for memory information
|
||||||
|
typedef enum cudartReturn_enum {
|
||||||
|
CUDART_SUCCESS = 0,
|
||||||
|
CUDART_UNSUPPORTED = 1,
|
||||||
|
// Other values omitted for now...
|
||||||
|
} cudartReturn_t;
|
||||||
|
|
||||||
|
typedef enum cudartDeviceAttr_enum {
|
||||||
|
cudartDevAttrComputeCapabilityMajor = 75,
|
||||||
|
cudartDevAttrComputeCapabilityMinor = 76,
|
||||||
|
} cudartDeviceAttr_t;
|
||||||
|
|
||||||
|
typedef void *cudartDevice_t; // Opaque is sufficient
|
||||||
|
typedef struct cudartMemory_st {
|
||||||
|
size_t total;
|
||||||
|
size_t free;
|
||||||
|
size_t used;
|
||||||
|
} cudartMemory_t;
|
||||||
|
|
||||||
|
typedef struct cudartDriverVersion {
|
||||||
|
int major;
|
||||||
|
int minor;
|
||||||
|
} cudartDriverVersion_t;
|
||||||
|
|
||||||
|
typedef struct cudart_handle {
|
||||||
|
void *handle;
|
||||||
|
uint16_t verbose;
|
||||||
|
cudartReturn_t (*cudaSetDevice)(int device);
|
||||||
|
cudartReturn_t (*cudaDeviceSynchronize)(void);
|
||||||
|
cudartReturn_t (*cudaDeviceReset)(void);
|
||||||
|
cudartReturn_t (*cudaMemGetInfo)(size_t *, size_t *);
|
||||||
|
cudartReturn_t (*cudaGetDeviceCount)(int *);
|
||||||
|
cudartReturn_t (*cudaDeviceGetAttribute)(int* value, cudartDeviceAttr_t attr, int device);
|
||||||
|
cudartReturn_t (*cudaDriverGetVersion) (int *driverVersion);
|
||||||
|
} cudart_handle_t;
|
||||||
|
|
||||||
|
typedef struct cudart_init_resp {
|
||||||
|
char *err; // If err is non-null handle is invalid
|
||||||
|
cudart_handle_t ch;
|
||||||
|
} cudart_init_resp_t;
|
||||||
|
|
||||||
|
typedef struct cudart_compute_capability {
|
||||||
|
char *err;
|
||||||
|
int major;
|
||||||
|
int minor;
|
||||||
|
} cudart_compute_capability_t;
|
||||||
|
|
||||||
|
|
||||||
|
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
|
||||||
|
void cudart_check_vram(cudart_handle_t ch, mem_info_t *resp);
|
||||||
|
void cudart_compute_capability(cudart_handle_t th, cudart_compute_capability_t *cc);
|
||||||
|
|
||||||
|
#endif // __GPU_INFO_CUDART_H__
|
||||||
|
#endif // __APPLE__
|
|
@ -1,10 +1,10 @@
|
||||||
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
|
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
|
||||||
|
|
||||||
#include "gpu_info_cuda.h"
|
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
|
#include "gpu_info_nvml.h"
|
||||||
|
|
||||||
|
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
|
||||||
nvmlReturn_t ret;
|
nvmlReturn_t ret;
|
||||||
resp->err = NULL;
|
resp->err = NULL;
|
||||||
const int buflen = 256;
|
const int buflen = 256;
|
||||||
|
@ -30,20 +30,20 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
|
||||||
{NULL, NULL},
|
{NULL, NULL},
|
||||||
};
|
};
|
||||||
|
|
||||||
resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
|
resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
|
||||||
if (!resp->ch.handle) {
|
if (!resp->ch.handle) {
|
||||||
char *msg = LOAD_ERR();
|
char *msg = LOAD_ERR();
|
||||||
LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
|
LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
|
||||||
snprintf(buf, buflen,
|
snprintf(buf, buflen,
|
||||||
"Unable to load %s library to query for Nvidia GPUs: %s",
|
"Unable to load %s library to query for Nvidia GPUs: %s",
|
||||||
cuda_lib_path, msg);
|
nvml_lib_path, msg);
|
||||||
free(msg);
|
free(msg);
|
||||||
resp->err = strdup(buf);
|
resp->err = strdup(buf);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO once we've squashed the remaining corner cases remove this log
|
// TODO once we've squashed the remaining corner cases remove this log
|
||||||
LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
|
LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
|
||||||
|
|
||||||
for (i = 0; l[i].s != NULL; i++) {
|
for (i = 0; l[i].s != NULL; i++) {
|
||||||
// TODO once we've squashed the remaining corner cases remove this log
|
// TODO once we've squashed the remaining corner cases remove this log
|
||||||
|
@ -82,7 +82,7 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
|
void nvml_check_vram(nvml_handle_t h, mem_info_t *resp) {
|
||||||
resp->err = NULL;
|
resp->err = NULL;
|
||||||
nvmlDevice_t device;
|
nvmlDevice_t device;
|
||||||
nvmlMemory_t memInfo = {0};
|
nvmlMemory_t memInfo = {0};
|
||||||
|
@ -92,7 +92,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
if (h.handle == NULL) {
|
if (h.handle == NULL) {
|
||||||
resp->err = strdup("nvml handle sn't initialized");
|
resp->err = strdup("nvml handle isn't initialized");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -155,15 +155,15 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, memInfo.total);
|
LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
|
||||||
LOG(h.verbose, "[%d] CUDA usedMem %llu\n", i, memInfo.used);
|
LOG(h.verbose, "[%d] CUDA freeMem %ld\n", i, memInfo.free);
|
||||||
|
|
||||||
resp->total += memInfo.total;
|
resp->total += memInfo.total;
|
||||||
resp->free += memInfo.free;
|
resp->free += memInfo.free;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
|
void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) {
|
||||||
resp->err = NULL;
|
resp->err = NULL;
|
||||||
resp->major = 0;
|
resp->major = 0;
|
||||||
resp->minor = 0;
|
resp->minor = 0;
|
|
@ -1,6 +1,6 @@
|
||||||
#ifndef __APPLE__
|
#ifndef __APPLE__
|
||||||
#ifndef __GPU_INFO_CUDA_H__
|
#ifndef __GPU_INFO_NVML_H__
|
||||||
#define __GPU_INFO_CUDA_H__
|
#define __GPU_INFO_NVML_H__
|
||||||
#include "gpu_info.h"
|
#include "gpu_info.h"
|
||||||
|
|
||||||
// Just enough typedef's to dlopen/dlsym for memory information
|
// Just enough typedef's to dlopen/dlsym for memory information
|
||||||
|
@ -20,7 +20,7 @@ typedef enum nvmlBrandType_enum
|
||||||
NVML_BRAND_UNKNOWN = 0,
|
NVML_BRAND_UNKNOWN = 0,
|
||||||
} nvmlBrandType_t;
|
} nvmlBrandType_t;
|
||||||
|
|
||||||
typedef struct cuda_handle {
|
typedef struct nvml_handle {
|
||||||
void *handle;
|
void *handle;
|
||||||
uint16_t verbose;
|
uint16_t verbose;
|
||||||
nvmlReturn_t (*nvmlInit_v2)(void);
|
nvmlReturn_t (*nvmlInit_v2)(void);
|
||||||
|
@ -35,22 +35,22 @@ typedef struct cuda_handle {
|
||||||
nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int length);
|
nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int length);
|
||||||
nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int length);
|
nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int length);
|
||||||
nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
|
nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
|
||||||
} cuda_handle_t;
|
} nvml_handle_t;
|
||||||
|
|
||||||
typedef struct cuda_init_resp {
|
typedef struct nvml_init_resp {
|
||||||
char *err; // If err is non-null handle is invalid
|
char *err; // If err is non-null handle is invalid
|
||||||
cuda_handle_t ch;
|
nvml_handle_t ch;
|
||||||
} cuda_init_resp_t;
|
} nvml_init_resp_t;
|
||||||
|
|
||||||
typedef struct cuda_compute_capability {
|
typedef struct nvml_compute_capability {
|
||||||
char *err;
|
char *err;
|
||||||
int major;
|
int major;
|
||||||
int minor;
|
int minor;
|
||||||
} cuda_compute_capability_t;
|
} nvml_compute_capability_t;
|
||||||
|
|
||||||
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp);
|
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
|
||||||
void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
|
void nvml_check_vram(nvml_handle_t ch, mem_info_t *resp);
|
||||||
void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);
|
void nvml_compute_capability(nvml_handle_t ch, nvml_compute_capability_t *cc);
|
||||||
|
|
||||||
#endif // __GPU_INFO_CUDA_H__
|
#endif // __GPU_INFO_NVML_H__
|
||||||
#endif // __APPLE__
|
#endif // __APPLE__
|
|
@ -90,6 +90,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
|
||||||
compress_libs
|
compress_libs
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ "${ARCH}" == "x86_64" ]; then
|
||||||
|
#
|
||||||
|
# ARM chips in M1/M2/M3-based Macs and NVIDIA Tegra devices do not currently support AVX extensions.
|
||||||
|
#
|
||||||
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
|
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
|
||||||
#
|
#
|
||||||
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
|
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
|
||||||
|
@ -116,6 +120,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
|
||||||
compress_libs
|
compress_libs
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
echo "Skipping CPU generation step as requested"
|
echo "Skipping CPU generation step as requested"
|
||||||
fi
|
fi
|
||||||
|
@ -142,12 +147,21 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
|
||||||
if [ -n "${CUDA_MAJOR}" ]; then
|
if [ -n "${CUDA_MAJOR}" ]; then
|
||||||
CUDA_VARIANT=_v${CUDA_MAJOR}
|
CUDA_VARIANT=_v${CUDA_MAJOR}
|
||||||
fi
|
fi
|
||||||
CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
|
if [ "${ARCH}" == "arm64" ]; then
|
||||||
|
echo "ARM CPU detected - disabling unsupported AVX instructions"
|
||||||
|
|
||||||
|
# ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
|
||||||
|
#
|
||||||
|
# CUDA compute < 6.0 lacks proper FP16 support on ARM.
|
||||||
|
# Disabling has minimal performance effect while maintaining compatibility.
|
||||||
|
ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
|
||||||
|
fi
|
||||||
|
CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
|
||||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
|
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
|
||||||
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
|
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
|
||||||
build
|
build
|
||||||
|
|
||||||
# Cary the CUDA libs as payloads to help reduce dependency burden on users
|
# Carry the CUDA libs as payloads to help reduce dependency burden on users
|
||||||
#
|
#
|
||||||
# TODO - in the future we may shift to packaging these separately and conditionally
|
# TODO - in the future we may shift to packaging these separately and conditionally
|
||||||
# downloading them in the install script.
|
# downloading them in the install script.
|
||||||
|
|
Loading…
Reference in a new issue