review comments and coverage

Daniel Hiltgen 2024-06-05 12:07:20 -07:00
parent ff4f0cbd1d
commit 6f351bf586
18 changed files with 375 additions and 456 deletions

View file

@@ -178,7 +178,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 		// Shouldn't happen, but just in case...
 		if gpuID < 0 {
 			slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
-			return []RocmGPUInfo{}
+			return nil
 		}
 		if int(major) < RocmComputeMin {
@@ -205,22 +205,17 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			matched := true
 			for _, m := range mapping {
 				if m.id == 0 {
+					// Null ID means it didn't populate, so we can't use it to match
 					continue
 				}
 				filename := filepath.Join(devDir, m.filename)
-				fp, err := os.Open(filename)
-				if err != nil {
-					slog.Debug("failed to open sysfs node", "file", filename, "error", err)
-					matched = false
-					break
-				}
-				defer fp.Close()
-				buf, err := io.ReadAll(fp)
+				buf, err := os.ReadFile(filename)
 				if err != nil {
 					slog.Debug("failed to read sysfs node", "file", filename, "error", err)
 					matched = false
 					break
 				}
+				// values here are in hex, strip off the lead 0x and parse so we can compare the numeric (decimal) values in amdgpu
 				cmp, err := strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(string(buf)), "0x"), 16, 64)
 				if err != nil {
 					slog.Debug("failed to parse sysfs node", "file", filename, "error", err)
@@ -239,13 +234,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			// Found the matching DRM directory
 			slog.Debug("matched", "amdgpu", match, "drm", devDir)
 			totalFile := filepath.Join(devDir, DRMTotalMemoryFile)
-			totalFp, err := os.Open(totalFile)
-			if err != nil {
-				slog.Debug("failed to open sysfs node", "file", totalFile, "error", err)
-				break
-			}
-			defer totalFp.Close()
-			buf, err := io.ReadAll(totalFp)
+			buf, err := os.ReadFile(totalFile)
 			if err != nil {
 				slog.Debug("failed to read sysfs node", "file", totalFile, "error", err)
 				break
@@ -284,7 +273,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 				TotalMemory: totalMemory,
 				FreeMemory:  (totalMemory - usedMemory),
 			},
-			ID:            fmt.Sprintf("%d", gpuID),
+			ID:            strconv.Itoa(gpuID),
 			Name:          name,
 			Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
 			MinimumMemory: rocmMinimumMemory,
@@ -315,7 +304,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 		libDir, err = AMDValidateLibDir()
 		if err != nil {
 			slog.Warn("unable to verify rocm library, will use cpu", "error", err)
-			return []RocmGPUInfo{}
+			return nil
 		}
 	}
 	gpuInfo.DependencyPath = libDir
@@ -326,7 +315,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 		supported, err = GetSupportedGFX(libDir)
 		if err != nil {
 			slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
-			return []RocmGPUInfo{}
+			return nil
 		}
 		slog.Debug("rocm supported GPUs", "types", supported)
 	}
@@ -434,12 +423,7 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
 }

 func getFreeMemory(usedFile string) (uint64, error) {
-	usedFp, err := os.Open(usedFile)
-	if err != nil {
-		return 0, fmt.Errorf("failed to open sysfs node %s %w", usedFile, err)
-	}
-	defer usedFp.Close()
-	buf, err := io.ReadAll(usedFp)
+	buf, err := os.ReadFile(usedFile)
 	if err != nil {
 		return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err)
 	}
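
A note on the pattern this file drops: calling os.Open with a deferred Close inside a loop keeps every handle open until the enclosing function returns, because defer runs at function exit, not loop exit. os.ReadFile opens, reads, and closes in one call, so no handle can leak. A minimal sketch of the resulting read-and-parse step (readHexNode is a hypothetical helper, not part of the commit):

package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

// readHexNode reads a sysfs attribute such as "0x744c\n" and parses it as
// hex. Safe to call from a loop: os.ReadFile closes the file itself.
func readHexNode(filename string) (uint64, error) {
	buf, err := os.ReadFile(filename)
	if err != nil {
		return 0, fmt.Errorf("failed to read sysfs node %s: %w", filename, err)
	}
	return strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(string(buf)), "0x"), 16, 64)
}

func main() {
	v, err := readHexNode("/sys/class/drm/card0/device/device")
	fmt.Println(v, err)
}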

View file

@@ -7,6 +7,7 @@ import (
 	"os"
 	"path/filepath"
 	"slices"
+	"strconv"
 	"strings"

 	"github.com/ollama/ollama/format"
@@ -124,7 +125,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			TotalMemory: totalMemory,
 			FreeMemory:  freeMemory,
 		},
-		ID:             fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
+		ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
 		DependencyPath: libDir,
 		MinimumMemory:  rocmMinimumMemory,
 		Name:           name,

View file

@@ -4,11 +4,7 @@ import (
 	"golang.org/x/sys/cpu"
 )

-func GetCPUVariant() string {
-	return getCPUCapability().ToVariant()
-}
-
-func getCPUCapability() CPUCapability {
+func GetCPUCapability() CPUCapability {
 	if cpu.X86.HasAVX2 {
 		return CPUCapabilityAVX2
 	}
@@ -16,5 +12,5 @@ func getCPUCapability() CPUCapability {
 		return CPUCapabilityAVX
 	}
 	// else LCD
-	return CPUCapabilityBase
+	return CPUCapabilityNone
}
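
Returning the typed CPUCapability instead of a string keeps the values ordered (None < AVX < AVX2), which is what makes the comparison cpuCapability < GPURunnerCPUCapability in gpu.go valid. A minimal sketch of that property:

package main

import "fmt"

type CPUCapability uint32

const (
	CPUCapabilityNone CPUCapability = iota
	CPUCapabilityAVX
	CPUCapabilityAVX2
)

func main() {
	required := CPUCapabilityAVX
	detected := CPUCapabilityNone
	// Ordered constants make "is the CPU good enough" a plain comparison,
	// which the old string variants ("", "avx", "avx2") could not express.
	fmt.Println(detected < required) // true: fall back to CPU-only mode
}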

View file

@@ -11,8 +11,6 @@ package gpu
 */
 import "C"
 import (
-	"bufio"
-	"bytes"
 	"fmt"
 	"log/slog"
 	"os"
@@ -66,54 +64,6 @@ var RocmComputeMin = 9
 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU

-var CudartLinuxGlobs = []string{
-	"/usr/local/cuda/lib64/libcudart.so*",
-	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
-	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
-	"/usr/lib/wsl/lib/libcudart.so*",
-	"/usr/lib/wsl/drivers/*/libcudart.so*",
-	"/opt/cuda/lib64/libcudart.so*",
-	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
-	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
-	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
-	"/usr/local/cuda/lib*/libcudart.so*",
-	"/usr/lib*/libcudart.so*",
-	"/usr/local/lib*/libcudart.so*",
-}
-
-var CudartWindowsGlobs = []string{
-	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
-}
-
-var NvmlWindowsGlobs = []string{
-	"c:\\Windows\\System32\\nvml.dll",
-}
-
-var NvcudaLinuxGlobs = []string{
-	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
-	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
-	"/usr/lib/*-linux-gnu/libcuda.so*",
-	"/usr/lib/wsl/lib/libcuda.so*",
-	"/usr/lib/wsl/drivers/*/libcuda.so*",
-	"/opt/cuda/lib*/libcuda.so*",
-	"/usr/local/cuda/lib*/libcuda.so*",
-	"/usr/lib*/libcuda.so*",
-	"/usr/local/lib*/libcuda.so*",
-}
-
-var NvcudaWindowsGlobs = []string{
-	"c:\\windows\\system*\\nvcuda.dll",
-}
-
-var OneapiWindowsGlobs = []string{
-	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
-}
-
-var OneapiLinuxGlobs = []string{
-	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
-	"/usr/lib*/libze_intel_gpu.so*",
-}
-
 // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
@@ -139,47 +89,24 @@ func initCudaHandles() *cudaHandles {
 	}

 	slog.Debug("searching for GPU discovery libraries for NVIDIA")
-	var cudartMgmtName string
 	var cudartMgmtPatterns []string
-	var nvcudaMgmtName string
-	var nvcudaMgmtPatterns []string
-	var nvmlMgmtName string
-	var nvmlMgmtPatterns []string

-	tmpDir, _ := PayloadsDir()
-	switch runtime.GOOS {
-	case "windows":
-		cudartMgmtName = "cudart64_*.dll"
+	// Aligned with driver, we can't carry as payloads
+	nvcudaMgmtPatterns := NvcudaGlobs
+
+	if runtime.GOOS == "windows" {
 		localAppData := os.Getenv("LOCALAPPDATA")
-		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
-		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
-		// Aligned with driver, we can't carry as payloads
-		nvcudaMgmtName = "nvcuda.dll"
-		nvcudaMgmtPatterns = NvcudaWindowsGlobs
-		// Use nvml to refresh free memory on windows only
-		nvmlMgmtName = "nvml.dll"
-		nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
-		copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
-	case "linux":
-		cudartMgmtName = "libcudart.so*"
-		if tmpDir != "" {
-			// TODO - add "payloads" for subprocess
-			cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
-		}
-		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
-		// Aligned with driver, we can't carry as payloads
-		nvcudaMgmtName = "libcuda.so*"
-		nvcudaMgmtPatterns = NvcudaLinuxGlobs
-		// nvml omitted on linux
-	default:
-		return cHandles
+		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
 	}
+	tmpDir, _ := PayloadsDir()
+	if tmpDir != "" {
+		// TODO - add "payloads" for subprocess
+		cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
+	}
+	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)

-	if len(nvmlMgmtPatterns) > 0 {
-		nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
+	if len(NvmlGlobs) > 0 {
+		nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
 		if len(nvmlLibPaths) > 0 {
 			nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
 			if nvml != nil {
@@ -190,7 +117,7 @@ func initCudaHandles() *cudaHandles {
 		}
 	}

-	nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
+	nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
 	if len(nvcudaLibPaths) > 0 {
 		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
 		if nvcuda != nil {
@@ -202,7 +129,7 @@ func initCudaHandles() *cudaHandles {
 		}
 	}

-	cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
+	cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
 	if len(cudartLibPaths) > 0 {
 		deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
 		if cudart != nil {
@@ -220,8 +147,6 @@ func initCudaHandles() *cudaHandles {
 // Note: gpuMutex must already be held
 func initOneAPIHandles() *oneapiHandles {
 	oHandles := &oneapiHandles{}
-	var oneapiMgmtName string
-	var oneapiMgmtPatterns []string

 	// Short Circuit if we already know which library to use
 	if oneapiLibPath != "" {
@@ -229,18 +154,7 @@ func initOneAPIHandles() *oneapiHandles {
 		return oHandles
 	}

-	switch runtime.GOOS {
-	case "windows":
-		oneapiMgmtName = "ze_intel_gpu64.dll"
-		oneapiMgmtPatterns = OneapiWindowsGlobs
-	case "linux":
-		oneapiMgmtName = "libze_intel_gpu.so"
-		oneapiMgmtPatterns = OneapiLinuxGlobs
-	default:
-		return oHandles
-	}
-
-	oneapiLibPaths := FindGPULibs(oneapiMgmtName, oneapiMgmtPatterns)
+	oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs)
 	if len(oneapiLibPaths) > 0 {
 		oHandles.deviceCount, oHandles.oneapi, oneapiLibPath = LoadOneapiMgmt(oneapiLibPaths)
 	}
@@ -290,7 +204,7 @@ func GetGPUInfo() GpuInfoList {
 	if !bootstrapped {
 		slog.Debug("Detecting GPUs")
 		needRefresh = false
-		cpuCapability = getCPUCapability()
+		cpuCapability = GetCPUCapability()
 		var memInfo C.mem_info_t

 		mem, err := GetCPUMem()
@@ -301,14 +215,14 @@ func GetGPUInfo() GpuInfoList {
 			GpuInfo: GpuInfo{
 				memInfo: mem,
 				Library: "cpu",
-				Variant: cpuCapability.ToVariant(),
+				Variant: cpuCapability,
 				ID:      "0",
 			},
 		}}

 		// Fallback to CPU mode if we're lacking required vector extensions on x86
 		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
-			slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability.ToString(), "detected", cpuCapability.ToString())
+			slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability, "detected", cpuCapability)
 			bootstrapped = true
 			// No need to do any GPU discovery, since we can't run on them
 			return GpuInfoList{cpus[0].GpuInfo}
@@ -357,8 +271,8 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.MinimumMemory = cudaMinimumMemory
 				gpuInfo.DependencyPath = depPath
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-				gpuInfo.DriverMajor = int(driverMajor)
-				gpuInfo.DriverMinor = int(driverMinor)
+				gpuInfo.DriverMajor = driverMajor
+				gpuInfo.DriverMinor = driverMinor

 				// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
 				cudaGPUs = append(cudaGPUs, gpuInfo)
@@ -374,16 +288,16 @@ func GetGPUInfo() GpuInfoList {
 					continue
 				}
 				devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
-				for i := 0; i < int(devCount); i++ {
+				for i := range devCount {
 					gpuInfo := OneapiGPUInfo{
 						GpuInfo: GpuInfo{
 							Library: "oneapi",
 						},
 						driverIndex: d,
-						gpuIndex:    i,
+						gpuIndex:    int(i),
 					}
 					// TODO - split bootstrapping from updating free memory
-					C.oneapi_check_vram(*oHandles.oneapi, C.int(d), C.int(i), &memInfo)
+					C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
 					// TODO - convert this to MinimumMemory based on testing...
 					var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
 					memInfo.free = C.uint64_t(totalFreeMem)
@@ -505,22 +419,6 @@ func GetGPUInfo() GpuInfoList {
 	return resp
 }

-func GetCPUMem() (memInfo, error) {
-	if runtime.GOOS == "linux" {
-		return GetLinuxMemInfo()
-	}
-	var ret memInfo
-	var info C.mem_info_t
-	C.cpu_check_ram(&info)
-	if info.err != nil {
-		defer C.free(unsafe.Pointer(info.err))
-		return ret, fmt.Errorf(C.GoString(info.err))
-	}
-	ret.FreeMemory = uint64(info.free)
-	ret.TotalMemory = uint64(info.total)
-	return ret, nil
-}
-
 func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	var ldPaths []string
@@ -646,7 +544,7 @@ func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
 			slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
 			C.free(unsafe.Pointer(resp.err))
 		} else {
-			for i := 0; i < int(resp.oh.num_drivers); i++ {
+			for i := range resp.oh.num_drivers {
 				num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
 			}
 			return num_devices, &resp.oh, libPath
@@ -682,42 +580,3 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return "", ""
 	}
 }
-
-func GetLinuxMemInfo() (memInfo, error) {
-	var mem memInfo
-	var total, available, free, buffers, cached uint64
-	f, err := os.Open("/proc/meminfo")
-	if err != nil {
-		return mem, err
-	}
-	defer f.Close()
-	s := bufio.NewScanner(f)
-	for s.Scan() {
-		switch {
-		case bytes.HasPrefix(s.Bytes(), []byte(`MemTotal:`)):
-			_, err = fmt.Sscanf(s.Text(), "MemTotal:%d", &total)
-		case bytes.HasPrefix(s.Bytes(), []byte(`MemAvailable:`)):
-			_, err = fmt.Sscanf(s.Text(), "MemAvailable:%d", &available)
-		case bytes.HasPrefix(s.Bytes(), []byte(`MemFree:`)):
-			_, err = fmt.Sscanf(s.Text(), "MemFree:%d", &free)
-		case bytes.HasPrefix(s.Bytes(), []byte(`Buffers:`)):
-			_, err = fmt.Sscanf(s.Text(), "Buffers:%d", &buffers)
-		case bytes.HasPrefix(s.Bytes(), []byte(`Cached:`)):
-			_, err = fmt.Sscanf(s.Text(), "Cached:%d", &cached)
-		default:
-			continue
-		}
-		if err != nil {
-			return mem, err
-		}
-		if total > 0 && available > 0 {
-			mem.TotalMemory = total * 1024
-			mem.FreeMemory = available * 1024
-			return mem, nil
-		}
-	}
-	mem.TotalMemory = total * 1024
-	mem.FreeMemory = (free + buffers + cached) * 1024
-	return mem, nil
-}
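
The runtime.GOOS switches deleted above are replaced by per-OS source files (gpu_linux.go and gpu_windows.go, added below): the _linux/_windows filename suffix acts as an implicit build constraint, so each platform compiles only its own globs, management-library names, and GetCPUMem, and the default:/return-nothing branches disappear. A minimal sketch of the mechanism, with hypothetical package and variable names:

// File mgmt_linux.go — compiled only when GOOS=linux. A sibling
// mgmt_windows.go would declare the same variable with a .dll pattern,
// so exactly one definition exists per target OS.
package mgmt

// MgmtName is the platform's library glob, chosen at compile time
// instead of via a runtime.GOOS switch.
var MgmtName = "libexample.so*"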

View file

@@ -24,7 +24,7 @@ func GetGPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: GetCPUVariant(),
+			Variant: GetCPUCapability(),
 			memInfo: mem,
 		},
 	}
@@ -47,7 +47,7 @@ func GetCPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: GetCPUVariant(),
+			Variant: GetCPUCapability(),
 			memInfo: mem,
 		},
 	}

View file

@@ -1,41 +0,0 @@
-#include "gpu_info.h"
-// Fallbacks for CPU mode
-
-#ifdef _WIN32
-#include <sysinfoapi.h>
-void cpu_check_ram(mem_info_t *resp) {
-  resp->err = NULL;
-  MEMORYSTATUSEX info;
-  info.dwLength = sizeof(info);
-  if (GlobalMemoryStatusEx(&info) != 0) {
-    resp->total = info.ullTotalPhys;
-    resp->free = info.ullAvailPhys;
-    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
-  } else {
-    resp->err = LOAD_ERR();
-  }
-  return;
-}
-
-#elif __linux__
-#include <errno.h>
-#include <string.h>
-#include <sys/sysinfo.h>
-void cpu_check_ram(mem_info_t *resp) {
-  struct sysinfo info;
-  resp->err = NULL;
-  if (sysinfo(&info) != 0) {
-    resp->err = strdup(strerror(errno));
-  } else {
-    resp->total = info.totalram * info.mem_unit;
-    resp->free = info.freeram * info.mem_unit;
-    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
-  }
-  return;
-}
-
-#elif __APPLE__
-// Unused - see gpu_darwin.go
-#else
-#error "Unsupported platform"
-#endif

View file

@@ -4,8 +4,7 @@
 #include <string.h>

-void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
-{
+void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
   ze_result_t ret;
   resp->err = NULL;
   resp->oh.devices = NULL;
@@ -15,8 +14,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   const int buflen = 256;
   char buf[buflen + 1];
   int i, d, count;
-  struct lookup
-  {
+  struct lookup {
     char *s;
     void **p;
   } l[] = {
@@ -32,8 +30,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   };

   resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
-  if (!resp->oh.handle)
-  {
+  if (!resp->oh.handle) {
     char *msg = LOAD_ERR();
     snprintf(buf, buflen,
              "Unable to load %s library to query for Intel GPUs: %s\n",
@@ -48,14 +45,12 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
       "wiring Level-Zero management library functions in %s\n",
       oneapi_lib_path);

-  for (i = 0; l[i].s != NULL; i++)
-  {
+  for (i = 0; l[i].s != NULL; i++) {
     // TODO once we've squashed the remaining corner cases remove this log
     LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);

     *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
-    if (!l[i].p)
-    {
+    if (!l[i].p) {
       resp->oh.handle = NULL;
       char *msg = LOAD_ERR();
       LOG(resp->oh.verbose, "dlerr: %s\n", msg);
@@ -68,8 +63,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   }

   ret = (*resp->oh.zesInit)(0);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
+  if (ret != ZE_RESULT_SUCCESS) {
     LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
     snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
     resp->err = strdup(buf);
@@ -79,8 +73,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   count = 0;
   ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
+  if (ret != ZE_RESULT_SUCCESS) {
     LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
     snprintf(buf, buflen, "unable to get driver count: %x", ret);
     resp->err = strdup(buf);
@@ -91,10 +84,10 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
   resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
   memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
-  resp->oh.devices = malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t*));
+  resp->oh.devices =
+      malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
   ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
+  if (ret != ZE_RESULT_SUCCESS) {
     LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
     snprintf(buf, buflen, "unable to get driver count: %x", ret);
     resp->err = strdup(buf);
@@ -103,19 +96,20 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   }

   for (d = 0; d < resp->oh.num_drivers; d++) {
-    ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d], &resp->oh.num_devices[d], NULL);
-    if (ret != ZE_RESULT_SUCCESS)
-    {
+    ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
+                                   &resp->oh.num_devices[d], NULL);
+    if (ret != ZE_RESULT_SUCCESS) {
       LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
       snprintf(buf, buflen, "unable to get device count: %x", ret);
       resp->err = strdup(buf);
       oneapi_release(resp->oh);
       return;
     }
-    resp->oh.devices[d] = malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
-    ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
-    if (ret != ZE_RESULT_SUCCESS)
-    {
+    resp->oh.devices[d] =
+        malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
+    ret = (*resp->oh.zesDeviceGet)(
+        resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
+    if (ret != ZE_RESULT_SUCCESS) {
       LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
       snprintf(buf, buflen, "unable to get device count: %x", ret);
       resp->err = strdup(buf);
@@ -128,8 +122,8 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   return;
 }

-void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *resp)
-{
+void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
+                       mem_info_t *resp) {
   ze_result_t ret;
   resp->err = NULL;
   uint64_t totalMem = 0;
@@ -138,12 +132,11 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   char buf[buflen + 1];
   int i, d, m;

-  if (h.handle == NULL)
-  {
+  if (h.handle == NULL) {
     resp->err = strdup("Level-Zero handle not initialized");
     return;
   }

   if (driver > h.num_drivers || device > h.num_devices[driver]) {
     resp->err = strdup("driver of device index out of bounds");
     return;
@@ -161,8 +154,7 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   props.pNext = &ext_props;

   ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
+  if (ret != ZE_RESULT_SUCCESS) {
     snprintf(buf, buflen, "unable to get device properties: %d", ret);
     resp->err = strdup(buf);
     return;
@@ -175,8 +167,7 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   // TODO - the driver isn't included - what if there are multiple drivers?
   snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);

-  if (h.verbose)
-  {
+  if (h.verbose) {
     // When in verbose mode, report more information about
     // the card we discover.
     LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
@@ -195,11 +186,11 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   // Compute Capability equivalent in resp->major, resp->minor, resp->patch

   uint32_t memCount = 0;
-  ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, NULL);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
-    snprintf(buf, buflen,
-             "unable to enumerate Level-Zero memory modules: %x", ret);
+  ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
+                                        NULL);
+  if (ret != ZE_RESULT_SUCCESS) {
+    snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
+             ret);
     resp->err = strdup(buf);
     return;
   }
@@ -209,14 +200,12 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
   (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);

-  for (m = 0; m < memCount; m++)
-  {
+  for (m = 0; m < memCount; m++) {
     zes_mem_state_t state;
     state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
     state.pNext = NULL;
     ret = (*h.zesMemoryGetState)(mems[m], &state);
-    if (ret != ZE_RESULT_SUCCESS)
-    {
+    if (ret != ZE_RESULT_SUCCESS) {
       snprintf(buf, buflen, "unable to get memory state: %x", ret);
       resp->err = strdup(buf);
       free(mems);
@@ -230,29 +219,23 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   free(mems);
 }

-void oneapi_release(oneapi_handle_t h)
-{
+void oneapi_release(oneapi_handle_t h) {
   int d;
   LOG(h.verbose, "releasing oneapi library\n");
-  for (d = 0; d < h.num_drivers; d++)
-  {
-    if (h.devices != NULL && h.devices[d] != NULL)
-    {
+  for (d = 0; d < h.num_drivers; d++) {
+    if (h.devices != NULL && h.devices[d] != NULL) {
       free(h.devices[d]);
     }
   }
-  if (h.devices != NULL)
-  {
+  if (h.devices != NULL) {
     free(h.devices);
     h.devices = NULL;
   }
-  if (h.num_devices != NULL)
-  {
+  if (h.num_devices != NULL) {
     free(h.num_devices);
     h.num_devices = NULL;
   }
-  if (h.drivers != NULL)
-  {
+  if (h.drivers != NULL) {
     free(h.drivers);
     h.drivers = NULL;
   }
@@ -261,14 +244,11 @@ void oneapi_release(oneapi_handle_t h)
   h.handle = NULL;
 }

-int oneapi_get_device_count(oneapi_handle_t h, int driver)
-{
-  if (h.handle == NULL || h.num_devices == NULL)
-  {
+int oneapi_get_device_count(oneapi_handle_t h, int driver) {
+  if (h.handle == NULL || h.num_devices == NULL) {
     return 0;
   }
-  if (driver > h.num_drivers)
-  {
+  if (driver > h.num_drivers) {
     return 0;
   }
   return (int)h.num_devices[driver];

View file

@@ -9,8 +9,7 @@
 #define ZE_BIT(_i) (1 << _i)

 // Just enough typedef's to dlopen/dlsym for memory information
-typedef enum ze_result_t
-{
+typedef enum ze_result_t {
   ZE_RESULT_SUCCESS = 0,
   // Other values omitted for now...
 } ze_result_t;
@@ -20,13 +19,11 @@ typedef struct _zes_driver_handle_t *zes_driver_handle_t;
 typedef struct _zes_device_handle_t *zes_device_handle_t;
 typedef struct _zes_mem_handle_t *zes_mem_handle_t;

-typedef enum _ze_structure_type_t
-{
+typedef enum _ze_structure_type_t {
   ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
 } ze_structure_type_t;

-typedef enum _zes_structure_type_t
-{
+typedef enum _zes_structure_type_t {
   ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
   ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
   ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
@@ -34,35 +31,29 @@ typedef enum _zes_structure_type_t
   ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
 } zes_structure_type_t;

-typedef enum _zes_mem_type_t
-{
+typedef enum _zes_mem_type_t {
   ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
 } zes_mem_type_t;

-typedef enum _zes_mem_loc_t
-{
+typedef enum _zes_mem_loc_t {
   ZES_MEM_LOC_SYSTEM = 0,
   ZES_MEM_LOC_DEVICE = 1,
   ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
 } zes_mem_loc_t;

-typedef enum _zes_mem_health_t
-{
+typedef enum _zes_mem_health_t {
   ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
 } zes_mem_health_t;

-typedef struct _ze_device_uuid_t
-{
+typedef struct _ze_device_uuid_t {
   uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
 } ze_device_uuid_t;

-typedef struct _zes_uuid_t
-{
+typedef struct _zes_uuid_t {
   uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
 } zes_uuid_t;

-typedef enum _ze_device_type_t
-{
+typedef enum _ze_device_type_t {
   ZE_DEVICE_TYPE_GPU = 1,
   ZE_DEVICE_TYPE_CPU = 2,
   ZE_DEVICE_TYPE_FPGA = 3,
@@ -71,8 +62,7 @@ typedef enum _ze_device_type_t
   ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
 } ze_device_type_t;

-typedef enum _zes_device_type_t
-{
+typedef enum _zes_device_type_t {
   ZES_DEVICE_TYPE_GPU = 1,
   ZES_DEVICE_TYPE_CPU = 2,
   ZES_DEVICE_TYPE_FPGA = 3,
@@ -82,8 +72,7 @@ typedef enum _zes_device_type_t
 } zes_device_type_t;

 typedef uint32_t ze_device_property_flags_t;
-typedef enum _ze_device_property_flag_t
-{
+typedef enum _ze_device_property_flag_t {
   ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
   ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
   ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
@@ -92,8 +81,7 @@ typedef enum _ze_device_property_flag_t
 } ze_device_property_flag_t;

 typedef uint32_t zes_device_property_flags_t;
-typedef enum _zes_device_property_flag_t
-{
+typedef enum _zes_device_property_flag_t {
   ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
   ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
   ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
@@ -101,8 +89,7 @@ typedef enum _zes_device_property_flag_t
   ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
 } zes_device_property_flag_t;

-typedef struct _ze_device_properties_t
-{
+typedef struct _ze_device_properties_t {
   ze_structure_type_t stype;
   void *pNext;
   ze_device_type_t type;
@@ -126,8 +113,7 @@ typedef struct _ze_device_properties_t
   char name[ZE_MAX_DEVICE_NAME];
 } ze_device_properties_t;

-typedef struct _zes_device_properties_t
-{
+typedef struct _zes_device_properties_t {
   zes_structure_type_t stype;
   void *pNext;
   ze_device_properties_t core;
@@ -140,8 +126,7 @@ typedef struct _zes_device_properties_t
   char driverVersion[ZES_STRING_PROPERTY_SIZE];
 } zes_device_properties_t;

-typedef struct _zes_device_ext_properties_t
-{
+typedef struct _zes_device_ext_properties_t {
   zes_structure_type_t stype;
   void *pNext;
   zes_uuid_t uuid;
@@ -149,8 +134,7 @@ typedef struct _zes_device_ext_properties_t
   zes_device_property_flags_t flags;
 } zes_device_ext_properties_t;

-typedef struct _zes_mem_properties_t
-{
+typedef struct _zes_mem_properties_t {
   zes_structure_type_t stype;
   void *pNext;
   zes_mem_type_t type;
@@ -162,8 +146,7 @@ typedef struct _zes_mem_properties_t
   int32_t numChannels;
 } zes_mem_properties_t;

-typedef struct _zes_mem_state_t
-{
+typedef struct _zes_mem_state_t {
   zes_structure_type_t stype;
   const void *pNext;
   zes_mem_health_t health;
@@ -171,15 +154,14 @@ typedef struct _zes_mem_state_t
   uint64_t size;
 } zes_mem_state_t;

-typedef struct oneapi_handle
-{
+typedef struct oneapi_handle {
   void *handle;
   uint16_t verbose;

   uint32_t num_drivers;
   zes_driver_handle_t *drivers;
   uint32_t *num_devices;
   zes_device_handle_t **devices;

   // TODO Driver major, minor information
   // int driver_major;
@@ -201,20 +183,19 @@ typedef struct oneapi_handle
 } oneapi_handle_t;

-typedef struct oneapi_init_resp
-{
+typedef struct oneapi_init_resp {
   char *err; // If err is non-null handle is invalid
   oneapi_handle_t oh;
 } oneapi_init_resp_t;

-typedef struct oneapi_version_resp
-{
+typedef struct oneapi_version_resp {
   ze_result_t status;
   char *str; // Contains version or error string if status != 0
 } oneapi_version_resp_t;

 void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
-void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *resp);
+void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
+                       mem_info_t *resp);
 void oneapi_release(oneapi_handle_t h);
 int oneapi_get_device_count(oneapi_handle_t h, int driver);

gpu/gpu_linux.go Normal file (89 lines added)
View file

@@ -0,0 +1,89 @@
package gpu
import (
"bufio"
"fmt"
"os"
"strings"
"github.com/ollama/ollama/format"
)
var CudartGlobs = []string{
"/usr/local/cuda/lib64/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/libcudart.so*",
"/usr/lib/wsl/lib/libcudart.so*",
"/usr/lib/wsl/drivers/*/libcudart.so*",
"/opt/cuda/lib64/libcudart.so*",
"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/libcudart.so*",
"/usr/local/cuda/lib*/libcudart.so*",
"/usr/lib*/libcudart.so*",
"/usr/local/lib*/libcudart.so*",
}
var NvmlGlobs = []string{}
var NvcudaGlobs = []string{
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
"/usr/lib/*-linux-gnu/libcuda.so*",
"/usr/lib/wsl/lib/libcuda.so*",
"/usr/lib/wsl/drivers/*/libcuda.so*",
"/opt/cuda/lib*/libcuda.so*",
"/usr/local/cuda/lib*/libcuda.so*",
"/usr/lib*/libcuda.so*",
"/usr/local/lib*/libcuda.so*",
}
var OneapiGlobs = []string{
"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
"/usr/lib*/libze_intel_gpu.so*",
}
var CudartMgmtName = "libcudart.so*"
var NvcudaMgmtName = "libcuda.so*"
var NvmlMgmtName = "" // not currently wired on linux
var OneapiMgmtName = "libze_intel_gpu.so"
func GetCPUMem() (memInfo, error) {
var mem memInfo
var total, available, free, buffers, cached uint64
f, err := os.Open("/proc/meminfo")
if err != nil {
return mem, err
}
defer f.Close()
s := bufio.NewScanner(f)
for s.Scan() {
line := s.Text()
switch {
case strings.HasPrefix(line, "MemTotal:"):
_, err = fmt.Sscanf(line, "MemTotal:%d", &total)
case strings.HasPrefix(line, "MemAvailable:"):
_, err = fmt.Sscanf(line, "MemAvailable:%d", &available)
case strings.HasPrefix(line, "MemFree:"):
_, err = fmt.Sscanf(line, "MemFree:%d", &free)
case strings.HasPrefix(line, "Buffers:"):
_, err = fmt.Sscanf(line, "Buffers:%d", &buffers)
case strings.HasPrefix(line, "Cached:"):
_, err = fmt.Sscanf(line, "Cached:%d", &cached)
default:
continue
}
if err != nil {
return mem, err
}
if total > 0 && available > 0 {
mem.TotalMemory = total * format.KibiByte
mem.FreeMemory = available * format.KibiByte
return mem, nil
}
}
mem.TotalMemory = total * format.KibiByte
mem.FreeMemory = (free + buffers + cached) * format.KibiByte
return mem, nil
}
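
A note on units in the new GetCPUMem: /proc/meminfo reports kB-suffixed values that are actually KiB, which is why each field is multiplied by format.KibiByte (1024) before being stored as bytes; a line like MemAvailable: 16302840 becomes 16302840 * 1024 ≈ 16.7 GB. MemAvailable is preferred when present because the kernel's estimate accounts for reclaimable page cache; the free+buffers+cached fallback only applies on older kernels that lack the field.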

gpu/gpu_windows.go Normal file (55 lines added)
View file

@@ -0,0 +1,55 @@
package gpu
import (
"fmt"
"syscall"
"unsafe"
)
type MEMORYSTATUSEX struct {
length uint32
MemoryLoad uint32
TotalPhys uint64
AvailPhys uint64
TotalPageFile uint64
AvailPageFile uint64
TotalVirtual uint64
AvailVirtual uint64
AvailExtendedVirtual uint64
}
var (
k32 = syscall.NewLazyDLL("kernel32.dll")
globalMemoryStatusExProc = k32.NewProc("GlobalMemoryStatusEx")
sizeofMemoryStatusEx = uint32(unsafe.Sizeof(MEMORYSTATUSEX{}))
)
var CudartGlobs = []string{
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
}
var NvmlGlobs = []string{
"c:\\Windows\\System32\\nvml.dll",
}
var NvcudaGlobs = []string{
"c:\\windows\\system*\\nvcuda.dll",
}
var OneapiGlobs = []string{
"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
}
var CudartMgmtName = "cudart64_*.dll"
var NvcudaMgmtName = "nvcuda.dll"
var NvmlMgmtName = "nvml.dll"
var OneapiMgmtName = "ze_intel_gpu64.dll"
func GetCPUMem() (memInfo, error) {
memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
if r1 == 0 {
return memInfo{}, fmt.Errorf("GlobalMemoryStatusEx failed: %w", err)
}
return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys}, nil
}
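
One subtlety in the new Windows path: GlobalMemoryStatusEx requires the struct's dwLength member to be set to sizeof(MEMORYSTATUSEX) before the call, which is what the length: sizeofMemoryStatusEx field initializer does; the call fails (r1 == 0) if it is left zero. Going through syscall.NewLazyDLL also means kernel32.dll is resolved on first use rather than at startup.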

View file

@@ -18,7 +18,7 @@ type GpuInfo struct {
 	Library string `json:"library,omitempty"`

 	// Optional variant to select (e.g. versions, cpu feature flags)
-	Variant string `json:"variant,omitempty"`
+	Variant CPUCapability `json:"variant"`

 	// MinimumMemory represents the minimum memory required to use the GPU
 	MinimumMemory uint64 `json:"-"`
@@ -44,21 +44,21 @@ type CPUInfo struct {
 type CudaGPUInfo struct {
 	GpuInfo
-	index int // nolint: unused
+	index int //nolint:unused,nolintlint
 }
 type CudaGPUInfoList []CudaGPUInfo

 type RocmGPUInfo struct {
 	GpuInfo
-	usedFilepath string // nolint: unused
-	index        int    // nolint: unused
+	usedFilepath string //nolint:unused,nolintlint
+	index        int    //nolint:unused,nolintlint
 }
 type RocmGPUInfoList []RocmGPUInfo

 type OneapiGPUInfo struct {
 	GpuInfo
-	driverIndex int // nolint: unused
-	gpuIndex    int // nolint: unused
+	driverIndex int //nolint:unused,nolintlint
+	gpuIndex    int //nolint:unused,nolintlint
 }
 type OneapiGPUInfoList []OneapiGPUInfo
@@ -71,8 +71,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	for _, info := range l {
 		found := false
 		requested := info.Library
-		if info.Variant != "" {
-			requested += "_" + info.Variant
+		if info.Variant != CPUCapabilityNone {
+			requested += "_" + info.Variant.String()
 		}
 		for i, lib := range libs {
 			if lib == requested {
@@ -117,30 +117,19 @@ type CPUCapability uint32
 var GPURunnerCPUCapability = CPUCapabilityAVX

 const (
-	CPUCapabilityBase CPUCapability = iota
+	CPUCapabilityNone CPUCapability = iota
 	CPUCapabilityAVX
 	CPUCapabilityAVX2
 	// TODO AVX512
 )

-func (c CPUCapability) ToString() string {
-	switch c {
-	case CPUCapabilityAVX:
-		return "AVX"
-	case CPUCapabilityAVX2:
-		return "AVX2"
-	default:
-		return "no vector extensions"
-	}
-}
-
-func (c CPUCapability) ToVariant() string {
+func (c CPUCapability) String() string {
 	switch c {
 	case CPUCapabilityAVX:
 		return "avx"
 	case CPUCapabilityAVX2:
 		return "avx2"
 	default:
-		return ""
+		return "no vector extensions"
 	}
 }
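
Renaming ToString to String makes CPUCapability satisfy fmt.Stringer, which is why the slog.Warn call in gpu.go can now pass GPURunnerCPUCapability and cpuCapability directly: both fmt and slog format a value through its String method. A minimal sketch of that interaction:

package main

import "log/slog"

type CPUCapability uint32

const (
	CPUCapabilityNone CPUCapability = iota
	CPUCapabilityAVX
	CPUCapabilityAVX2
)

// String satisfies fmt.Stringer, so %v, %s, and slog all pick it up.
func (c CPUCapability) String() string {
	switch c {
	case CPUCapabilityAVX:
		return "avx"
	case CPUCapabilityAVX2:
		return "avx2"
	default:
		return "no vector extensions"
	}
}

func main() {
	// Logs: ... required=avx detected="no vector extensions"
	slog.Warn("CPU does not have minimum vector extensions",
		"required", CPUCapabilityAVX, "detected", CPUCapabilityNone)
}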

View file

@@ -11,7 +11,8 @@ import (
 )

 func TestContextExhaustion(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute) // Longer needed for small footprint GPUs
+	// Longer needed for small footprint GPUs
+	ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute)
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{

View file

@@ -1,7 +1,6 @@
 package llm

 import (
-	"fmt"
 	"log/slog"
 	"strconv"
 	"strings"
@@ -69,13 +68,9 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// Conditional output size on GPU 0
 	var memoryLayerOutput uint64
-	var includeOutput bool

-	// One extra layer as a pad for each GPU
-	var layerBuffer uint64
-
-	// The sizes of the main layers
-	var layerSizes []uint64
+	// The sizes of a layer
+	var layerSize uint64

 	// The sum of all the layer sizes (just for logging)
 	var memoryWeights uint64
@@ -102,12 +97,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	layers := ggml.Tensors().Layers()
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
-		layerBuffer = blk0.size()
+		layerSize = blk0.size()
+	} else {
+		slog.Warn("model missing blk.0 layer size")
 	}

 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()

+	// KV is proportional to the number of layers
+	layerSize += kv / ggml.KV().BlockCount()
+
 	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
@@ -119,6 +119,9 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
+	} else if len(gpus) > 1 {
+		// multigpu should always use the partial graph size
+		graphFullOffload = graphPartialOffload
 	}

 	if layer, ok := layers["output_norm"]; ok {
@@ -130,16 +133,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		memoryLayerOutput += layer.size()
 	}

-	if gpus[0].Library == "metal" && opts.UseMMap {
-		includeOutput = true
-	} else if gpus[0].Library != "metal" || !opts.UseMMap {
-		includeOutput = true
-	}
-
+	// Output layer handled at the end if we have space
 	gpuZeroOverhead := projectorSize
-	if includeOutput {
-		gpuZeroOverhead += memoryLayerOutput
-	}

 	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
@@ -156,12 +151,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			gzo = gpuZeroOverhead
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
-		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerBuffer {
+		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
 			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
 			continue
 		}
 		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
-		gpuAllocations[i] += gpus[i].MinimumMemory + layerBuffer // We hold off on graph until we know partial vs. full
+		gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
 	}

 	var gpuZeroID int
@@ -170,23 +165,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		gpuAllocations[gpuZeroID] += gpuZeroOverhead
 	}

-	layerSizes = make([]uint64, int(ggml.KV().BlockCount()))
-	for i := range int(ggml.KV().BlockCount()) {
-		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
-			memoryLayer := blk.size()
-
-			// KV is proportional to the number of layers
-			memoryLayer += kv / ggml.KV().BlockCount()
-			layerSizes[i] = memoryLayer
-			memoryWeights += memoryLayer
-		}
-	}
-
 	// For all the layers, find where they can fit on the GPU(s)
-	for i := range layerSizes {
-		if layerSizes[i] == 0 {
-			continue
-		}
+	for i := range int(ggml.KV().BlockCount()) {
+		memoryWeights += layerSize
+
 		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
 			// Stop allocating on GPU(s) once we hit the users target NumGPU
 			continue
@@ -196,8 +178,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[i%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+layerSizes[i] {
-				gpuAllocations[g.i] += layerSizes[i]
+			if g.g.FreeMemory > used+layerSize {
+				gpuAllocations[g.i] += layerSize
 				layerCounts[g.i]++
 				layerCount++
 				break
@@ -205,17 +187,18 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
 			}
 		}
 	}
 	if layerCount >= int(ggml.KV().BlockCount()) {
 		fullyLoaded = true
 	} else {
 		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
-			overflow += layerSizes[i]
+			overflow += layerSize
 		}
 	}
-	// Find where the output fits
-	if includeOutput && memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
+
+	// Determine if we need to consider output then find where it fits
+	if ((gpus[0].Library == "metal" && opts.UseMMap) || (gpus[0].Library != "metal" || !opts.UseMMap)) &&
+		memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[layerCount%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
@@ -226,6 +209,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 				break
 			}
 		}
+
 		if layerCount < int(ggml.KV().BlockCount())+1 {
 			fullyLoaded = false
 			overflow += memoryLayerOutput
@@ -253,7 +237,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	var memoryRequiredPartial, memoryRequiredTotal uint64
 	for i := range gpuAllocations {
 		memoryRequiredPartial += gpuAllocations[i]
-
 	}
 	memoryRequiredTotal = memoryRequiredPartial + overflow
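
To make the new layerSize arithmetic concrete: for a hypothetical 32-block model with n_ctx=2048, n_embd=4096, n_head=32, and n_head_kv=32, the fp16 KV cache is 2 * 2 * 2048 * 32 * 4096 / 32 * 32 = 1,073,741,824 bytes (1 GiB), so kv / BlockCount adds 32 MiB to each layer's blk.0-derived weight size. Treating every block as the same size is what lets the per-layer layerSizes slice, with its fmt.Sprintf("blk.%d") lookups, collapse into a single layerSize value.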

View file

@@ -18,7 +18,7 @@ func TestEstimateGPULayers(t *testing.T) {
 	envconfig.Debug = true
 	modelName := "dummy"
 	f, err := os.CreateTemp(t.TempDir(), modelName)
-	assert.Nil(t, err)
+	require.NoError(t, err)
 	defer f.Close()
 	gguf := NewGGUFV3(binary.LittleEndian)
 	inputLayerCount := 5
@@ -30,7 +30,7 @@ func TestEstimateGPULayers(t *testing.T) {
 		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
 	}
-	assert.Equal(t, inputLayerCount+1, len(tensors))
+	assert.Len(t, tensors, inputLayerCount+1)
 	err = gguf.Encode(f, KV{
 		"general.architecture": "llama",
 		"general.name":         "name",
@@ -56,9 +56,11 @@ func TestEstimateGPULayers(t *testing.T) {
 	}
 	projectors := []string{}
 	opts := api.DefaultOptions()
-	estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
-	assert.Equal(t, 0, estimate.Layers)
-	assert.Equal(t, uint64(0), estimate.Graph)
+	t.Run("cpu", func(t *testing.T) {
+		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+		assert.Equal(t, 0, estimate.Layers)
+		assert.Equal(t, uint64(0), estimate.Graph)
+	})

 	// derived from the dummy ggml file above
 	graphPartialOffload := uint64(202377216)
@@ -80,7 +82,10 @@ func TestEstimateGPULayers(t *testing.T) {
 		},
 	}
 	// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
-	for i, s := range [][]uint64{
+	for i, s := range []struct {
+		layer0, layer1   uint64
+		expect0, expect1 uint64
+	}{
 		{1, 1, 1, 1},
 		{2, 1, 2, 1},
 		{2, 2, 2, 2},
@@ -90,27 +95,33 @@ func TestEstimateGPULayers(t *testing.T) {
 		{6, 6, 3, 3},
 		{0, 3, 0, 3},
 	} {
-		gpus[0].FreeMemory = 0
-		gpus[1].FreeMemory = 0
-		gpus[0].FreeMemory += projectorSize + memoryLayerOutput
-		gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s[0]*layerSize + 1
-		gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s[1]*layerSize + 1
-		gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
-		gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
-		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
-		assert.Equal(t, int(s[2]+s[3]), estimate.Layers, "scenario %d: %v", i, s)
-		assert.Equal(t, fmt.Sprintf("%d,%d", s[2], s[3]), estimate.TensorSplit, "scenario %d: %v", i, s)
-		var layerSums uint64
-		for _, b := range estimate.GPUSizes {
-			layerSums += b
-		}
-		if estimate.Layers < inputLayerCount+1 {
-			assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
-			assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
-		} else {
-			assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
-			assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
-		}
+		t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
+			gpus[0].FreeMemory = 0
+			gpus[1].FreeMemory = 0
+			gpus[0].FreeMemory += projectorSize
+			if s.layer0 > 0 {
+				gpus[0].FreeMemory += memoryLayerOutput
+			} else {
+				gpus[1].FreeMemory += memoryLayerOutput
+			}
+			gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
+			gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
+			gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
+			gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
+			estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+			assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
+			assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
+			var layerSums uint64
+			for _, b := range estimate.GPUSizes {
+				layerSums += b
+			}
+			if estimate.Layers < inputLayerCount+1 {
+				assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+				assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+			} else {
+				assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+				assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+			}
+		})
 	}
 }
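
The refactor above swaps an index-based [][]uint64 table for a struct with named fields and wraps each case in t.Run, so a failing scenario is reported under its own name and one failure no longer obscures the rest of the table. A generic illustration of the same pattern (the add function is hypothetical, not code from this diff):

package demo

import (
	"fmt"
	"testing"
)

func add(a, b int) int { return a + b }

func TestAdd(t *testing.T) {
	for _, tc := range []struct {
		a, b, want int // named fields replace opaque s[0]..s[2] indices
	}{
		{1, 2, 3},
		{2, 2, 4},
	} {
		// Each case becomes an independently named subtest.
		t.Run(fmt.Sprintf("%d+%d", tc.a, tc.b), func(t *testing.T) {
			if got := add(tc.a, tc.b); got != tc.want {
				t.Errorf("add(%d, %d) = %d, want %d", tc.a, tc.b, got, tc.want)
			}
		})
	}
}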

View file

@@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := availableServers()
 	requested := info.Library
-	if info.Variant != "" {
-		requested += "_" + info.Variant
+	if info.Variant != gpu.CPUCapabilityNone {
+		requested += "_" + info.Variant.String()
 	}
 	servers := []string{}
@@ -117,14 +117,14 @@ func serversForGpu(info gpu.GpuInfo) []string {
 	// Load up the best CPU variant if not primary requested
 	if info.Library != "cpu" {
-		variant := gpu.GetCPUVariant()
+		variant := gpu.GetCPUCapability()
 		// If no variant, then we fall back to default
 		// If we have a variant, try that if we find an exact match
 		// Attempting to run the wrong CPU instructions will panic the
 		// process
-		if variant != "" {
+		if variant != gpu.CPUCapabilityNone {
 			for cmp := range availableServers {
-				if cmp == "cpu_"+variant {
+				if cmp == "cpu_"+variant.String() {
 					servers = append(servers, cmp)
 					break
 				}
@@ -146,11 +146,11 @@ func serverForCpu() string {
 	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
 		return "metal"
 	}
-	variant := gpu.GetCPUVariant()
+	variant := gpu.GetCPUCapability()
 	availableServers := availableServers()
-	if variant != "" {
+	if variant != gpu.CPUCapabilityNone {
 		for cmp := range availableServers {
-			if cmp == "cpu_"+variant {
+			if cmp == "cpu_"+variant.String() {
 				return cmp
 			}
 		}
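
These call sites imply that gpu.GetCPUCapability() now returns a small typed value rather than a raw string, with gpu.CPUCapabilityNone as the zero value and a String() method used to build the "cpu_<variant>" server name. The type definition itself is not part of this diff; a plausible sketch, assuming AVX-level constants (those names are assumptions):

package gpu

// CPUCapability is a sketch of the enum the call sites above suggest.
type CPUCapability uint32

const (
	CPUCapabilityNone CPUCapability = iota
	CPUCapabilityAVX
	CPUCapabilityAVX2
)

func (c CPUCapability) String() string {
	switch c {
	case CPUCapabilityAVX:
		return "avx"
	case CPUCapabilityAVX2:
		return "avx2"
	default:
		return ""
	}
}

Comparing against a typed sentinel like CPUCapabilityNone removes the stringly-typed variant != "" checks the old code relied on, and the compiler now rejects accidental mixing of variant strings with other strings.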

View file

@@ -39,7 +39,7 @@ type LlamaServer interface {
 	Close() error
 	EstimatedVRAM() uint64 // Total VRAM across all GPUs
 	EstimatedTotal() uint64
-	EstimagedVRAMByGPU(gpuID string) uint64
+	EstimatedVRAMByGPU(gpuID string) uint64
 }

 // llmServer is an instance of the llama.cpp server
@@ -1016,7 +1016,7 @@ func (s *llmServer) EstimatedTotal() uint64 {
 	return s.estimate.TotalSize
 }

-func (s *llmServer) EstimagedVRAMByGPU(gpuID string) uint64 {
+func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 {
 	for i, gpu := range s.gpus {
 		if gpu.ID == gpuID {
 			return s.estimate.GPUSizes[i]
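
With the Estimaged typo fixed in both the interface and the implementation, callers can sum per-GPU estimates through the interface. A hedged usage sketch (the helper below is illustrative, not part of the diff):

// totalEstimatedVRAM sums the per-GPU VRAM estimates for the given IDs.
// LlamaServer is the interface shown above; gpuIDs is a hypothetical input.
func totalEstimatedVRAM(llama LlamaServer, gpuIDs []string) uint64 {
	var total uint64
	for _, id := range gpuIDs {
		total += llama.EstimatedVRAMByGPU(id)
	}
	return total
}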

View file

@@ -182,7 +182,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			// We want to avoid loading on any GPUs that have other
 			// models still loading on them to avoid potential races
 			// with VRAM consumption ramping up during load
-			availGpus := s.filterGPUsWithLoadingModels(gpus)
+			availGpus := s.filterGPUsWithoutLoadingModels(gpus)

 			// Update free memory from currently loaded models
 			s.updateFreeSpace(availGpus)
@@ -414,9 +414,7 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 		r.refMu.Lock()
 		if r.llama != nil {
 			for _, gpu := range allGpus {
-				// if slices.Contains(gpuIDs, gpu.ID) {
-				predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimagedVRAMByGPU(gpu.ID)
-				// }
+				predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimatedVRAMByGPU(gpu.ID)
 			}
 		} else {
 			slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
@@ -448,7 +446,7 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 // to avoid scheduling another model on the same GPU(s) that haven't stabilized.
 // This routine returns the set of GPUs that do not have an active loading model.
 // If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
-func (s *Scheduler) filterGPUsWithLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
 	ret := append(gpu.GpuInfoList{}, allGpus...)
 	s.loadedMu.Lock()
 	defer s.loadedMu.Unlock()
@@ -702,5 +700,4 @@ func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML,
 	// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
 	return s.findRunnerToUnload()
 }
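
The rename makes the function's name match its documented behavior: it returns the GPUs *without* a loading model. Most of its body is outside this diff; a sketch consistent with the doc comment and the new test below, assuming the loading and gpus fields that the test exercises (the removal logic is an assumption, not the repo's actual body):

func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
	ret := append(gpu.GpuInfoList{}, allGpus...)
	s.loadedMu.Lock()
	defer s.loadedMu.Unlock()
	for _, r := range s.loaded {
		if !r.loading {
			continue
		}
		// Drop every GPU this still-loading runner occupies.
		for _, g := range r.gpus {
			for i := range ret {
				if ret[i].ID == g.ID {
					ret = append(ret[:i], ret[i+1:]...)
					break
				}
			}
		}
	}
	return ret
}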

View file

@@ -156,7 +156,7 @@ func TestRequests(t *testing.T) {
 	// Same model, same request
 	scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
-	scenario1a.req.sessionDuration = 0
+	scenario1a.req.sessionDuration = 5 * time.Millisecond
 	scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
 	scenario1b.req.model = scenario1a.req.model
 	scenario1b.ggml = scenario1a.ggml
@@ -167,6 +167,7 @@ func TestRequests(t *testing.T) {
 	tmpModel := *scenario1a.req.model
 	scenario2a.req.model = &tmpModel
 	scenario2a.ggml = scenario1a.ggml
+	scenario2a.req.sessionDuration = 5 * time.Millisecond

 	// Multiple loaded models
 	scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
@@ -316,7 +317,6 @@ func TestGetRunner(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()

-	// Same model, same request
 	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
 	scenario1a.req.sessionDuration = 0
 	scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
@@ -475,6 +475,40 @@ func TestUpdateFreeSpace(t *testing.T) {
 	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
 }

+func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
+	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	defer done()
+	gpus := gpu.GpuInfoList{
+		{
+			Library: "cuda",
+			ID:      "0",
+		},
+		{
+			Library: "cuda",
+			ID:      "1",
+		},
+	}
+	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
+	s := InitScheduler(ctx)
+	s.loadedMu.Lock()
+	s.loaded["a"] = r1
+	s.loadedMu.Unlock()
+
+	tmp := s.filterGPUsWithoutLoadingModels(gpus)
+	require.Len(t, tmp, 1)
+	require.Equal(t, "1", tmp[0].ID)
+
+	r1.gpus = gpu.GpuInfoList{gpus[1]}
+	tmp = s.filterGPUsWithoutLoadingModels(gpus)
+	require.Len(t, tmp, 1)
+	require.Equal(t, "0", tmp[0].ID)
+
+	r1.gpus = gpu.GpuInfoList{}
+	tmp = s.filterGPUsWithoutLoadingModels(gpus)
+	require.Len(t, tmp, 2)
+}
+
 func TestFindRunnerToUnload(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
@@ -607,4 +641,4 @@ func (s *mockLlm) Close() error {
 }
 func (s *mockLlm) EstimatedVRAM() uint64  { return s.estimatedVRAM }
 func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
-func (s *mockLlm) EstimagedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
+func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
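
The Estimaged to Estimated rename had to land in the interface, the implementation, and this mock in lockstep. A compile-time assertion is one way to catch such drift immediately; this is a suggestion, not present in the diff, and it assumes mockLlm implements the full LlamaServer interface from the llm package:

// Fails to compile if mockLlm stops satisfying the interface,
// e.g. when an interface method is renamed but the mock is not.
var _ llm.LlamaServer = (*mockLlm)(nil)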