review comments and coverage
This commit is contained in:
parent
ff4f0cbd1d
commit
6f351bf586
18 changed files with 375 additions and 456 deletions
|
@ -178,7 +178,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
|
||||||
// Shouldn't happen, but just in case...
|
// Shouldn't happen, but just in case...
|
||||||
if gpuID < 0 {
|
if gpuID < 0 {
|
||||||
slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
|
slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
|
||||||
return []RocmGPUInfo{}
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if int(major) < RocmComputeMin {
|
if int(major) < RocmComputeMin {
|
||||||
|
@ -205,22 +205,17 @@ func AMDGetGPUInfo() []RocmGPUInfo {
|
||||||
matched := true
|
matched := true
|
||||||
for _, m := range mapping {
|
for _, m := range mapping {
|
||||||
if m.id == 0 {
|
if m.id == 0 {
|
||||||
|
// Null ID means it didn't populate, so we can't use it to match
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
filename := filepath.Join(devDir, m.filename)
|
filename := filepath.Join(devDir, m.filename)
|
||||||
fp, err := os.Open(filename)
|
buf, err := os.ReadFile(filename)
|
||||||
if err != nil {
|
|
||||||
slog.Debug("failed to open sysfs node", "file", filename, "error", err)
|
|
||||||
matched = false
|
|
||||||
break
|
|
||||||
}
|
|
||||||
defer fp.Close()
|
|
||||||
buf, err := io.ReadAll(fp)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Debug("failed to read sysfs node", "file", filename, "error", err)
|
slog.Debug("failed to read sysfs node", "file", filename, "error", err)
|
||||||
matched = false
|
matched = false
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
// values here are in hex, strip off the lead 0x and parse so we can compare the numeric (decimal) values in amdgpu
|
||||||
cmp, err := strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(string(buf)), "0x"), 16, 64)
|
cmp, err := strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(string(buf)), "0x"), 16, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Debug("failed to parse sysfs node", "file", filename, "error", err)
|
slog.Debug("failed to parse sysfs node", "file", filename, "error", err)
|
||||||
|
@ -239,13 +234,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
|
||||||
// Found the matching DRM directory
|
// Found the matching DRM directory
|
||||||
slog.Debug("matched", "amdgpu", match, "drm", devDir)
|
slog.Debug("matched", "amdgpu", match, "drm", devDir)
|
||||||
totalFile := filepath.Join(devDir, DRMTotalMemoryFile)
|
totalFile := filepath.Join(devDir, DRMTotalMemoryFile)
|
||||||
totalFp, err := os.Open(totalFile)
|
buf, err := os.ReadFile(totalFile)
|
||||||
if err != nil {
|
|
||||||
slog.Debug("failed to open sysfs node", "file", totalFile, "error", err)
|
|
||||||
break
|
|
||||||
}
|
|
||||||
defer totalFp.Close()
|
|
||||||
buf, err := io.ReadAll(totalFp)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Debug("failed to read sysfs node", "file", totalFile, "error", err)
|
slog.Debug("failed to read sysfs node", "file", totalFile, "error", err)
|
||||||
break
|
break
|
||||||
|
@ -284,7 +273,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
|
||||||
TotalMemory: totalMemory,
|
TotalMemory: totalMemory,
|
||||||
FreeMemory: (totalMemory - usedMemory),
|
FreeMemory: (totalMemory - usedMemory),
|
||||||
},
|
},
|
||||||
ID: fmt.Sprintf("%d", gpuID),
|
ID: strconv.Itoa(gpuID),
|
||||||
Name: name,
|
Name: name,
|
||||||
Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
|
Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
|
||||||
MinimumMemory: rocmMinimumMemory,
|
MinimumMemory: rocmMinimumMemory,
|
||||||
|
@ -315,7 +304,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
|
||||||
libDir, err = AMDValidateLibDir()
|
libDir, err = AMDValidateLibDir()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn("unable to verify rocm library, will use cpu", "error", err)
|
slog.Warn("unable to verify rocm library, will use cpu", "error", err)
|
||||||
return []RocmGPUInfo{}
|
return nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
gpuInfo.DependencyPath = libDir
|
gpuInfo.DependencyPath = libDir
|
||||||
|
@ -326,7 +315,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
|
||||||
supported, err = GetSupportedGFX(libDir)
|
supported, err = GetSupportedGFX(libDir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
|
slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
|
||||||
return []RocmGPUInfo{}
|
return nil
|
||||||
}
|
}
|
||||||
slog.Debug("rocm supported GPUs", "types", supported)
|
slog.Debug("rocm supported GPUs", "types", supported)
|
||||||
}
|
}
|
||||||
|
@ -434,12 +423,7 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
|
||||||
}
|
}
|
||||||
|
|
||||||
func getFreeMemory(usedFile string) (uint64, error) {
|
func getFreeMemory(usedFile string) (uint64, error) {
|
||||||
usedFp, err := os.Open(usedFile)
|
buf, err := os.ReadFile(usedFile)
|
||||||
if err != nil {
|
|
||||||
return 0, fmt.Errorf("failed to open sysfs node %s %w", usedFile, err)
|
|
||||||
}
|
|
||||||
defer usedFp.Close()
|
|
||||||
buf, err := io.ReadAll(usedFp)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err)
|
return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err)
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,6 +7,7 @@ import (
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"slices"
|
"slices"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/ollama/ollama/format"
|
"github.com/ollama/ollama/format"
|
||||||
|
@ -124,7 +125,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
|
||||||
TotalMemory: totalMemory,
|
TotalMemory: totalMemory,
|
||||||
FreeMemory: freeMemory,
|
FreeMemory: freeMemory,
|
||||||
},
|
},
|
||||||
ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
|
ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
|
||||||
DependencyPath: libDir,
|
DependencyPath: libDir,
|
||||||
MinimumMemory: rocmMinimumMemory,
|
MinimumMemory: rocmMinimumMemory,
|
||||||
Name: name,
|
Name: name,
|
||||||
|
|
|
@ -4,11 +4,7 @@ import (
|
||||||
"golang.org/x/sys/cpu"
|
"golang.org/x/sys/cpu"
|
||||||
)
|
)
|
||||||
|
|
||||||
func GetCPUVariant() string {
|
func GetCPUCapability() CPUCapability {
|
||||||
return getCPUCapability().ToVariant()
|
|
||||||
}
|
|
||||||
|
|
||||||
func getCPUCapability() CPUCapability {
|
|
||||||
if cpu.X86.HasAVX2 {
|
if cpu.X86.HasAVX2 {
|
||||||
return CPUCapabilityAVX2
|
return CPUCapabilityAVX2
|
||||||
}
|
}
|
||||||
|
@ -16,5 +12,5 @@ func getCPUCapability() CPUCapability {
|
||||||
return CPUCapabilityAVX
|
return CPUCapabilityAVX
|
||||||
}
|
}
|
||||||
// else LCD
|
// else LCD
|
||||||
return CPUCapabilityBase
|
return CPUCapabilityNone
|
||||||
}
|
}
|
||||||
|
|
191
gpu/gpu.go
191
gpu/gpu.go
|
@ -11,8 +11,6 @@ package gpu
|
||||||
*/
|
*/
|
||||||
import "C"
|
import "C"
|
||||||
import (
|
import (
|
||||||
"bufio"
|
|
||||||
"bytes"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
|
@ -66,54 +64,6 @@ var RocmComputeMin = 9
|
||||||
// TODO find a better way to detect iGPU instead of minimum memory
|
// TODO find a better way to detect iGPU instead of minimum memory
|
||||||
const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
|
const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
|
||||||
|
|
||||||
var CudartLinuxGlobs = []string{
|
|
||||||
"/usr/local/cuda/lib64/libcudart.so*",
|
|
||||||
"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
|
|
||||||
"/usr/lib/x86_64-linux-gnu/libcudart.so*",
|
|
||||||
"/usr/lib/wsl/lib/libcudart.so*",
|
|
||||||
"/usr/lib/wsl/drivers/*/libcudart.so*",
|
|
||||||
"/opt/cuda/lib64/libcudart.so*",
|
|
||||||
"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
|
|
||||||
"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
|
|
||||||
"/usr/lib/aarch64-linux-gnu/libcudart.so*",
|
|
||||||
"/usr/local/cuda/lib*/libcudart.so*",
|
|
||||||
"/usr/lib*/libcudart.so*",
|
|
||||||
"/usr/local/lib*/libcudart.so*",
|
|
||||||
}
|
|
||||||
|
|
||||||
var CudartWindowsGlobs = []string{
|
|
||||||
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
|
|
||||||
}
|
|
||||||
|
|
||||||
var NvmlWindowsGlobs = []string{
|
|
||||||
"c:\\Windows\\System32\\nvml.dll",
|
|
||||||
}
|
|
||||||
|
|
||||||
var NvcudaLinuxGlobs = []string{
|
|
||||||
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
|
|
||||||
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
|
|
||||||
"/usr/lib/*-linux-gnu/libcuda.so*",
|
|
||||||
"/usr/lib/wsl/lib/libcuda.so*",
|
|
||||||
"/usr/lib/wsl/drivers/*/libcuda.so*",
|
|
||||||
"/opt/cuda/lib*/libcuda.so*",
|
|
||||||
"/usr/local/cuda/lib*/libcuda.so*",
|
|
||||||
"/usr/lib*/libcuda.so*",
|
|
||||||
"/usr/local/lib*/libcuda.so*",
|
|
||||||
}
|
|
||||||
|
|
||||||
var NvcudaWindowsGlobs = []string{
|
|
||||||
"c:\\windows\\system*\\nvcuda.dll",
|
|
||||||
}
|
|
||||||
|
|
||||||
var OneapiWindowsGlobs = []string{
|
|
||||||
"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
|
|
||||||
}
|
|
||||||
|
|
||||||
var OneapiLinuxGlobs = []string{
|
|
||||||
"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
|
|
||||||
"/usr/lib*/libze_intel_gpu.so*",
|
|
||||||
}
|
|
||||||
|
|
||||||
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
|
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
|
||||||
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
|
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
|
||||||
var CudaTegra string = os.Getenv("JETSON_JETPACK")
|
var CudaTegra string = os.Getenv("JETSON_JETPACK")
|
||||||
|
@ -139,47 +89,24 @@ func initCudaHandles() *cudaHandles {
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Debug("searching for GPU discovery libraries for NVIDIA")
|
slog.Debug("searching for GPU discovery libraries for NVIDIA")
|
||||||
var cudartMgmtName string
|
|
||||||
var cudartMgmtPatterns []string
|
var cudartMgmtPatterns []string
|
||||||
var nvcudaMgmtName string
|
|
||||||
var nvcudaMgmtPatterns []string
|
|
||||||
var nvmlMgmtName string
|
|
||||||
var nvmlMgmtPatterns []string
|
|
||||||
|
|
||||||
tmpDir, _ := PayloadsDir()
|
// Aligned with driver, we can't carry as payloads
|
||||||
switch runtime.GOOS {
|
nvcudaMgmtPatterns := NvcudaGlobs
|
||||||
case "windows":
|
|
||||||
cudartMgmtName = "cudart64_*.dll"
|
if runtime.GOOS == "windows" {
|
||||||
localAppData := os.Getenv("LOCALAPPDATA")
|
localAppData := os.Getenv("LOCALAPPDATA")
|
||||||
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
|
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
|
||||||
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
|
|
||||||
// Aligned with driver, we can't carry as payloads
|
|
||||||
nvcudaMgmtName = "nvcuda.dll"
|
|
||||||
nvcudaMgmtPatterns = NvcudaWindowsGlobs
|
|
||||||
|
|
||||||
// Use nvml to refresh free memory on windows only
|
|
||||||
nvmlMgmtName = "nvml.dll"
|
|
||||||
nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
|
|
||||||
copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
|
|
||||||
|
|
||||||
case "linux":
|
|
||||||
cudartMgmtName = "libcudart.so*"
|
|
||||||
if tmpDir != "" {
|
|
||||||
// TODO - add "payloads" for subprocess
|
|
||||||
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
|
|
||||||
}
|
|
||||||
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
|
|
||||||
// Aligned with driver, we can't carry as payloads
|
|
||||||
nvcudaMgmtName = "libcuda.so*"
|
|
||||||
nvcudaMgmtPatterns = NvcudaLinuxGlobs
|
|
||||||
|
|
||||||
// nvml omitted on linux
|
|
||||||
default:
|
|
||||||
return cHandles
|
|
||||||
}
|
}
|
||||||
|
tmpDir, _ := PayloadsDir()
|
||||||
|
if tmpDir != "" {
|
||||||
|
// TODO - add "payloads" for subprocess
|
||||||
|
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
|
||||||
|
}
|
||||||
|
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
|
||||||
|
|
||||||
if len(nvmlMgmtPatterns) > 0 {
|
if len(NvmlGlobs) > 0 {
|
||||||
nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
|
nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
|
||||||
if len(nvmlLibPaths) > 0 {
|
if len(nvmlLibPaths) > 0 {
|
||||||
nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
|
nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
|
||||||
if nvml != nil {
|
if nvml != nil {
|
||||||
|
@ -190,7 +117,7 @@ func initCudaHandles() *cudaHandles {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
|
nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
|
||||||
if len(nvcudaLibPaths) > 0 {
|
if len(nvcudaLibPaths) > 0 {
|
||||||
deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
|
deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
|
||||||
if nvcuda != nil {
|
if nvcuda != nil {
|
||||||
|
@ -202,7 +129,7 @@ func initCudaHandles() *cudaHandles {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
|
cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
|
||||||
if len(cudartLibPaths) > 0 {
|
if len(cudartLibPaths) > 0 {
|
||||||
deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
|
deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
|
||||||
if cudart != nil {
|
if cudart != nil {
|
||||||
|
@ -220,8 +147,6 @@ func initCudaHandles() *cudaHandles {
|
||||||
// Note: gpuMutex must already be held
|
// Note: gpuMutex must already be held
|
||||||
func initOneAPIHandles() *oneapiHandles {
|
func initOneAPIHandles() *oneapiHandles {
|
||||||
oHandles := &oneapiHandles{}
|
oHandles := &oneapiHandles{}
|
||||||
var oneapiMgmtName string
|
|
||||||
var oneapiMgmtPatterns []string
|
|
||||||
|
|
||||||
// Short Circuit if we already know which library to use
|
// Short Circuit if we already know which library to use
|
||||||
if oneapiLibPath != "" {
|
if oneapiLibPath != "" {
|
||||||
|
@ -229,18 +154,7 @@ func initOneAPIHandles() *oneapiHandles {
|
||||||
return oHandles
|
return oHandles
|
||||||
}
|
}
|
||||||
|
|
||||||
switch runtime.GOOS {
|
oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs)
|
||||||
case "windows":
|
|
||||||
oneapiMgmtName = "ze_intel_gpu64.dll"
|
|
||||||
oneapiMgmtPatterns = OneapiWindowsGlobs
|
|
||||||
case "linux":
|
|
||||||
oneapiMgmtName = "libze_intel_gpu.so"
|
|
||||||
oneapiMgmtPatterns = OneapiLinuxGlobs
|
|
||||||
default:
|
|
||||||
return oHandles
|
|
||||||
}
|
|
||||||
|
|
||||||
oneapiLibPaths := FindGPULibs(oneapiMgmtName, oneapiMgmtPatterns)
|
|
||||||
if len(oneapiLibPaths) > 0 {
|
if len(oneapiLibPaths) > 0 {
|
||||||
oHandles.deviceCount, oHandles.oneapi, oneapiLibPath = LoadOneapiMgmt(oneapiLibPaths)
|
oHandles.deviceCount, oHandles.oneapi, oneapiLibPath = LoadOneapiMgmt(oneapiLibPaths)
|
||||||
}
|
}
|
||||||
|
@ -290,7 +204,7 @@ func GetGPUInfo() GpuInfoList {
|
||||||
if !bootstrapped {
|
if !bootstrapped {
|
||||||
slog.Debug("Detecting GPUs")
|
slog.Debug("Detecting GPUs")
|
||||||
needRefresh = false
|
needRefresh = false
|
||||||
cpuCapability = getCPUCapability()
|
cpuCapability = GetCPUCapability()
|
||||||
var memInfo C.mem_info_t
|
var memInfo C.mem_info_t
|
||||||
|
|
||||||
mem, err := GetCPUMem()
|
mem, err := GetCPUMem()
|
||||||
|
@ -301,14 +215,14 @@ func GetGPUInfo() GpuInfoList {
|
||||||
GpuInfo: GpuInfo{
|
GpuInfo: GpuInfo{
|
||||||
memInfo: mem,
|
memInfo: mem,
|
||||||
Library: "cpu",
|
Library: "cpu",
|
||||||
Variant: cpuCapability.ToVariant(),
|
Variant: cpuCapability,
|
||||||
ID: "0",
|
ID: "0",
|
||||||
},
|
},
|
||||||
}}
|
}}
|
||||||
|
|
||||||
// Fallback to CPU mode if we're lacking required vector extensions on x86
|
// Fallback to CPU mode if we're lacking required vector extensions on x86
|
||||||
if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
|
if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
|
||||||
slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability.ToString(), "detected", cpuCapability.ToString())
|
slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability, "detected", cpuCapability)
|
||||||
bootstrapped = true
|
bootstrapped = true
|
||||||
// No need to do any GPU discovery, since we can't run on them
|
// No need to do any GPU discovery, since we can't run on them
|
||||||
return GpuInfoList{cpus[0].GpuInfo}
|
return GpuInfoList{cpus[0].GpuInfo}
|
||||||
|
@ -357,8 +271,8 @@ func GetGPUInfo() GpuInfoList {
|
||||||
gpuInfo.MinimumMemory = cudaMinimumMemory
|
gpuInfo.MinimumMemory = cudaMinimumMemory
|
||||||
gpuInfo.DependencyPath = depPath
|
gpuInfo.DependencyPath = depPath
|
||||||
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
|
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
|
||||||
gpuInfo.DriverMajor = int(driverMajor)
|
gpuInfo.DriverMajor = driverMajor
|
||||||
gpuInfo.DriverMinor = int(driverMinor)
|
gpuInfo.DriverMinor = driverMinor
|
||||||
|
|
||||||
// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
|
// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
|
||||||
cudaGPUs = append(cudaGPUs, gpuInfo)
|
cudaGPUs = append(cudaGPUs, gpuInfo)
|
||||||
|
@ -374,16 +288,16 @@ func GetGPUInfo() GpuInfoList {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
|
devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
|
||||||
for i := 0; i < int(devCount); i++ {
|
for i := range devCount {
|
||||||
gpuInfo := OneapiGPUInfo{
|
gpuInfo := OneapiGPUInfo{
|
||||||
GpuInfo: GpuInfo{
|
GpuInfo: GpuInfo{
|
||||||
Library: "oneapi",
|
Library: "oneapi",
|
||||||
},
|
},
|
||||||
driverIndex: d,
|
driverIndex: d,
|
||||||
gpuIndex: i,
|
gpuIndex: int(i),
|
||||||
}
|
}
|
||||||
// TODO - split bootstrapping from updating free memory
|
// TODO - split bootstrapping from updating free memory
|
||||||
C.oneapi_check_vram(*oHandles.oneapi, C.int(d), C.int(i), &memInfo)
|
C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
|
||||||
// TODO - convert this to MinimumMemory based on testing...
|
// TODO - convert this to MinimumMemory based on testing...
|
||||||
var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
|
var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
|
||||||
memInfo.free = C.uint64_t(totalFreeMem)
|
memInfo.free = C.uint64_t(totalFreeMem)
|
||||||
|
@ -505,22 +419,6 @@ func GetGPUInfo() GpuInfoList {
|
||||||
return resp
|
return resp
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetCPUMem() (memInfo, error) {
|
|
||||||
if runtime.GOOS == "linux" {
|
|
||||||
return GetLinuxMemInfo()
|
|
||||||
}
|
|
||||||
var ret memInfo
|
|
||||||
var info C.mem_info_t
|
|
||||||
C.cpu_check_ram(&info)
|
|
||||||
if info.err != nil {
|
|
||||||
defer C.free(unsafe.Pointer(info.err))
|
|
||||||
return ret, fmt.Errorf(C.GoString(info.err))
|
|
||||||
}
|
|
||||||
ret.FreeMemory = uint64(info.free)
|
|
||||||
ret.TotalMemory = uint64(info.total)
|
|
||||||
return ret, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
|
func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
|
||||||
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
|
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
|
||||||
var ldPaths []string
|
var ldPaths []string
|
||||||
|
@ -646,7 +544,7 @@ func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
|
||||||
slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
|
slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
|
||||||
C.free(unsafe.Pointer(resp.err))
|
C.free(unsafe.Pointer(resp.err))
|
||||||
} else {
|
} else {
|
||||||
for i := 0; i < int(resp.oh.num_drivers); i++ {
|
for i := range resp.oh.num_drivers {
|
||||||
num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
|
num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
|
||||||
}
|
}
|
||||||
return num_devices, &resp.oh, libPath
|
return num_devices, &resp.oh, libPath
|
||||||
|
@ -682,42 +580,3 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
|
||||||
return "", ""
|
return "", ""
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetLinuxMemInfo() (memInfo, error) {
|
|
||||||
var mem memInfo
|
|
||||||
var total, available, free, buffers, cached uint64
|
|
||||||
f, err := os.Open("/proc/meminfo")
|
|
||||||
if err != nil {
|
|
||||||
return mem, err
|
|
||||||
}
|
|
||||||
defer f.Close()
|
|
||||||
s := bufio.NewScanner(f)
|
|
||||||
for s.Scan() {
|
|
||||||
switch {
|
|
||||||
case bytes.HasPrefix(s.Bytes(), []byte(`MemTotal:`)):
|
|
||||||
_, err = fmt.Sscanf(s.Text(), "MemTotal:%d", &total)
|
|
||||||
case bytes.HasPrefix(s.Bytes(), []byte(`MemAvailable:`)):
|
|
||||||
_, err = fmt.Sscanf(s.Text(), "MemAvailable:%d", &available)
|
|
||||||
case bytes.HasPrefix(s.Bytes(), []byte(`MemFree:`)):
|
|
||||||
_, err = fmt.Sscanf(s.Text(), "MemFree:%d", &free)
|
|
||||||
case bytes.HasPrefix(s.Bytes(), []byte(`Buffers:`)):
|
|
||||||
_, err = fmt.Sscanf(s.Text(), "Buffers:%d", &buffers)
|
|
||||||
case bytes.HasPrefix(s.Bytes(), []byte(`Cached:`)):
|
|
||||||
_, err = fmt.Sscanf(s.Text(), "Cached:%d", &cached)
|
|
||||||
default:
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if err != nil {
|
|
||||||
return mem, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if total > 0 && available > 0 {
|
|
||||||
mem.TotalMemory = total * 1024
|
|
||||||
mem.FreeMemory = available * 1024
|
|
||||||
return mem, nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
mem.TotalMemory = total * 1024
|
|
||||||
mem.FreeMemory = (free + buffers + cached) * 1024
|
|
||||||
return mem, nil
|
|
||||||
}
|
|
||||||
|
|
|
@ -24,7 +24,7 @@ func GetGPUInfo() GpuInfoList {
|
||||||
return []GpuInfo{
|
return []GpuInfo{
|
||||||
{
|
{
|
||||||
Library: "cpu",
|
Library: "cpu",
|
||||||
Variant: GetCPUVariant(),
|
Variant: GetCPUCapability(),
|
||||||
memInfo: mem,
|
memInfo: mem,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -47,7 +47,7 @@ func GetCPUInfo() GpuInfoList {
|
||||||
return []GpuInfo{
|
return []GpuInfo{
|
||||||
{
|
{
|
||||||
Library: "cpu",
|
Library: "cpu",
|
||||||
Variant: GetCPUVariant(),
|
Variant: GetCPUCapability(),
|
||||||
memInfo: mem,
|
memInfo: mem,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,41 +0,0 @@
|
||||||
#include "gpu_info.h"
|
|
||||||
// Fallbacks for CPU mode
|
|
||||||
|
|
||||||
#ifdef _WIN32
|
|
||||||
#include <sysinfoapi.h>
|
|
||||||
void cpu_check_ram(mem_info_t *resp) {
|
|
||||||
resp->err = NULL;
|
|
||||||
MEMORYSTATUSEX info;
|
|
||||||
info.dwLength = sizeof(info);
|
|
||||||
if (GlobalMemoryStatusEx(&info) != 0) {
|
|
||||||
resp->total = info.ullTotalPhys;
|
|
||||||
resp->free = info.ullAvailPhys;
|
|
||||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
|
|
||||||
} else {
|
|
||||||
resp->err = LOAD_ERR();
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
#elif __linux__
|
|
||||||
#include <errno.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <sys/sysinfo.h>
|
|
||||||
void cpu_check_ram(mem_info_t *resp) {
|
|
||||||
struct sysinfo info;
|
|
||||||
resp->err = NULL;
|
|
||||||
if (sysinfo(&info) != 0) {
|
|
||||||
resp->err = strdup(strerror(errno));
|
|
||||||
} else {
|
|
||||||
resp->total = info.totalram * info.mem_unit;
|
|
||||||
resp->free = info.freeram * info.mem_unit;
|
|
||||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
#elif __APPLE__
|
|
||||||
// Unused - see gpu_darwin.go
|
|
||||||
#else
|
|
||||||
#error "Unsupported platform"
|
|
||||||
#endif
|
|
|
@ -4,8 +4,7 @@
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
|
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
|
||||||
{
|
|
||||||
ze_result_t ret;
|
ze_result_t ret;
|
||||||
resp->err = NULL;
|
resp->err = NULL;
|
||||||
resp->oh.devices = NULL;
|
resp->oh.devices = NULL;
|
||||||
|
@ -15,8 +14,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
|
||||||
const int buflen = 256;
|
const int buflen = 256;
|
||||||
char buf[buflen + 1];
|
char buf[buflen + 1];
|
||||||
int i, d, count;
|
int i, d, count;
|
||||||
struct lookup
|
struct lookup {
|
||||||
{
|
|
||||||
char *s;
|
char *s;
|
||||||
void **p;
|
void **p;
|
||||||
} l[] = {
|
} l[] = {
|
||||||
|
@ -32,8 +30,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
|
||||||
};
|
};
|
||||||
|
|
||||||
resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
|
resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
|
||||||
if (!resp->oh.handle)
|
if (!resp->oh.handle) {
|
||||||
{
|
|
||||||
char *msg = LOAD_ERR();
|
char *msg = LOAD_ERR();
|
||||||
snprintf(buf, buflen,
|
snprintf(buf, buflen,
|
||||||
"Unable to load %s library to query for Intel GPUs: %s\n",
|
"Unable to load %s library to query for Intel GPUs: %s\n",
|
||||||
|
@ -48,14 +45,12 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
|
||||||
"wiring Level-Zero management library functions in %s\n",
|
"wiring Level-Zero management library functions in %s\n",
|
||||||
oneapi_lib_path);
|
oneapi_lib_path);
|
||||||
|
|
||||||
for (i = 0; l[i].s != NULL; i++)
|
for (i = 0; l[i].s != NULL; i++) {
|
||||||
{
|
|
||||||
// TODO once we've squashed the remaining corner cases remove this log
|
// TODO once we've squashed the remaining corner cases remove this log
|
||||||
LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
|
LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
|
||||||
|
|
||||||
*l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
|
*l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
|
||||||
if (!l[i].p)
|
if (!l[i].p) {
|
||||||
{
|
|
||||||
resp->oh.handle = NULL;
|
resp->oh.handle = NULL;
|
||||||
char *msg = LOAD_ERR();
|
char *msg = LOAD_ERR();
|
||||||
LOG(resp->oh.verbose, "dlerr: %s\n", msg);
|
LOG(resp->oh.verbose, "dlerr: %s\n", msg);
|
||||||
|
@ -68,8 +63,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = (*resp->oh.zesInit)(0);
|
ret = (*resp->oh.zesInit)(0);
|
||||||
if (ret != ZE_RESULT_SUCCESS)
|
if (ret != ZE_RESULT_SUCCESS) {
|
||||||
{
|
|
||||||
LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
|
LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
|
||||||
snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
|
snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
|
||||||
resp->err = strdup(buf);
|
resp->err = strdup(buf);
|
||||||
|
@ -79,8 +73,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
|
||||||
|
|
||||||
count = 0;
|
count = 0;
|
||||||
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
|
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
|
||||||
if (ret != ZE_RESULT_SUCCESS)
|
if (ret != ZE_RESULT_SUCCESS) {
|
||||||
{
|
|
||||||
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
|
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
|
||||||
snprintf(buf, buflen, "unable to get driver count: %x", ret);
|
snprintf(buf, buflen, "unable to get driver count: %x", ret);
|
||||||
resp->err = strdup(buf);
|
resp->err = strdup(buf);
|
||||||
|
@ -91,10 +84,10 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
|
||||||
resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
|
resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
|
||||||
resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
|
resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
|
||||||
memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
|
memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
|
||||||
resp->oh.devices = malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t*));
|
resp->oh.devices =
|
||||||
|
malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
|
||||||
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
|
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
|
||||||
if (ret != ZE_RESULT_SUCCESS)
|
if (ret != ZE_RESULT_SUCCESS) {
|
||||||
{
|
|
||||||
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
|
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
|
||||||
snprintf(buf, buflen, "unable to get driver count: %x", ret);
|
snprintf(buf, buflen, "unable to get driver count: %x", ret);
|
||||||
resp->err = strdup(buf);
|
resp->err = strdup(buf);
|
||||||
|
@ -103,19 +96,20 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
|
||||||
}
|
}
|
||||||
|
|
||||||
for (d = 0; d < resp->oh.num_drivers; d++) {
|
for (d = 0; d < resp->oh.num_drivers; d++) {
|
||||||
ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d], &resp->oh.num_devices[d], NULL);
|
ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
|
||||||
if (ret != ZE_RESULT_SUCCESS)
|
&resp->oh.num_devices[d], NULL);
|
||||||
{
|
if (ret != ZE_RESULT_SUCCESS) {
|
||||||
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
|
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
|
||||||
snprintf(buf, buflen, "unable to get device count: %x", ret);
|
snprintf(buf, buflen, "unable to get device count: %x", ret);
|
||||||
resp->err = strdup(buf);
|
resp->err = strdup(buf);
|
||||||
oneapi_release(resp->oh);
|
oneapi_release(resp->oh);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
resp->oh.devices[d] = malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
|
resp->oh.devices[d] =
|
||||||
ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
|
malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
|
||||||
if (ret != ZE_RESULT_SUCCESS)
|
ret = (*resp->oh.zesDeviceGet)(
|
||||||
{
|
resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
|
||||||
|
if (ret != ZE_RESULT_SUCCESS) {
|
||||||
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
|
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
|
||||||
snprintf(buf, buflen, "unable to get device count: %x", ret);
|
snprintf(buf, buflen, "unable to get device count: %x", ret);
|
||||||
resp->err = strdup(buf);
|
resp->err = strdup(buf);
|
||||||
|
@ -128,8 +122,8 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *resp)
|
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
|
||||||
{
|
mem_info_t *resp) {
|
||||||
ze_result_t ret;
|
ze_result_t ret;
|
||||||
resp->err = NULL;
|
resp->err = NULL;
|
||||||
uint64_t totalMem = 0;
|
uint64_t totalMem = 0;
|
||||||
|
@ -138,12 +132,11 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
|
||||||
char buf[buflen + 1];
|
char buf[buflen + 1];
|
||||||
int i, d, m;
|
int i, d, m;
|
||||||
|
|
||||||
if (h.handle == NULL)
|
if (h.handle == NULL) {
|
||||||
{
|
|
||||||
resp->err = strdup("Level-Zero handle not initialized");
|
resp->err = strdup("Level-Zero handle not initialized");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (driver > h.num_drivers || device > h.num_devices[driver]) {
|
if (driver > h.num_drivers || device > h.num_devices[driver]) {
|
||||||
resp->err = strdup("driver of device index out of bounds");
|
resp->err = strdup("driver of device index out of bounds");
|
||||||
return;
|
return;
|
||||||
|
@ -161,8 +154,7 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
|
||||||
props.pNext = &ext_props;
|
props.pNext = &ext_props;
|
||||||
|
|
||||||
ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
|
ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
|
||||||
if (ret != ZE_RESULT_SUCCESS)
|
if (ret != ZE_RESULT_SUCCESS) {
|
||||||
{
|
|
||||||
snprintf(buf, buflen, "unable to get device properties: %d", ret);
|
snprintf(buf, buflen, "unable to get device properties: %d", ret);
|
||||||
resp->err = strdup(buf);
|
resp->err = strdup(buf);
|
||||||
return;
|
return;
|
||||||
|
@ -175,8 +167,7 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
|
||||||
// TODO - the driver isn't included - what if there are multiple drivers?
|
// TODO - the driver isn't included - what if there are multiple drivers?
|
||||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
|
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
|
||||||
|
|
||||||
if (h.verbose)
|
if (h.verbose) {
|
||||||
{
|
|
||||||
// When in verbose mode, report more information about
|
// When in verbose mode, report more information about
|
||||||
// the card we discover.
|
// the card we discover.
|
||||||
LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
|
LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
|
||||||
|
@ -195,11 +186,11 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
|
||||||
// Compute Capability equivalent in resp->major, resp->minor, resp->patch
|
// Compute Capability equivalent in resp->major, resp->minor, resp->patch
|
||||||
|
|
||||||
uint32_t memCount = 0;
|
uint32_t memCount = 0;
|
||||||
ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, NULL);
|
ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
|
||||||
if (ret != ZE_RESULT_SUCCESS)
|
NULL);
|
||||||
{
|
if (ret != ZE_RESULT_SUCCESS) {
|
||||||
snprintf(buf, buflen,
|
snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
|
||||||
"unable to enumerate Level-Zero memory modules: %x", ret);
|
ret);
|
||||||
resp->err = strdup(buf);
|
resp->err = strdup(buf);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -209,14 +200,12 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
|
||||||
zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
|
zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
|
||||||
(*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
|
(*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
|
||||||
|
|
||||||
for (m = 0; m < memCount; m++)
|
for (m = 0; m < memCount; m++) {
|
||||||
{
|
|
||||||
zes_mem_state_t state;
|
zes_mem_state_t state;
|
||||||
state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
|
state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
|
||||||
state.pNext = NULL;
|
state.pNext = NULL;
|
||||||
ret = (*h.zesMemoryGetState)(mems[m], &state);
|
ret = (*h.zesMemoryGetState)(mems[m], &state);
|
||||||
if (ret != ZE_RESULT_SUCCESS)
|
if (ret != ZE_RESULT_SUCCESS) {
|
||||||
{
|
|
||||||
snprintf(buf, buflen, "unable to get memory state: %x", ret);
|
snprintf(buf, buflen, "unable to get memory state: %x", ret);
|
||||||
resp->err = strdup(buf);
|
resp->err = strdup(buf);
|
||||||
free(mems);
|
free(mems);
|
||||||
|
@ -230,29 +219,23 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
|
||||||
free(mems);
|
free(mems);
|
||||||
}
|
}
|
||||||
|
|
||||||
void oneapi_release(oneapi_handle_t h)
|
void oneapi_release(oneapi_handle_t h) {
|
||||||
{
|
|
||||||
int d;
|
int d;
|
||||||
LOG(h.verbose, "releasing oneapi library\n");
|
LOG(h.verbose, "releasing oneapi library\n");
|
||||||
for (d = 0; d < h.num_drivers; d++)
|
for (d = 0; d < h.num_drivers; d++) {
|
||||||
{
|
if (h.devices != NULL && h.devices[d] != NULL) {
|
||||||
if (h.devices != NULL && h.devices[d] != NULL)
|
|
||||||
{
|
|
||||||
free(h.devices[d]);
|
free(h.devices[d]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (h.devices != NULL)
|
if (h.devices != NULL) {
|
||||||
{
|
|
||||||
free(h.devices);
|
free(h.devices);
|
||||||
h.devices = NULL;
|
h.devices = NULL;
|
||||||
}
|
}
|
||||||
if (h.num_devices != NULL)
|
if (h.num_devices != NULL) {
|
||||||
{
|
|
||||||
free(h.num_devices);
|
free(h.num_devices);
|
||||||
h.num_devices = NULL;
|
h.num_devices = NULL;
|
||||||
}
|
}
|
||||||
if (h.drivers != NULL)
|
if (h.drivers != NULL) {
|
||||||
{
|
|
||||||
free(h.drivers);
|
free(h.drivers);
|
||||||
h.drivers = NULL;
|
h.drivers = NULL;
|
||||||
}
|
}
|
||||||
|
@ -261,14 +244,11 @@ void oneapi_release(oneapi_handle_t h)
|
||||||
h.handle = NULL;
|
h.handle = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
int oneapi_get_device_count(oneapi_handle_t h, int driver)
|
int oneapi_get_device_count(oneapi_handle_t h, int driver) {
|
||||||
{
|
if (h.handle == NULL || h.num_devices == NULL) {
|
||||||
if (h.handle == NULL || h.num_devices == NULL)
|
|
||||||
{
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
if (driver > h.num_drivers)
|
if (driver > h.num_drivers) {
|
||||||
{
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
return (int)h.num_devices[driver];
|
return (int)h.num_devices[driver];
|
||||||
|
|
|
@ -9,8 +9,7 @@
|
||||||
#define ZE_BIT(_i) (1 << _i)
|
#define ZE_BIT(_i) (1 << _i)
|
||||||
|
|
||||||
// Just enough typedef's to dlopen/dlsym for memory information
|
// Just enough typedef's to dlopen/dlsym for memory information
|
||||||
typedef enum ze_result_t
|
typedef enum ze_result_t {
|
||||||
{
|
|
||||||
ZE_RESULT_SUCCESS = 0,
|
ZE_RESULT_SUCCESS = 0,
|
||||||
// Other values omitted for now...
|
// Other values omitted for now...
|
||||||
} ze_result_t;
|
} ze_result_t;
|
||||||
|
@ -20,13 +19,11 @@ typedef struct _zes_driver_handle_t *zes_driver_handle_t;
|
||||||
typedef struct _zes_device_handle_t *zes_device_handle_t;
|
typedef struct _zes_device_handle_t *zes_device_handle_t;
|
||||||
typedef struct _zes_mem_handle_t *zes_mem_handle_t;
|
typedef struct _zes_mem_handle_t *zes_mem_handle_t;
|
||||||
|
|
||||||
typedef enum _ze_structure_type_t
|
typedef enum _ze_structure_type_t {
|
||||||
{
|
|
||||||
ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
|
ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
|
||||||
} ze_structure_type_t;
|
} ze_structure_type_t;
|
||||||
|
|
||||||
typedef enum _zes_structure_type_t
|
typedef enum _zes_structure_type_t {
|
||||||
{
|
|
||||||
ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
|
ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
|
||||||
ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
|
ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
|
||||||
ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
|
ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
|
||||||
|
@ -34,35 +31,29 @@ typedef enum _zes_structure_type_t
|
||||||
ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
|
ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
|
||||||
} zes_structure_type_t;
|
} zes_structure_type_t;
|
||||||
|
|
||||||
typedef enum _zes_mem_type_t
|
typedef enum _zes_mem_type_t {
|
||||||
{
|
|
||||||
ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
|
ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
|
||||||
} zes_mem_type_t;
|
} zes_mem_type_t;
|
||||||
|
|
||||||
typedef enum _zes_mem_loc_t
|
typedef enum _zes_mem_loc_t {
|
||||||
{
|
|
||||||
ZES_MEM_LOC_SYSTEM = 0,
|
ZES_MEM_LOC_SYSTEM = 0,
|
||||||
ZES_MEM_LOC_DEVICE = 1,
|
ZES_MEM_LOC_DEVICE = 1,
|
||||||
ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
|
ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
|
||||||
} zes_mem_loc_t;
|
} zes_mem_loc_t;
|
||||||
|
|
||||||
typedef enum _zes_mem_health_t
|
typedef enum _zes_mem_health_t {
|
||||||
{
|
|
||||||
ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
|
ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
|
||||||
} zes_mem_health_t;
|
} zes_mem_health_t;
|
||||||
|
|
||||||
typedef struct _ze_device_uuid_t
|
typedef struct _ze_device_uuid_t {
|
||||||
{
|
|
||||||
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
|
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
|
||||||
} ze_device_uuid_t;
|
} ze_device_uuid_t;
|
||||||
|
|
||||||
typedef struct _zes_uuid_t
|
typedef struct _zes_uuid_t {
|
||||||
{
|
|
||||||
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
|
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
|
||||||
} zes_uuid_t;
|
} zes_uuid_t;
|
||||||
|
|
||||||
typedef enum _ze_device_type_t
|
typedef enum _ze_device_type_t {
|
||||||
{
|
|
||||||
ZE_DEVICE_TYPE_GPU = 1,
|
ZE_DEVICE_TYPE_GPU = 1,
|
||||||
ZE_DEVICE_TYPE_CPU = 2,
|
ZE_DEVICE_TYPE_CPU = 2,
|
||||||
ZE_DEVICE_TYPE_FPGA = 3,
|
ZE_DEVICE_TYPE_FPGA = 3,
|
||||||
|
@ -71,8 +62,7 @@ typedef enum _ze_device_type_t
|
||||||
ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
|
ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
|
||||||
} ze_device_type_t;
|
} ze_device_type_t;
|
||||||
|
|
||||||
typedef enum _zes_device_type_t
|
typedef enum _zes_device_type_t {
|
||||||
{
|
|
||||||
ZES_DEVICE_TYPE_GPU = 1,
|
ZES_DEVICE_TYPE_GPU = 1,
|
||||||
ZES_DEVICE_TYPE_CPU = 2,
|
ZES_DEVICE_TYPE_CPU = 2,
|
||||||
ZES_DEVICE_TYPE_FPGA = 3,
|
ZES_DEVICE_TYPE_FPGA = 3,
|
||||||
|
@ -82,8 +72,7 @@ typedef enum _zes_device_type_t
|
||||||
} zes_device_type_t;
|
} zes_device_type_t;
|
||||||
|
|
||||||
typedef uint32_t ze_device_property_flags_t;
|
typedef uint32_t ze_device_property_flags_t;
|
||||||
typedef enum _ze_device_property_flag_t
|
typedef enum _ze_device_property_flag_t {
|
||||||
{
|
|
||||||
ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
|
ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
|
||||||
ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
|
ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
|
||||||
ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
|
ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
|
||||||
|
@ -92,8 +81,7 @@ typedef enum _ze_device_property_flag_t
|
||||||
} ze_device_property_flag_t;
|
} ze_device_property_flag_t;
|
||||||
|
|
||||||
typedef uint32_t zes_device_property_flags_t;
|
typedef uint32_t zes_device_property_flags_t;
|
||||||
typedef enum _zes_device_property_flag_t
|
typedef enum _zes_device_property_flag_t {
|
||||||
{
|
|
||||||
ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
|
ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
|
||||||
ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
|
ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
|
||||||
ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
|
ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
|
||||||
|
@ -101,8 +89,7 @@ typedef enum _zes_device_property_flag_t
|
||||||
ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
|
ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
|
||||||
} zes_device_property_flag_t;
|
} zes_device_property_flag_t;
|
||||||
|
|
||||||
typedef struct _ze_device_properties_t
|
typedef struct _ze_device_properties_t {
|
||||||
{
|
|
||||||
ze_structure_type_t stype;
|
ze_structure_type_t stype;
|
||||||
void *pNext;
|
void *pNext;
|
||||||
ze_device_type_t type;
|
ze_device_type_t type;
|
||||||
|
@ -126,8 +113,7 @@ typedef struct _ze_device_properties_t
|
||||||
char name[ZE_MAX_DEVICE_NAME];
|
char name[ZE_MAX_DEVICE_NAME];
|
||||||
} ze_device_properties_t;
|
} ze_device_properties_t;
|
||||||
|
|
||||||
typedef struct _zes_device_properties_t
|
typedef struct _zes_device_properties_t {
|
||||||
{
|
|
||||||
zes_structure_type_t stype;
|
zes_structure_type_t stype;
|
||||||
void *pNext;
|
void *pNext;
|
||||||
ze_device_properties_t core;
|
ze_device_properties_t core;
|
||||||
|
@ -140,8 +126,7 @@ typedef struct _zes_device_properties_t
|
||||||
char driverVersion[ZES_STRING_PROPERTY_SIZE];
|
char driverVersion[ZES_STRING_PROPERTY_SIZE];
|
||||||
} zes_device_properties_t;
|
} zes_device_properties_t;
|
||||||
|
|
||||||
typedef struct _zes_device_ext_properties_t
|
typedef struct _zes_device_ext_properties_t {
|
||||||
{
|
|
||||||
zes_structure_type_t stype;
|
zes_structure_type_t stype;
|
||||||
void *pNext;
|
void *pNext;
|
||||||
zes_uuid_t uuid;
|
zes_uuid_t uuid;
|
||||||
|
@ -149,8 +134,7 @@ typedef struct _zes_device_ext_properties_t
|
||||||
zes_device_property_flags_t flags;
|
zes_device_property_flags_t flags;
|
||||||
} zes_device_ext_properties_t;
|
} zes_device_ext_properties_t;
|
||||||
|
|
||||||
typedef struct _zes_mem_properties_t
|
typedef struct _zes_mem_properties_t {
|
||||||
{
|
|
||||||
zes_structure_type_t stype;
|
zes_structure_type_t stype;
|
||||||
void *pNext;
|
void *pNext;
|
||||||
zes_mem_type_t type;
|
zes_mem_type_t type;
|
||||||
|
@ -162,8 +146,7 @@ typedef struct _zes_mem_properties_t
|
||||||
int32_t numChannels;
|
int32_t numChannels;
|
||||||
} zes_mem_properties_t;
|
} zes_mem_properties_t;
|
||||||
|
|
||||||
typedef struct _zes_mem_state_t
|
typedef struct _zes_mem_state_t {
|
||||||
{
|
|
||||||
zes_structure_type_t stype;
|
zes_structure_type_t stype;
|
||||||
const void *pNext;
|
const void *pNext;
|
||||||
zes_mem_health_t health;
|
zes_mem_health_t health;
|
||||||
|
@ -171,15 +154,14 @@ typedef struct _zes_mem_state_t
|
||||||
uint64_t size;
|
uint64_t size;
|
||||||
} zes_mem_state_t;
|
} zes_mem_state_t;
|
||||||
|
|
||||||
typedef struct oneapi_handle
|
typedef struct oneapi_handle {
|
||||||
{
|
|
||||||
void *handle;
|
void *handle;
|
||||||
uint16_t verbose;
|
uint16_t verbose;
|
||||||
|
|
||||||
uint32_t num_drivers;
|
uint32_t num_drivers;
|
||||||
zes_driver_handle_t *drivers;
|
zes_driver_handle_t *drivers;
|
||||||
uint32_t *num_devices;
|
uint32_t *num_devices;
|
||||||
zes_device_handle_t **devices;
|
zes_device_handle_t **devices;
|
||||||
|
|
||||||
// TODO Driver major, minor information
|
// TODO Driver major, minor information
|
||||||
// int driver_major;
|
// int driver_major;
|
||||||
|
@ -201,20 +183,19 @@ typedef struct oneapi_handle
|
||||||
|
|
||||||
} oneapi_handle_t;
|
} oneapi_handle_t;
|
||||||
|
|
||||||
typedef struct oneapi_init_resp
|
typedef struct oneapi_init_resp {
|
||||||
{
|
|
||||||
char *err; // If err is non-null handle is invalid
|
char *err; // If err is non-null handle is invalid
|
||||||
oneapi_handle_t oh;
|
oneapi_handle_t oh;
|
||||||
} oneapi_init_resp_t;
|
} oneapi_init_resp_t;
|
||||||
|
|
||||||
typedef struct oneapi_version_resp
|
typedef struct oneapi_version_resp {
|
||||||
{
|
|
||||||
ze_result_t status;
|
ze_result_t status;
|
||||||
char *str; // Contains version or error string if status != 0
|
char *str; // Contains version or error string if status != 0
|
||||||
} oneapi_version_resp_t;
|
} oneapi_version_resp_t;
|
||||||
|
|
||||||
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
|
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
|
||||||
void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *resp);
|
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
|
||||||
|
mem_info_t *resp);
|
||||||
void oneapi_release(oneapi_handle_t h);
|
void oneapi_release(oneapi_handle_t h);
|
||||||
int oneapi_get_device_count(oneapi_handle_t h, int driver);
|
int oneapi_get_device_count(oneapi_handle_t h, int driver);
|
||||||
|
|
||||||
|
|
89
gpu/gpu_linux.go
Normal file
89
gpu/gpu_linux.go
Normal file
|
@ -0,0 +1,89 @@
|
||||||
|
package gpu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/ollama/ollama/format"
|
||||||
|
)
|
||||||
|
|
||||||
|
var CudartGlobs = []string{
|
||||||
|
"/usr/local/cuda/lib64/libcudart.so*",
|
||||||
|
"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
|
||||||
|
"/usr/lib/x86_64-linux-gnu/libcudart.so*",
|
||||||
|
"/usr/lib/wsl/lib/libcudart.so*",
|
||||||
|
"/usr/lib/wsl/drivers/*/libcudart.so*",
|
||||||
|
"/opt/cuda/lib64/libcudart.so*",
|
||||||
|
"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
|
||||||
|
"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
|
||||||
|
"/usr/lib/aarch64-linux-gnu/libcudart.so*",
|
||||||
|
"/usr/local/cuda/lib*/libcudart.so*",
|
||||||
|
"/usr/lib*/libcudart.so*",
|
||||||
|
"/usr/local/lib*/libcudart.so*",
|
||||||
|
}
|
||||||
|
|
||||||
|
var NvmlGlobs = []string{}
|
||||||
|
|
||||||
|
var NvcudaGlobs = []string{
|
||||||
|
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
|
||||||
|
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
|
||||||
|
"/usr/lib/*-linux-gnu/libcuda.so*",
|
||||||
|
"/usr/lib/wsl/lib/libcuda.so*",
|
||||||
|
"/usr/lib/wsl/drivers/*/libcuda.so*",
|
||||||
|
"/opt/cuda/lib*/libcuda.so*",
|
||||||
|
"/usr/local/cuda/lib*/libcuda.so*",
|
||||||
|
"/usr/lib*/libcuda.so*",
|
||||||
|
"/usr/local/lib*/libcuda.so*",
|
||||||
|
}
|
||||||
|
|
||||||
|
var OneapiGlobs = []string{
|
||||||
|
"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
|
||||||
|
"/usr/lib*/libze_intel_gpu.so*",
|
||||||
|
}
|
||||||
|
|
||||||
|
var CudartMgmtName = "libcudart.so*"
|
||||||
|
var NvcudaMgmtName = "libcuda.so*"
|
||||||
|
var NvmlMgmtName = "" // not currently wired on linux
|
||||||
|
var OneapiMgmtName = "libze_intel_gpu.so"
|
||||||
|
|
||||||
|
func GetCPUMem() (memInfo, error) {
|
||||||
|
var mem memInfo
|
||||||
|
var total, available, free, buffers, cached uint64
|
||||||
|
f, err := os.Open("/proc/meminfo")
|
||||||
|
if err != nil {
|
||||||
|
return mem, err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
s := bufio.NewScanner(f)
|
||||||
|
for s.Scan() {
|
||||||
|
line := s.Text()
|
||||||
|
switch {
|
||||||
|
case strings.HasPrefix(line, "MemTotal:"):
|
||||||
|
_, err = fmt.Sscanf(line, "MemTotal:%d", &total)
|
||||||
|
case strings.HasPrefix(line, "MemAvailable:"):
|
||||||
|
_, err = fmt.Sscanf(line, "MemAvailable:%d", &available)
|
||||||
|
case strings.HasPrefix(line, "MemFree:"):
|
||||||
|
_, err = fmt.Sscanf(line, "MemFree:%d", &free)
|
||||||
|
case strings.HasPrefix(line, "Buffers:"):
|
||||||
|
_, err = fmt.Sscanf(line, "Buffers:%d", &buffers)
|
||||||
|
case strings.HasPrefix(line, "Cached:"):
|
||||||
|
_, err = fmt.Sscanf(line, "Cached:%d", &cached)
|
||||||
|
default:
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return mem, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if total > 0 && available > 0 {
|
||||||
|
mem.TotalMemory = total * format.KibiByte
|
||||||
|
mem.FreeMemory = available * format.KibiByte
|
||||||
|
return mem, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mem.TotalMemory = total * format.KibiByte
|
||||||
|
mem.FreeMemory = (free + buffers + cached) * format.KibiByte
|
||||||
|
return mem, nil
|
||||||
|
}
|
55
gpu/gpu_windows.go
Normal file
55
gpu/gpu_windows.go
Normal file
|
@ -0,0 +1,55 @@
|
||||||
|
package gpu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"syscall"
|
||||||
|
"unsafe"
|
||||||
|
)
|
||||||
|
|
||||||
|
type MEMORYSTATUSEX struct {
|
||||||
|
length uint32
|
||||||
|
MemoryLoad uint32
|
||||||
|
TotalPhys uint64
|
||||||
|
AvailPhys uint64
|
||||||
|
TotalPageFile uint64
|
||||||
|
AvailPageFile uint64
|
||||||
|
TotalVirtual uint64
|
||||||
|
AvailVirtual uint64
|
||||||
|
AvailExtendedVirtual uint64
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
k32 = syscall.NewLazyDLL("kernel32.dll")
|
||||||
|
globalMemoryStatusExProc = k32.NewProc("GlobalMemoryStatusEx")
|
||||||
|
sizeofMemoryStatusEx = uint32(unsafe.Sizeof(MEMORYSTATUSEX{}))
|
||||||
|
)
|
||||||
|
|
||||||
|
var CudartGlobs = []string{
|
||||||
|
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
|
||||||
|
}
|
||||||
|
|
||||||
|
var NvmlGlobs = []string{
|
||||||
|
"c:\\Windows\\System32\\nvml.dll",
|
||||||
|
}
|
||||||
|
|
||||||
|
var NvcudaGlobs = []string{
|
||||||
|
"c:\\windows\\system*\\nvcuda.dll",
|
||||||
|
}
|
||||||
|
|
||||||
|
var OneapiGlobs = []string{
|
||||||
|
"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
|
||||||
|
}
|
||||||
|
|
||||||
|
var CudartMgmtName = "cudart64_*.dll"
|
||||||
|
var NvcudaMgmtName = "nvcuda.dll"
|
||||||
|
var NvmlMgmtName = "nvml.dll"
|
||||||
|
var OneapiMgmtName = "ze_intel_gpu64.dll"
|
||||||
|
|
||||||
|
func GetCPUMem() (memInfo, error) {
|
||||||
|
memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
|
||||||
|
r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
|
||||||
|
if r1 == 0 {
|
||||||
|
return memInfo{}, fmt.Errorf("GlobalMemoryStatusEx failed: %w", err)
|
||||||
|
}
|
||||||
|
return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys}, nil
|
||||||
|
}
|
33
gpu/types.go
33
gpu/types.go
|
@ -18,7 +18,7 @@ type GpuInfo struct {
|
||||||
Library string `json:"library,omitempty"`
|
Library string `json:"library,omitempty"`
|
||||||
|
|
||||||
// Optional variant to select (e.g. versions, cpu feature flags)
|
// Optional variant to select (e.g. versions, cpu feature flags)
|
||||||
Variant string `json:"variant,omitempty"`
|
Variant CPUCapability `json:"variant"`
|
||||||
|
|
||||||
// MinimumMemory represents the minimum memory required to use the GPU
|
// MinimumMemory represents the minimum memory required to use the GPU
|
||||||
MinimumMemory uint64 `json:"-"`
|
MinimumMemory uint64 `json:"-"`
|
||||||
|
@ -44,21 +44,21 @@ type CPUInfo struct {
|
||||||
|
|
||||||
type CudaGPUInfo struct {
|
type CudaGPUInfo struct {
|
||||||
GpuInfo
|
GpuInfo
|
||||||
index int // nolint: unused
|
index int //nolint:unused,nolintlint
|
||||||
}
|
}
|
||||||
type CudaGPUInfoList []CudaGPUInfo
|
type CudaGPUInfoList []CudaGPUInfo
|
||||||
|
|
||||||
type RocmGPUInfo struct {
|
type RocmGPUInfo struct {
|
||||||
GpuInfo
|
GpuInfo
|
||||||
usedFilepath string // nolint: unused
|
usedFilepath string //nolint:unused,nolintlint
|
||||||
index int // nolint: unused
|
index int //nolint:unused,nolintlint
|
||||||
}
|
}
|
||||||
type RocmGPUInfoList []RocmGPUInfo
|
type RocmGPUInfoList []RocmGPUInfo
|
||||||
|
|
||||||
type OneapiGPUInfo struct {
|
type OneapiGPUInfo struct {
|
||||||
GpuInfo
|
GpuInfo
|
||||||
driverIndex int // nolint: unused
|
driverIndex int //nolint:unused,nolintlint
|
||||||
gpuIndex int // nolint: unused
|
gpuIndex int //nolint:unused,nolintlint
|
||||||
}
|
}
|
||||||
type OneapiGPUInfoList []OneapiGPUInfo
|
type OneapiGPUInfoList []OneapiGPUInfo
|
||||||
|
|
||||||
|
@ -71,8 +71,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
|
||||||
for _, info := range l {
|
for _, info := range l {
|
||||||
found := false
|
found := false
|
||||||
requested := info.Library
|
requested := info.Library
|
||||||
if info.Variant != "" {
|
if info.Variant != CPUCapabilityNone {
|
||||||
requested += "_" + info.Variant
|
requested += "_" + info.Variant.String()
|
||||||
}
|
}
|
||||||
for i, lib := range libs {
|
for i, lib := range libs {
|
||||||
if lib == requested {
|
if lib == requested {
|
||||||
|
@ -117,30 +117,19 @@ type CPUCapability uint32
|
||||||
var GPURunnerCPUCapability = CPUCapabilityAVX
|
var GPURunnerCPUCapability = CPUCapabilityAVX
|
||||||
|
|
||||||
const (
|
const (
|
||||||
CPUCapabilityBase CPUCapability = iota
|
CPUCapabilityNone CPUCapability = iota
|
||||||
CPUCapabilityAVX
|
CPUCapabilityAVX
|
||||||
CPUCapabilityAVX2
|
CPUCapabilityAVX2
|
||||||
// TODO AVX512
|
// TODO AVX512
|
||||||
)
|
)
|
||||||
|
|
||||||
func (c CPUCapability) ToString() string {
|
func (c CPUCapability) String() string {
|
||||||
switch c {
|
|
||||||
case CPUCapabilityAVX:
|
|
||||||
return "AVX"
|
|
||||||
case CPUCapabilityAVX2:
|
|
||||||
return "AVX2"
|
|
||||||
default:
|
|
||||||
return "no vector extensions"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c CPUCapability) ToVariant() string {
|
|
||||||
switch c {
|
switch c {
|
||||||
case CPUCapabilityAVX:
|
case CPUCapabilityAVX:
|
||||||
return "avx"
|
return "avx"
|
||||||
case CPUCapabilityAVX2:
|
case CPUCapabilityAVX2:
|
||||||
return "avx2"
|
return "avx2"
|
||||||
default:
|
default:
|
||||||
return ""
|
return "no vector extensions"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,7 +11,8 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestContextExhaustion(t *testing.T) {
|
func TestContextExhaustion(t *testing.T) {
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute) // Longer needed for small footprint GPUs
|
// Longer needed for small footprint GPUs
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
// Set up the test data
|
// Set up the test data
|
||||||
req := api.GenerateRequest{
|
req := api.GenerateRequest{
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
package llm
|
package llm
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
@ -69,13 +68,9 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
|
|
||||||
// Conditional output size on GPU 0
|
// Conditional output size on GPU 0
|
||||||
var memoryLayerOutput uint64
|
var memoryLayerOutput uint64
|
||||||
var includeOutput bool
|
|
||||||
|
|
||||||
// One extra layer as a pad for each GPU
|
// The sizes of a layer
|
||||||
var layerBuffer uint64
|
var layerSize uint64
|
||||||
|
|
||||||
// The sizes of the main layers
|
|
||||||
var layerSizes []uint64
|
|
||||||
|
|
||||||
// The sum of all the layer sizes (just for logging)
|
// The sum of all the layer sizes (just for logging)
|
||||||
var memoryWeights uint64
|
var memoryWeights uint64
|
||||||
|
@ -102,12 +97,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
layers := ggml.Tensors().Layers()
|
layers := ggml.Tensors().Layers()
|
||||||
// add one layer worth of memory as a buffer
|
// add one layer worth of memory as a buffer
|
||||||
if blk0, ok := layers["blk.0"]; ok {
|
if blk0, ok := layers["blk.0"]; ok {
|
||||||
layerBuffer = blk0.size()
|
layerSize = blk0.size()
|
||||||
|
} else {
|
||||||
|
slog.Warn("model missing blk.0 layer size")
|
||||||
}
|
}
|
||||||
|
|
||||||
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
|
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
|
||||||
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
|
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
|
||||||
|
|
||||||
|
// KV is proportional to the number of layers
|
||||||
|
layerSize += kv / ggml.KV().BlockCount()
|
||||||
|
|
||||||
graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
|
graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
|
||||||
if graphPartialOffload == 0 {
|
if graphPartialOffload == 0 {
|
||||||
graphPartialOffload = ggml.KV().GQA() * kv / 6
|
graphPartialOffload = ggml.KV().GQA() * kv / 6
|
||||||
|
@ -119,6 +119,9 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
// on metal there's no partial offload overhead
|
// on metal there's no partial offload overhead
|
||||||
if gpus[0].Library == "metal" {
|
if gpus[0].Library == "metal" {
|
||||||
graphPartialOffload = graphFullOffload
|
graphPartialOffload = graphFullOffload
|
||||||
|
} else if len(gpus) > 1 {
|
||||||
|
// multigpu should always use the partial graph size
|
||||||
|
graphFullOffload = graphPartialOffload
|
||||||
}
|
}
|
||||||
|
|
||||||
if layer, ok := layers["output_norm"]; ok {
|
if layer, ok := layers["output_norm"]; ok {
|
||||||
|
@ -130,16 +133,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
memoryLayerOutput += layer.size()
|
memoryLayerOutput += layer.size()
|
||||||
}
|
}
|
||||||
|
|
||||||
if gpus[0].Library == "metal" && opts.UseMMap {
|
// Output layer handled at the end if we have space
|
||||||
includeOutput = true
|
|
||||||
} else if gpus[0].Library != "metal" || !opts.UseMMap {
|
|
||||||
includeOutput = true
|
|
||||||
}
|
|
||||||
|
|
||||||
gpuZeroOverhead := projectorSize
|
gpuZeroOverhead := projectorSize
|
||||||
if includeOutput {
|
|
||||||
gpuZeroOverhead += memoryLayerOutput
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
|
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
|
||||||
var layerCount int
|
var layerCount int
|
||||||
|
@ -156,12 +151,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
gzo = gpuZeroOverhead
|
gzo = gpuZeroOverhead
|
||||||
}
|
}
|
||||||
// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
|
// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
|
||||||
if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerBuffer {
|
if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
|
||||||
slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
|
slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
|
gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
|
||||||
gpuAllocations[i] += gpus[i].MinimumMemory + layerBuffer // We hold off on graph until we know partial vs. full
|
gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
|
||||||
}
|
}
|
||||||
|
|
||||||
var gpuZeroID int
|
var gpuZeroID int
|
||||||
|
@ -170,23 +165,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
gpuAllocations[gpuZeroID] += gpuZeroOverhead
|
gpuAllocations[gpuZeroID] += gpuZeroOverhead
|
||||||
}
|
}
|
||||||
|
|
||||||
layerSizes = make([]uint64, int(ggml.KV().BlockCount()))
|
|
||||||
for i := range int(ggml.KV().BlockCount()) {
|
|
||||||
if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
|
|
||||||
memoryLayer := blk.size()
|
|
||||||
|
|
||||||
// KV is proportional to the number of layers
|
|
||||||
memoryLayer += kv / ggml.KV().BlockCount()
|
|
||||||
layerSizes[i] = memoryLayer
|
|
||||||
memoryWeights += memoryLayer
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// For all the layers, find where they can fit on the GPU(s)
|
// For all the layers, find where they can fit on the GPU(s)
|
||||||
for i := range layerSizes {
|
for i := range int(ggml.KV().BlockCount()) {
|
||||||
if layerSizes[i] == 0 {
|
memoryWeights += layerSize
|
||||||
continue
|
|
||||||
}
|
|
||||||
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
|
if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
|
||||||
// Stop allocating on GPU(s) once we hit the users target NumGPU
|
// Stop allocating on GPU(s) once we hit the users target NumGPU
|
||||||
continue
|
continue
|
||||||
|
@ -196,8 +178,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
for j := len(gpusWithSpace); j > 0; j-- {
|
for j := len(gpusWithSpace); j > 0; j-- {
|
||||||
g := gpusWithSpace[i%j]
|
g := gpusWithSpace[i%j]
|
||||||
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
||||||
if g.g.FreeMemory > used+layerSizes[i] {
|
if g.g.FreeMemory > used+layerSize {
|
||||||
gpuAllocations[g.i] += layerSizes[i]
|
gpuAllocations[g.i] += layerSize
|
||||||
layerCounts[g.i]++
|
layerCounts[g.i]++
|
||||||
layerCount++
|
layerCount++
|
||||||
break
|
break
|
||||||
|
@ -205,17 +187,18 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
|
gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
if layerCount >= int(ggml.KV().BlockCount()) {
|
if layerCount >= int(ggml.KV().BlockCount()) {
|
||||||
fullyLoaded = true
|
fullyLoaded = true
|
||||||
} else {
|
} else {
|
||||||
for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
|
for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
|
||||||
overflow += layerSizes[i]
|
overflow += layerSize
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Find where the output fits
|
|
||||||
if includeOutput && memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
|
// Determine if we need to consider output then find where it fits
|
||||||
|
if ((gpus[0].Library == "metal" && opts.UseMMap) || (gpus[0].Library != "metal" || !opts.UseMMap)) &&
|
||||||
|
memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
|
||||||
for j := len(gpusWithSpace); j > 0; j-- {
|
for j := len(gpusWithSpace); j > 0; j-- {
|
||||||
g := gpusWithSpace[layerCount%j]
|
g := gpusWithSpace[layerCount%j]
|
||||||
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
||||||
|
@ -226,6 +209,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if layerCount < int(ggml.KV().BlockCount())+1 {
|
if layerCount < int(ggml.KV().BlockCount())+1 {
|
||||||
fullyLoaded = false
|
fullyLoaded = false
|
||||||
overflow += memoryLayerOutput
|
overflow += memoryLayerOutput
|
||||||
|
@ -253,7 +237,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
var memoryRequiredPartial, memoryRequiredTotal uint64
|
var memoryRequiredPartial, memoryRequiredTotal uint64
|
||||||
for i := range gpuAllocations {
|
for i := range gpuAllocations {
|
||||||
memoryRequiredPartial += gpuAllocations[i]
|
memoryRequiredPartial += gpuAllocations[i]
|
||||||
|
|
||||||
}
|
}
|
||||||
memoryRequiredTotal = memoryRequiredPartial + overflow
|
memoryRequiredTotal = memoryRequiredPartial + overflow
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,7 @@ func TestEstimateGPULayers(t *testing.T) {
|
||||||
envconfig.Debug = true
|
envconfig.Debug = true
|
||||||
modelName := "dummy"
|
modelName := "dummy"
|
||||||
f, err := os.CreateTemp(t.TempDir(), modelName)
|
f, err := os.CreateTemp(t.TempDir(), modelName)
|
||||||
assert.Nil(t, err)
|
require.NoError(t, err)
|
||||||
defer f.Close()
|
defer f.Close()
|
||||||
gguf := NewGGUFV3(binary.LittleEndian)
|
gguf := NewGGUFV3(binary.LittleEndian)
|
||||||
inputLayerCount := 5
|
inputLayerCount := 5
|
||||||
|
@ -30,7 +30,7 @@ func TestEstimateGPULayers(t *testing.T) {
|
||||||
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||||
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||||
}
|
}
|
||||||
assert.Equal(t, inputLayerCount+1, len(tensors))
|
assert.Len(t, tensors, inputLayerCount+1)
|
||||||
err = gguf.Encode(f, KV{
|
err = gguf.Encode(f, KV{
|
||||||
"general.architecture": "llama",
|
"general.architecture": "llama",
|
||||||
"general.name": "name",
|
"general.name": "name",
|
||||||
|
@ -56,9 +56,11 @@ func TestEstimateGPULayers(t *testing.T) {
|
||||||
}
|
}
|
||||||
projectors := []string{}
|
projectors := []string{}
|
||||||
opts := api.DefaultOptions()
|
opts := api.DefaultOptions()
|
||||||
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
|
t.Run("cpu", func(t *testing.T) {
|
||||||
assert.Equal(t, 0, estimate.Layers)
|
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||||
assert.Equal(t, uint64(0), estimate.Graph)
|
assert.Equal(t, 0, estimate.Layers)
|
||||||
|
assert.Equal(t, uint64(0), estimate.Graph)
|
||||||
|
})
|
||||||
|
|
||||||
// derived from the dummy ggml file above
|
// derived from the dummy ggml file above
|
||||||
graphPartialOffload := uint64(202377216)
|
graphPartialOffload := uint64(202377216)
|
||||||
|
@ -80,7 +82,10 @@ func TestEstimateGPULayers(t *testing.T) {
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
|
// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
|
||||||
for i, s := range [][]uint64{
|
for i, s := range []struct {
|
||||||
|
layer0, layer1 uint64
|
||||||
|
expect0, expect1 uint64
|
||||||
|
}{
|
||||||
{1, 1, 1, 1},
|
{1, 1, 1, 1},
|
||||||
{2, 1, 2, 1},
|
{2, 1, 2, 1},
|
||||||
{2, 2, 2, 2},
|
{2, 2, 2, 2},
|
||||||
|
@ -90,27 +95,33 @@ func TestEstimateGPULayers(t *testing.T) {
|
||||||
{6, 6, 3, 3},
|
{6, 6, 3, 3},
|
||||||
{0, 3, 0, 3},
|
{0, 3, 0, 3},
|
||||||
} {
|
} {
|
||||||
gpus[0].FreeMemory = 0
|
t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
|
||||||
gpus[1].FreeMemory = 0
|
gpus[0].FreeMemory = 0
|
||||||
gpus[0].FreeMemory += projectorSize + memoryLayerOutput
|
gpus[1].FreeMemory = 0
|
||||||
gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s[0]*layerSize + 1
|
gpus[0].FreeMemory += projectorSize
|
||||||
gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s[1]*layerSize + 1
|
if s.layer0 > 0 {
|
||||||
gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
|
gpus[0].FreeMemory += memoryLayerOutput
|
||||||
gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
|
} else {
|
||||||
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
|
gpus[1].FreeMemory += memoryLayerOutput
|
||||||
assert.Equal(t, int(s[2]+s[3]), estimate.Layers, "scenario %d: %v", i, s)
|
}
|
||||||
assert.Equal(t, fmt.Sprintf("%d,%d", s[2], s[3]), estimate.TensorSplit, "scenario %d: %v", i, s)
|
gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
|
||||||
var layerSums uint64
|
gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
|
||||||
for _, b := range estimate.GPUSizes {
|
gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
|
||||||
layerSums += b
|
gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
|
||||||
}
|
estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||||
if estimate.Layers < inputLayerCount+1 {
|
assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
|
||||||
assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
|
assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
|
||||||
assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
|
var layerSums uint64
|
||||||
} else {
|
for _, b := range estimate.GPUSizes {
|
||||||
assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
|
layerSums += b
|
||||||
assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
|
}
|
||||||
}
|
if estimate.Layers < inputLayerCount+1 {
|
||||||
|
assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
|
||||||
|
assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
|
||||||
|
} else {
|
||||||
|
assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
|
||||||
|
assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
|
||||||
// glob workDir for files that start with ollama_
|
// glob workDir for files that start with ollama_
|
||||||
availableServers := availableServers()
|
availableServers := availableServers()
|
||||||
requested := info.Library
|
requested := info.Library
|
||||||
if info.Variant != "" {
|
if info.Variant != gpu.CPUCapabilityNone {
|
||||||
requested += "_" + info.Variant
|
requested += "_" + info.Variant.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
servers := []string{}
|
servers := []string{}
|
||||||
|
@ -117,14 +117,14 @@ func serversForGpu(info gpu.GpuInfo) []string {
|
||||||
|
|
||||||
// Load up the best CPU variant if not primary requested
|
// Load up the best CPU variant if not primary requested
|
||||||
if info.Library != "cpu" {
|
if info.Library != "cpu" {
|
||||||
variant := gpu.GetCPUVariant()
|
variant := gpu.GetCPUCapability()
|
||||||
// If no variant, then we fall back to default
|
// If no variant, then we fall back to default
|
||||||
// If we have a variant, try that if we find an exact match
|
// If we have a variant, try that if we find an exact match
|
||||||
// Attempting to run the wrong CPU instructions will panic the
|
// Attempting to run the wrong CPU instructions will panic the
|
||||||
// process
|
// process
|
||||||
if variant != "" {
|
if variant != gpu.CPUCapabilityNone {
|
||||||
for cmp := range availableServers {
|
for cmp := range availableServers {
|
||||||
if cmp == "cpu_"+variant {
|
if cmp == "cpu_"+variant.String() {
|
||||||
servers = append(servers, cmp)
|
servers = append(servers, cmp)
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
@ -146,11 +146,11 @@ func serverForCpu() string {
|
||||||
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
|
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
|
||||||
return "metal"
|
return "metal"
|
||||||
}
|
}
|
||||||
variant := gpu.GetCPUVariant()
|
variant := gpu.GetCPUCapability()
|
||||||
availableServers := availableServers()
|
availableServers := availableServers()
|
||||||
if variant != "" {
|
if variant != gpu.CPUCapabilityNone {
|
||||||
for cmp := range availableServers {
|
for cmp := range availableServers {
|
||||||
if cmp == "cpu_"+variant {
|
if cmp == "cpu_"+variant.String() {
|
||||||
return cmp
|
return cmp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,7 +39,7 @@ type LlamaServer interface {
|
||||||
Close() error
|
Close() error
|
||||||
EstimatedVRAM() uint64 // Total VRAM across all GPUs
|
EstimatedVRAM() uint64 // Total VRAM across all GPUs
|
||||||
EstimatedTotal() uint64
|
EstimatedTotal() uint64
|
||||||
EstimagedVRAMByGPU(gpuID string) uint64
|
EstimatedVRAMByGPU(gpuID string) uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
// llmServer is an instance of the llama.cpp server
|
// llmServer is an instance of the llama.cpp server
|
||||||
|
@ -1016,7 +1016,7 @@ func (s *llmServer) EstimatedTotal() uint64 {
|
||||||
return s.estimate.TotalSize
|
return s.estimate.TotalSize
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *llmServer) EstimagedVRAMByGPU(gpuID string) uint64 {
|
func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 {
|
||||||
for i, gpu := range s.gpus {
|
for i, gpu := range s.gpus {
|
||||||
if gpu.ID == gpuID {
|
if gpu.ID == gpuID {
|
||||||
return s.estimate.GPUSizes[i]
|
return s.estimate.GPUSizes[i]
|
||||||
|
|
|
@ -182,7 +182,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
||||||
// We want to avoid loading on any GPUs that have other
|
// We want to avoid loading on any GPUs that have other
|
||||||
// models still loading on them to avoid potential races
|
// models still loading on them to avoid potential races
|
||||||
// with VRAM consumption ramping up during load
|
// with VRAM consumption ramping up during load
|
||||||
availGpus := s.filterGPUsWithLoadingModels(gpus)
|
availGpus := s.filterGPUsWithoutLoadingModels(gpus)
|
||||||
|
|
||||||
// Update free memory from currently loaded models
|
// Update free memory from currently loaded models
|
||||||
s.updateFreeSpace(availGpus)
|
s.updateFreeSpace(availGpus)
|
||||||
|
@ -414,9 +414,7 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
|
||||||
r.refMu.Lock()
|
r.refMu.Lock()
|
||||||
if r.llama != nil {
|
if r.llama != nil {
|
||||||
for _, gpu := range allGpus {
|
for _, gpu := range allGpus {
|
||||||
// if slices.Contains(gpuIDs, gpu.ID) {
|
predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimatedVRAMByGPU(gpu.ID)
|
||||||
predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimagedVRAMByGPU(gpu.ID)
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
|
slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
|
||||||
|
@ -448,7 +446,7 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
|
||||||
// to avoid scheduling another model on the same GPU(s) that haven't stabilized.
|
// to avoid scheduling another model on the same GPU(s) that haven't stabilized.
|
||||||
// This routine returns the set of GPUs that do not have an active loading model.
|
// This routine returns the set of GPUs that do not have an active loading model.
|
||||||
// If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
|
// If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
|
||||||
func (s *Scheduler) filterGPUsWithLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
|
func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
|
||||||
ret := append(gpu.GpuInfoList{}, allGpus...)
|
ret := append(gpu.GpuInfoList{}, allGpus...)
|
||||||
s.loadedMu.Lock()
|
s.loadedMu.Lock()
|
||||||
defer s.loadedMu.Unlock()
|
defer s.loadedMu.Unlock()
|
||||||
|
@ -702,5 +700,4 @@ func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML,
|
||||||
// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
|
// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
|
||||||
|
|
||||||
return s.findRunnerToUnload()
|
return s.findRunnerToUnload()
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -156,7 +156,7 @@ func TestRequests(t *testing.T) {
|
||||||
|
|
||||||
// Same model, same request
|
// Same model, same request
|
||||||
scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
|
scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
|
||||||
scenario1a.req.sessionDuration = 0
|
scenario1a.req.sessionDuration = 5 * time.Millisecond
|
||||||
scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
|
scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
|
||||||
scenario1b.req.model = scenario1a.req.model
|
scenario1b.req.model = scenario1a.req.model
|
||||||
scenario1b.ggml = scenario1a.ggml
|
scenario1b.ggml = scenario1a.ggml
|
||||||
|
@ -167,6 +167,7 @@ func TestRequests(t *testing.T) {
|
||||||
tmpModel := *scenario1a.req.model
|
tmpModel := *scenario1a.req.model
|
||||||
scenario2a.req.model = &tmpModel
|
scenario2a.req.model = &tmpModel
|
||||||
scenario2a.ggml = scenario1a.ggml
|
scenario2a.ggml = scenario1a.ggml
|
||||||
|
scenario2a.req.sessionDuration = 5 * time.Millisecond
|
||||||
|
|
||||||
// Multiple loaded models
|
// Multiple loaded models
|
||||||
scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
|
scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
|
||||||
|
@ -316,7 +317,6 @@ func TestGetRunner(t *testing.T) {
|
||||||
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||||
defer done()
|
defer done()
|
||||||
|
|
||||||
// Same model, same request
|
|
||||||
scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
|
scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
|
||||||
scenario1a.req.sessionDuration = 0
|
scenario1a.req.sessionDuration = 0
|
||||||
scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
|
scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
|
||||||
|
@ -475,6 +475,40 @@ func TestUpdateFreeSpace(t *testing.T) {
|
||||||
require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
|
require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
|
||||||
|
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||||
|
defer done()
|
||||||
|
gpus := gpu.GpuInfoList{
|
||||||
|
{
|
||||||
|
Library: "cuda",
|
||||||
|
ID: "0",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Library: "cuda",
|
||||||
|
ID: "1",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
|
||||||
|
|
||||||
|
s := InitScheduler(ctx)
|
||||||
|
s.loadedMu.Lock()
|
||||||
|
s.loaded["a"] = r1
|
||||||
|
s.loadedMu.Unlock()
|
||||||
|
|
||||||
|
tmp := s.filterGPUsWithoutLoadingModels(gpus)
|
||||||
|
require.Len(t, tmp, 1)
|
||||||
|
require.Equal(t, "1", tmp[0].ID)
|
||||||
|
|
||||||
|
r1.gpus = gpu.GpuInfoList{gpus[1]}
|
||||||
|
tmp = s.filterGPUsWithoutLoadingModels(gpus)
|
||||||
|
require.Len(t, tmp, 1)
|
||||||
|
require.Equal(t, "0", tmp[0].ID)
|
||||||
|
|
||||||
|
r1.gpus = gpu.GpuInfoList{}
|
||||||
|
tmp = s.filterGPUsWithoutLoadingModels(gpus)
|
||||||
|
require.Len(t, tmp, 2)
|
||||||
|
}
|
||||||
|
|
||||||
func TestFindRunnerToUnload(t *testing.T) {
|
func TestFindRunnerToUnload(t *testing.T) {
|
||||||
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||||
defer done()
|
defer done()
|
||||||
|
@ -607,4 +641,4 @@ func (s *mockLlm) Close() error {
|
||||||
}
|
}
|
||||||
func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }
|
func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }
|
||||||
func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
|
func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
|
||||||
func (s *mockLlm) EstimagedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
|
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
|
||||||
|
|
Loading…
Add table
Reference in a new issue