diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index eac55c42..97907511 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -34,7 +34,7 @@ func GetGPUInfo() GpuInfo { mem, _ := getCPUMem() if runtime.GOARCH == "amd64" { return GpuInfo{ - Library: "default", + Library: "cpu", Variant: GetCPUVariant(), memInfo: mem, } diff --git a/llm/llm.go b/llm/llm.go index d7667675..613519f0 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -2,6 +2,7 @@ package llm import ( "context" + "fmt" "log" "os" "runtime" @@ -50,7 +51,6 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) graph := int64(ggml.NumGQA()) * kv / 6 info := gpu.GetGPUInfo() - library := info.Library switch runtime.GOOS { case "darwin": if opts.NumGPU == 0 { @@ -59,13 +59,15 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) if size+kv+graph > vram { log.Println("not enough vram available, falling back to CPU only") + info.Library = "cpu" + info.Variant = gpu.GetCPUVariant() opts.NumGPU = 0 break } opts.NumGPU = 1 default: - if library == "cpu" || library == "default" { + if info.Library == "cpu" { log.Println("GPU not available, falling back to CPU") opts.NumGPU = 0 break @@ -73,7 +75,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) // don't use GPU at all if no layers are loaded if opts.NumGPU == 0 { - library = "cpu" + info.Library = "cpu" + info.Variant = gpu.GetCPUVariant() break } @@ -100,7 +103,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) min := graph + kv*layers/maxlayers if layers <= 0 || min > avg { log.Printf("not enough vram available, falling back to CPU only") - library = "cpu" + info.Library = "cpu" + info.Variant = gpu.GetCPUVariant() opts.NumGPU = 0 break } @@ -110,8 +114,7 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) opts.RopeFrequencyBase = 0.0 opts.RopeFrequencyScale = 0.0 - gpuInfo := gpu.GetGPUInfo() - return newLlmServer(gpuInfo, model, adapters, projectors, opts) + return newLlmServer(info, model, adapters, projectors, opts) } // Give any native cgo implementations an opportunity to initialize diff --git a/llm/payload_common.go b/llm/payload_common.go index f6976768..0ae02be1 100644 --- a/llm/payload_common.go +++ b/llm/payload_common.go @@ -28,6 +28,13 @@ func getDynLibs(gpuInfo gpu.GpuInfo) []string { if gpuInfo.Library == "default" { return []string{"default"} } + // TODO - temporary until we have multiple CPU variations for Darwin + // Short circuit on darwin with metal only + if len(availableDynLibs) == 1 { + if _, onlyMetal := availableDynLibs["metal"]; onlyMetal { + return []string{availableDynLibs["metal"]} + } + } exactMatch := "" dynLibs := []string{} diff --git a/llm/payload_test.go b/llm/payload_test.go index 7a644713..44537b0a 100644 --- a/llm/payload_test.go +++ b/llm/payload_test.go @@ -16,39 +16,43 @@ func TestGetDynLibs(t *testing.T) { assert.Len(t, res, 1) assert.Equal(t, availableDynLibs["cpu"], res[0]) + variant := gpu.GetCPUVariant() + if variant != "" { + variant = "_" + variant + } availableDynLibs = map[string]string{ - "rocm_v5": "X_rocm_v5", - "rocm_v6": "X_rocm_v6", - "cpu": "X_cpu", + "rocm_v5": "X_rocm_v5", + "rocm_v6": "X_rocm_v6", + "cpu" + variant: "X_cpu", } assert.Equal(t, true, rocmDynLibPresent()) res = getDynLibs(gpu.GpuInfo{Library: "rocm"}) assert.Len(t, res, 3) assert.Equal(t, availableDynLibs["rocm_v5"], res[0]) assert.Equal(t, availableDynLibs["rocm_v6"], res[1]) - assert.Equal(t, availableDynLibs["cpu"], res[2]) + assert.Equal(t, availableDynLibs["cpu"+variant], res[2]) res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) assert.Len(t, res, 3) assert.Equal(t, availableDynLibs["rocm_v6"], res[0]) assert.Equal(t, availableDynLibs["rocm_v5"], res[1]) - assert.Equal(t, availableDynLibs["cpu"], res[2]) + assert.Equal(t, availableDynLibs["cpu"+variant], res[2]) res = getDynLibs(gpu.GpuInfo{Library: "cuda"}) assert.Len(t, res, 1) - assert.Equal(t, availableDynLibs["cpu"], res[0]) + assert.Equal(t, availableDynLibs["cpu"+variant], res[0]) res = getDynLibs(gpu.GpuInfo{Library: "default"}) assert.Len(t, res, 1) assert.Equal(t, "default", res[0]) availableDynLibs = map[string]string{ - "rocm": "X_rocm_v5", - "cpu": "X_cpu", + "rocm": "X_rocm_v5", + "cpu" + variant: "X_cpu", } assert.Equal(t, true, rocmDynLibPresent()) res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) assert.Len(t, res, 2) assert.Equal(t, availableDynLibs["rocm"], res[0]) - assert.Equal(t, availableDynLibs["cpu"], res[1]) + assert.Equal(t, availableDynLibs["cpu"+variant], res[1]) }