From f0c454ab577f4cf716e23ff56cbc3c316c88a78c Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Wed, 1 May 2024 11:46:03 -0400 Subject: [PATCH] gpu: add 512MiB to darwin minimum, metal doesn't have partial offloading overhead (#4068) --- gpu/gpu_darwin.go | 8 +++++++- llm/memory.go | 5 +++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index 2ff6b351..f8cc1adb 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -10,6 +10,12 @@ package gpu import "C" import ( "runtime" + + "github.com/ollama/ollama/format" +) + +const ( + metalMinimumMemory = 512 * format.MebiByte ) func GetGPUInfo() GpuInfoList { @@ -32,7 +38,7 @@ func GetGPUInfo() GpuInfoList { // TODO is there a way to gather actual allocated video memory? (currentAllocatedSize doesn't work) info.FreeMemory = info.TotalMemory - info.MinimumMemory = 0 + info.MinimumMemory = metalMinimumMemory return []GpuInfo{info} } diff --git a/llm/memory.go b/llm/memory.go index d1e79e26..b705aefe 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -88,6 +88,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts graphFullOffload *= uint64(len(gpus)) graphPartialOffload *= uint64(len(gpus)) + // on metal there's no partial offload overhead + if gpus[0].Library == "metal" { + graphPartialOffload = graphFullOffload + } + // memoryRequiredTotal represents the memory required for full GPU offloading (all layers) memoryRequiredTotal := memoryMinimum + graphFullOffload