From f0c454ab577f4cf716e23ff56cbc3c316c88a78c Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Wed, 1 May 2024 11:46:03 -0400
Subject: [PATCH] gpu: add 512MiB to darwin minimum, metal doesn't have partial
 offloading overhead (#4068)

---
 gpu/gpu_darwin.go | 8 +++++++-
 llm/memory.go     | 5 +++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go
index 2ff6b351..f8cc1adb 100644
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -10,6 +10,12 @@ package gpu
 import "C"
 import (
 	"runtime"
+
+	"github.com/ollama/ollama/format"
+)
+
+const (
+	metalMinimumMemory = 512 * format.MebiByte // 512 MiB; GetGPUInfo reports this as info.MinimumMemory
 )
 
 func GetGPUInfo() GpuInfoList {
@@ -32,7 +38,7 @@ func GetGPUInfo() GpuInfoList {
 	// TODO is there a way to gather actual allocated video memory? (currentAllocatedSize doesn't work)
 	info.FreeMemory = info.TotalMemory
 
-	info.MinimumMemory = 0
+	info.MinimumMemory = metalMinimumMemory
 	return []GpuInfo{info}
 }
 
diff --git a/llm/memory.go b/llm/memory.go
index d1e79e26..b705aefe 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -88,6 +88,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	graphFullOffload *= uint64(len(gpus))
 	graphPartialOffload *= uint64(len(gpus))
 
+	// Metal has no partial-offload overhead, so the partial graph size equals the full-offload size.
+	if gpus[0].Library == "metal" {
+		graphPartialOffload = graphFullOffload
+	}
+
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
 	memoryRequiredTotal := memoryMinimum + graphFullOffload