From 26df67478590b950ba56d1a3af7f592393e2e3fd Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 16 Apr 2024 14:44:13 -0700
Subject: [PATCH] scale graph based on gpu count

---
 gpu/gpu_darwin.go | 2 +-
 llm/server.go     | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go
index 39e257e4..bf764ce6 100644
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -55,6 +55,6 @@ func getCPUMem() (memInfo, error) {
 	return memInfo{
 		TotalMemory: uint64(C.getPhysicalMemory()),
 		FreeMemory:  0,
-		DeviceCount: 0,
+		DeviceCount: 1,
 	}, nil
 }
diff --git a/llm/server.go b/llm/server.go
index 0bbdebce..4e808085 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -79,6 +79,9 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 		graphFullOffload = graphPartialOffload
 	}
 
+	graphFullOffload *= uint64(info.DeviceCount)
+	graphPartialOffload *= uint64(info.DeviceCount)
+
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
 	memoryRequiredTotal := memoryMinimum + graphFullOffload