From 26df67478590b950ba56d1a3af7f592393e2e3fd Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Tue, 16 Apr 2024 14:44:13 -0700
Subject: [PATCH] scale graph based on gpu count

---
 gpu/gpu_darwin.go | 2 +-
 llm/server.go     | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go
index 39e257e4..bf764ce6 100644
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -55,6 +55,6 @@ func getCPUMem() (memInfo, error) {
 	return memInfo{
 		TotalMemory: uint64(C.getPhysicalMemory()),
 		FreeMemory:  0,
-		DeviceCount: 0,
+		DeviceCount: 1,
 	}, nil
 }
diff --git a/llm/server.go b/llm/server.go
index 0bbdebce..4e808085 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -79,6 +79,9 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 		graphFullOffload = graphPartialOffload
 	}
 
+	graphFullOffload *= uint64(info.DeviceCount)
+	graphPartialOffload *= uint64(info.DeviceCount)
+
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
 	memoryRequiredTotal := memoryMinimum + graphFullOffload
 