diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index 39e257e4..bf764ce6 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -55,6 +55,6 @@ func getCPUMem() (memInfo, error) { return memInfo{ TotalMemory: uint64(C.getPhysicalMemory()), FreeMemory: 0, - DeviceCount: 0, + DeviceCount: 1, }, nil } diff --git a/llm/server.go b/llm/server.go index 0bbdebce..4e808085 100644 --- a/llm/server.go +++ b/llm/server.go @@ -79,6 +79,9 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option graphFullOffload = graphPartialOffload } + graphFullOffload *= uint64(info.DeviceCount) + graphPartialOffload *= uint64(info.DeviceCount) + // memoryRequiredTotal represents the memory required for full GPU offloading (all layers) memoryRequiredTotal := memoryMinimum + graphFullOffload