Merge pull request #3678 from ollama/mxyng/fix-darwin-partial-offloading

darwin: no partial offloading if required memory greater than system
2024-04-16 12:05:56 -07:00 · 2024-04-16 12:05:56 -07:00 · 7afb2e125a
commit 7afb2e125a
parent f335722275 41a272de9f
4 changed files with 17 additions and 9 deletions
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@ -32,6 +32,7 @@ func CheckVRAM() (uint64, error) {
 		// gpu not supported, this may not be metal
 		return 0, nil
 	}
+
 	return uint64(C.getRecommendedMaxVRAM()), nil
 }

@ -52,7 +53,7 @@ func GetGPUInfo() GpuInfo {

 func getCPUMem() (memInfo, error) {
 	return memInfo{
-		TotalMemory: 0,
+		TotalMemory: uint64(C.getPhysicalMemory()),
 		FreeMemory:  0,
 		DeviceCount: 0,
 	}, nil
--- a/gpu/gpu_info_darwin.h
+++ b/gpu/gpu_info_darwin.h
@ -1,3 +1,4 @@
 #import <Metal/Metal.h>
 #include <stdint.h>
 uint64_t getRecommendedMaxVRAM();
+uint64_t getPhysicalMemory();
--- a/gpu/gpu_info_darwin.m
+++ b/gpu/gpu_info_darwin.m
@ -1,11 +1,13 @@
 //go:build darwin
 #include "gpu_info_darwin.h"

-uint64_t getRecommendedMaxVRAM()
-{
-	id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-	uint64_t result = device.recommendedMaxWorkingSetSize;
-	CFRelease(device);
-	return result;
+// Returns recommendedMaxWorkingSetSize of the system default Metal device,
+// or 0 when no Metal device could be created.
+uint64_t getRecommendedMaxVRAM() {
+  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+  if (device == nil) {
+    // No Metal device available. Bail out before CFRelease, which crashes
+    // when handed a NULL reference.
+    return 0;
+  }
+
+  uint64_t result = device.recommendedMaxWorkingSetSize;
+  // Balance the reference returned by MTLCreateSystemDefaultDevice.
+  CFRelease(device);
+  return result;
 }

+// Returns the physicalMemory value reported by the shared NSProcessInfo.
+uint64_t getPhysicalMemory() {
+  NSProcessInfo *processInfo = [NSProcessInfo processInfo];
+  return processInfo.physicalMemory;
+}
--- a/llm/server.go
+++ b/llm/server.go
@ -108,7 +108,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option

 	memoryLayerOutput := layers["output"].size()
 	memoryRequiredTotal += memoryLayerOutput
-	if memoryAvailable > memoryRequiredTotal {
+
+	if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
+		// disable partial offloading when model is greater than total system memory
+		opts.NumGPU = 0
+	} else if memoryAvailable > memoryRequiredTotal {
 		layerCount = int(ggml.KV().BlockCount()) + 1
 		memoryRequiredPartial = memoryRequiredTotal
 	}