Merge pull request #3678 from ollama/mxyng/fix-darwin-partial-offloading

darwin: no partial offloading if required memory greater than system
2024-04-16 12:05:56 -07:00 · 2024-04-16 12:05:56 -07:00 · 7afb2e125a
commit 7afb2e125a
parent f335722275 41a272de9f
4 changed files with 17 additions and 9 deletions
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -32,6 +32,7 @@ func CheckVRAM() (uint64, error) {
 		// gpu not supported, this may not be metal
 		return 0, nil
 	}
 	return uint64(C.getRecommendedMaxVRAM()), nil
 }
@@ -52,7 +53,7 @@ func GetGPUInfo() GpuInfo {
 func getCPUMem() (memInfo, error) {
 	return memInfo{
-		TotalMemory: 0,
+		TotalMemory: uint64(C.getPhysicalMemory()),
 		FreeMemory:  0,
 		DeviceCount: 0,
 	}, nil
--- a/gpu/gpu_info_darwin.h
+++ b/gpu/gpu_info_darwin.h
@@ -1,3 +1,4 @@
 #import <Metal/Metal.h>
 #include <stdint.h>
 // Returns the recommendedMaxWorkingSetSize of the system default Metal
 // device; defined in gpu_info_darwin.m.
 uint64_t getRecommendedMaxVRAM();
 // Returns the physicalMemory value reported by NSProcessInfo; defined in
 // gpu_info_darwin.m.
 uint64_t getPhysicalMemory();
--- a/gpu/gpu_info_darwin.m
+++ b/gpu/gpu_info_darwin.m
@@ -1,11 +1,13 @@
 // go:build darwin
 #include "gpu_info_darwin.h"
-uint64_t getRecommendedMaxVRAM()
+uint64_t getRecommendedMaxVRAM() {
 {
  // Query the system default Metal device and report its
  // recommendedMaxWorkingSetSize as an unsigned 64-bit value.
  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
  uint64_t result = device.recommendedMaxWorkingSetSize;
  // The size has already been copied into result, so the device reference
  // can be released before returning.
  // NOTE(review): MTLCreateSystemDefaultDevice may yield nil when no Metal
  // device is available, and CFRelease on a NULL reference is a crash --
  // confirm callers only reach this function when Metal is supported.
  CFRelease(device);
  return result;
 }
 // Returns the physicalMemory property of the shared NSProcessInfo object
 // as an unsigned 64-bit value.
 uint64_t getPhysicalMemory() {
  return [[NSProcessInfo processInfo] physicalMemory];
 }
--- a/llm/server.go
+++ b/llm/server.go
@@ -108,7 +108,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 	memoryLayerOutput := layers["output"].size()
 	memoryRequiredTotal += memoryLayerOutput
-	if memoryAvailable > memoryRequiredTotal {
+
 	if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
 		// disable partial offloading when model is greater than total system memory
 		opts.NumGPU = 0
 	} else if memoryAvailable > memoryRequiredTotal {
 		layerCount = int(ggml.KV().BlockCount()) + 1
 		memoryRequiredPartial = memoryRequiredTotal
 	}