Merge pull request #3678 from ollama/mxyng/fix-darwin-partial-offloading

darwin: no partial offloading if required memory greater than system
This commit is contained in:
Michael Yang 2024-04-16 12:05:56 -07:00 committed by GitHub
commit 7afb2e125a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 17 additions and 9 deletions

View file

@ -32,6 +32,7 @@ func CheckVRAM() (uint64, error) {
// gpu not supported, this may not be metal
return 0, nil
}
return uint64(C.getRecommendedMaxVRAM()), nil
}
@ -52,7 +53,7 @@ func GetGPUInfo() GpuInfo {
func getCPUMem() (memInfo, error) {
return memInfo{
TotalMemory: 0,
TotalMemory: uint64(C.getPhysicalMemory()),
FreeMemory: 0,
DeviceCount: 0,
}, nil

View file

@ -1,3 +1,4 @@
#import <Metal/Metal.h>
#include <stdint.h>
uint64_t getRecommendedMaxVRAM();
uint64_t getPhysicalMemory();

View file

@ -1,11 +1,13 @@
//go:build darwin
// go:build darwin
#include "gpu_info_darwin.h"
uint64_t getRecommendedMaxVRAM()
{
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
uint64_t result = device.recommendedMaxWorkingSetSize;
CFRelease(device);
return result;
uint64_t getRecommendedMaxVRAM() {
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
uint64_t result = device.recommendedMaxWorkingSetSize;
CFRelease(device);
return result;
}
uint64_t getPhysicalMemory() {
return [[NSProcessInfo processInfo] physicalMemory];
}

View file

@ -108,7 +108,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
memoryLayerOutput := layers["output"].size()
memoryRequiredTotal += memoryLayerOutput
if memoryAvailable > memoryRequiredTotal {
if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
// disable partial offloading when model is greater than total system memory
opts.NumGPU = 0
} else if memoryAvailable > memoryRequiredTotal {
layerCount = int(ggml.KV().BlockCount()) + 1
memoryRequiredPartial = memoryRequiredTotal
}