Merge pull request #3678 from ollama/mxyng/fix-darwin-partial-offloading

darwin: no partial offloading if required memory greater than system
This commit is contained in:
Michael Yang 2024-04-16 12:05:56 -07:00 committed by GitHub
commit 7afb2e125a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 17 additions and 9 deletions

View file

@ -32,6 +32,7 @@ func CheckVRAM() (uint64, error) {
// gpu not supported, this may not be metal // gpu not supported, this may not be metal
return 0, nil return 0, nil
} }
return uint64(C.getRecommendedMaxVRAM()), nil return uint64(C.getRecommendedMaxVRAM()), nil
} }
@ -52,7 +53,7 @@ func GetGPUInfo() GpuInfo {
func getCPUMem() (memInfo, error) { func getCPUMem() (memInfo, error) {
return memInfo{ return memInfo{
TotalMemory: 0, TotalMemory: uint64(C.getPhysicalMemory()),
FreeMemory: 0, FreeMemory: 0,
DeviceCount: 0, DeviceCount: 0,
}, nil }, nil

View file

@ -1,3 +1,4 @@
#import <Metal/Metal.h> #import <Metal/Metal.h>
#include <stdint.h> #include <stdint.h>
uint64_t getRecommendedMaxVRAM(); uint64_t getRecommendedMaxVRAM();
uint64_t getPhysicalMemory();

View file

@ -1,11 +1,13 @@
//go:build darwin // go:build darwin
#include "gpu_info_darwin.h" #include "gpu_info_darwin.h"
// Returns the recommended maximum working-set size, in bytes, reported by the
// system default Metal device, or 0 when no Metal device is available.
uint64_t getRecommendedMaxVRAM() {
  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
  if (device == nil) {
    // No Metal-capable GPU on this system. Bail out before the release below:
    // messaging nil would quietly yield 0, but CFRelease(NULL) crashes.
    return 0;
  }
  uint64_t result = device.recommendedMaxWorkingSetSize;
  // MTLCreateSystemDefaultDevice() hands back an owned reference; balance it
  // manually (this file releases by hand rather than relying on ARC).
  CFRelease(device);
  return result;
}
// Reports the amount of physical memory on this machine, in bytes, as exposed
// by the process-wide NSProcessInfo object.
uint64_t getPhysicalMemory() {
  NSProcessInfo *processInfo = [NSProcessInfo processInfo];
  return processInfo.physicalMemory;
}

View file

@ -108,7 +108,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
memoryLayerOutput := layers["output"].size() memoryLayerOutput := layers["output"].size()
memoryRequiredTotal += memoryLayerOutput memoryRequiredTotal += memoryLayerOutput
if memoryAvailable > memoryRequiredTotal {
if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
// disable partial offloading when model is greater than total system memory
opts.NumGPU = 0
} else if memoryAvailable > memoryRequiredTotal {
layerCount = int(ggml.KV().BlockCount()) + 1 layerCount = int(ggml.KV().BlockCount()) + 1
memoryRequiredPartial = memoryRequiredTotal memoryRequiredPartial = memoryRequiredTotal
} }