Merge pull request #3678 from ollama/mxyng/fix-darwin-partial-offloading
darwin: no partial offloading if required memory greater than system
This commit is contained in:
commit
7afb2e125a
4 changed files with 17 additions and 9 deletions
|
@ -32,6 +32,7 @@ func CheckVRAM() (uint64, error) {
|
|||
// gpu not supported, this may not be metal
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
return uint64(C.getRecommendedMaxVRAM()), nil
|
||||
}
|
||||
|
||||
|
@ -52,7 +53,7 @@ func GetGPUInfo() GpuInfo {
|
|||
|
||||
func getCPUMem() (memInfo, error) {
|
||||
return memInfo{
|
||||
TotalMemory: 0,
|
||||
TotalMemory: uint64(C.getPhysicalMemory()),
|
||||
FreeMemory: 0,
|
||||
DeviceCount: 0,
|
||||
}, nil
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
#import <Metal/Metal.h>
|
||||
#include <stdint.h>
|
||||
uint64_t getRecommendedMaxVRAM();
|
||||
uint64_t getPhysicalMemory();
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
//go:build darwin
|
||||
// go:build darwin
|
||||
#include "gpu_info_darwin.h"
|
||||
|
||||
uint64_t getRecommendedMaxVRAM()
|
||||
{
|
||||
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
|
||||
uint64_t result = device.recommendedMaxWorkingSetSize;
|
||||
CFRelease(device);
|
||||
return result;
|
||||
uint64_t getRecommendedMaxVRAM() {
|
||||
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
|
||||
uint64_t result = device.recommendedMaxWorkingSetSize;
|
||||
CFRelease(device);
|
||||
return result;
|
||||
}
|
||||
|
||||
uint64_t getPhysicalMemory() {
|
||||
return [[NSProcessInfo processInfo] physicalMemory];
|
||||
}
|
||||
|
|
|
@ -108,7 +108,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
|||
|
||||
memoryLayerOutput := layers["output"].size()
|
||||
memoryRequiredTotal += memoryLayerOutput
|
||||
if memoryAvailable > memoryRequiredTotal {
|
||||
|
||||
if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
|
||||
// disable partial offloading when model is greater than total system memory
|
||||
opts.NumGPU = 0
|
||||
} else if memoryAvailable > memoryRequiredTotal {
|
||||
layerCount = int(ggml.KV().BlockCount()) + 1
|
||||
memoryRequiredPartial = memoryRequiredTotal
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue