Merge pull request #3678 from ollama/mxyng/fix-darwin-partial-offloading
darwin: no partial offloading if required memory is greater than system memory
This commit is contained in:
commit
7afb2e125a
4 changed files with 17 additions and 9 deletions
|
@ -32,6 +32,7 @@ func CheckVRAM() (uint64, error) {
|
||||||
// gpu not supported, this may not be metal
|
// gpu not supported, this may not be metal
|
||||||
return 0, nil
|
return 0, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return uint64(C.getRecommendedMaxVRAM()), nil
|
return uint64(C.getRecommendedMaxVRAM()), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -52,7 +53,7 @@ func GetGPUInfo() GpuInfo {
|
||||||
|
|
||||||
func getCPUMem() (memInfo, error) {
|
func getCPUMem() (memInfo, error) {
|
||||||
return memInfo{
|
return memInfo{
|
||||||
TotalMemory: 0,
|
TotalMemory: uint64(C.getPhysicalMemory()),
|
||||||
FreeMemory: 0,
|
FreeMemory: 0,
|
||||||
DeviceCount: 0,
|
DeviceCount: 0,
|
||||||
}, nil
|
}, nil
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
#import <Metal/Metal.h>
|
#import <Metal/Metal.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
uint64_t getRecommendedMaxVRAM();
|
uint64_t getRecommendedMaxVRAM();
|
||||||
|
uint64_t getPhysicalMemory();
|
||||||
|
|
|
@ -1,11 +1,13 @@
|
||||||
//go:build darwin
|
// go:build darwin
|
||||||
#include "gpu_info_darwin.h"
|
#include "gpu_info_darwin.h"
|
||||||
|
|
||||||
uint64_t getRecommendedMaxVRAM()
|
uint64_t getRecommendedMaxVRAM() {
|
||||||
{
|
|
||||||
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
|
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
|
||||||
uint64_t result = device.recommendedMaxWorkingSetSize;
|
uint64_t result = device.recommendedMaxWorkingSetSize;
|
||||||
CFRelease(device);
|
CFRelease(device);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint64_t getPhysicalMemory() {
|
||||||
|
return [[NSProcessInfo processInfo] physicalMemory];
|
||||||
|
}
|
||||||
|
|
|
@ -108,7 +108,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||||
|
|
||||||
memoryLayerOutput := layers["output"].size()
|
memoryLayerOutput := layers["output"].size()
|
||||||
memoryRequiredTotal += memoryLayerOutput
|
memoryRequiredTotal += memoryLayerOutput
|
||||||
if memoryAvailable > memoryRequiredTotal {
|
|
||||||
|
if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
|
||||||
|
// disable partial offloading when model is greater than total system memory
|
||||||
|
opts.NumGPU = 0
|
||||||
|
} else if memoryAvailable > memoryRequiredTotal {
|
||||||
layerCount = int(ggml.KV().BlockCount()) + 1
|
layerCount = int(ggml.KV().BlockCount()) + 1
|
||||||
memoryRequiredPartial = memoryRequiredTotal
|
memoryRequiredPartial = memoryRequiredTotal
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue