Refine mmap default logic on Linux
If we use mmap to load a model that is larger than the system's free memory, loading is slower than with the no-mmap approach.
This commit is contained in:
parent e01e535cbb
commit 5bf5aeec01
1 changed file with 16 additions and 12 deletions
@@ -81,7 +81,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	var err error
 	var cpuRunner string
 	var estimate MemoryEstimate
-	var systemMemory uint64
+	var systemTotalMemory uint64
+	var systemFreeMemory uint64
+
+	systemMemInfo, err := gpu.GetCPUMem()
+	if err != nil {
+		slog.Error("failed to lookup system memory", "error", err)
+	} else {
+		systemTotalMemory = systemMemInfo.TotalMemory
+		systemFreeMemory = systemMemInfo.FreeMemory
+		slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", systemFreeMemory)
+	}
 
 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
 	if opts.NumGPU == 0 {
@@ -91,19 +101,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		cpuRunner = serverForCpu()
 		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
-		if gpus[0].Library == "metal" {
-			memInfo, err := gpu.GetCPUMem()
-			if err != nil {
-				slog.Error("failed to lookup system memory", "error", err)
-			} else {
-				systemMemory = memInfo.TotalMemory
-				slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
-			}
-		}
 		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 
 		switch {
-		case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory:
+		case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			opts.NumGPU = 0
@@ -211,7 +212,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	}
 
 	// Windows CUDA should not use mmap for best performance
-	if (runtime.GOOS == "windows" && gpus[0].Library == "cuda") || opts.UseMMap == api.TriStateFalse {
+	// Linux with a model larger than free space, mmap leads to thrashing
+	if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
+		(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
+		opts.UseMMap == api.TriStateFalse {
 		params = append(params, "--no-mmap")
 	}
 
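The first hunk reads free memory through Ollama's internal gpu.GetCPUMem helper. On Linux, a figure like FreeMemory conventionally comes from the MemAvailable field of /proc/meminfo. The sketch below is an illustrative stand-in for that lookup under that assumption, not the project's actual implementation; memAvailable is a hypothetical helper name.

package main

import (
	"bufio"
	"fmt"
	"os"
	"strconv"
	"strings"
)

// memAvailable parses the MemAvailable line (reported in kB) from
// /proc/meminfo and returns the value in bytes. Illustrative stand-in
// for what a helper like gpu.GetCPUMem would report as FreeMemory.
func memAvailable() (uint64, error) {
	f, err := os.Open("/proc/meminfo")
	if err != nil {
		return 0, err
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "MemAvailable:") {
			continue
		}
		fields := strings.Fields(line) // e.g. ["MemAvailable:", "16270004", "kB"]
		if len(fields) < 2 {
			break
		}
		kb, err := strconv.ParseUint(fields[1], 10, 64)
		if err != nil {
			return 0, err
		}
		return kb * 1024, nil
	}
	return 0, fmt.Errorf("MemAvailable not found in /proc/meminfo")
}

func main() {
	free, err := memAvailable()
	if err != nil {
		fmt.Fprintln(os.Stderr, "failed to lookup system memory:", err)
		return
	}
	fmt.Printf("free: %d bytes\n", free)
}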
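Pulled out of NewLlamaServer, the final hunk's condition resolves to a small predicate: an explicit UseMMap=false always disables mmap, while the Windows-CUDA case and the new Linux low-free-memory case only change the default when the user left the option unset. A minimal, self-contained sketch of that decision; noMMap and its parameters are illustrative names, not Ollama's API.

package main

import (
	"fmt"
	"runtime"
)

// TriState mirrors the three-valued mmap option in the diff:
// unset (default), explicitly true, or explicitly false.
type TriState int

const (
	TriStateUndefined TriState = iota
	TriStateTrue
	TriStateFalse
)

// noMMap reports whether the runner should be started with --no-mmap.
// It follows the final hunk above: Windows+CUDA and a Linux system with
// less free memory than the model's total size only override the default
// when UseMMap was left unset; an explicit false always wins.
func noMMap(gpuLibrary string, useMMap TriState, freeMemory, totalModelSize uint64) bool {
	return (runtime.GOOS == "windows" && gpuLibrary == "cuda" && useMMap == TriStateUndefined) ||
		(runtime.GOOS == "linux" && freeMemory < totalModelSize && useMMap == TriStateUndefined) ||
		useMMap == TriStateFalse
}

func main() {
	// A 7 GiB model with only 4 GiB free: on Linux this now defaults
	// to --no-mmap instead of mmap-and-thrash.
	fmt.Println(noMMap("cuda", TriStateUndefined, 4<<30, 7<<30))
}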