From 5bf5aeec0140a70eeb94b65c61dbb3b75ff33e56 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 20 Jun 2024 11:07:04 -0700 Subject: [PATCH] Refine mmap default logic on linux If we try to use mmap when the model is larger than the system's free memory, loading is slower than the no-mmap approach. --- llm/server.go | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/llm/server.go b/llm/server.go index ed5f288f..da83416e 100644 --- a/llm/server.go +++ b/llm/server.go @@ -81,7 +81,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr var err error var cpuRunner string var estimate MemoryEstimate - var systemMemory uint64 + var systemTotalMemory uint64 + var systemFreeMemory uint64 + + systemMemInfo, err := gpu.GetCPUMem() + if err != nil { + slog.Error("failed to lookup system memory", "error", err) + } else { + systemTotalMemory = systemMemInfo.TotalMemory + systemFreeMemory = systemMemInfo.FreeMemory + slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", systemFreeMemory) + } // If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info if opts.NumGPU == 0 { @@ -91,19 +101,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr cpuRunner = serverForCpu() estimate = EstimateGPULayers(gpus, ggml, projectors, opts) } else { - if gpus[0].Library == "metal" { - memInfo, err := gpu.GetCPUMem() - if err != nil { - slog.Error("failed to lookup system memory", "error", err) - } else { - systemMemory = memInfo.TotalMemory - slog.Debug("system memory", "total", format.HumanBytes2(systemMemory)) - } - } estimate = EstimateGPULayers(gpus, ggml, projectors, opts) switch { - case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory: + case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory: // disable partial offloading when model is greater than total system memory as this // can lead to 
locking up the system opts.NumGPU = 0 @@ -211,7 +212,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } // Windows CUDA should not use mmap for best performance - if (runtime.GOOS == "windows" && gpus[0].Library == "cuda") || opts.UseMMap == api.TriStateFalse { + // Linux with a model larger than free space, mmap leads to thrashing + if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) || + (runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) || + opts.UseMMap == api.TriStateFalse { params = append(params, "--no-mmap") }