diff --git a/gpu/gpu.go b/gpu/gpu.go
index b9cb538a..143e1467 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -131,10 +131,11 @@ func getCPUMem() (memInfo, error) {
 func CheckVRAM() (int64, error) {
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		// allocate 384MiB for llama.cpp overhead (outside of model)
-		overhead := uint64(384 * 1024 * 1024)
-		if gpuInfo.FreeMemory <= overhead {
-			return 0, nil
+		// leave 10% or 400MiB of VRAM free for overhead
+		overhead := gpuInfo.FreeMemory / 10
+		minOverhead := uint64(400 * 1024 * 1024)
+		if overhead < minOverhead {
+			overhead = minOverhead
 		}
 		return int64(gpuInfo.FreeMemory - overhead), nil
diff --git a/llm/llm.go b/llm/llm.go
index 1026debc..a0230205 100644
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -117,6 +117,7 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 	bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
 	log.Println("bytes per layer:", bytesPerLayer)
 	layers := available / bytesPerLayer
+	log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
 	if layers < int64(opts.NumGPU) {
 		opts.NumGPU = int(layers)
 	}
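For reference, a minimal standalone sketch of the new reserve policy from the `CheckVRAM` hunk above: keep 10% of free VRAM, but never less than 400MiB, before reporting what is available to the model. The helper name `overheadFor` and the sample free-memory sizes are illustrative only, not part of this change.

```go
package main

import "fmt"

// overheadFor mirrors the reserve policy in the CheckVRAM change above:
// reserve 10% of free VRAM, with a 400MiB floor, for llama.cpp overhead.
func overheadFor(freeMemory uint64) uint64 {
	overhead := freeMemory / 10
	minOverhead := uint64(400 * 1024 * 1024)
	if overhead < minOverhead {
		overhead = minOverhead
	}
	return overhead
}

func main() {
	// Illustrative free-memory sizes: 2GiB, 8GiB, and 24GiB cards.
	for _, free := range []uint64{2 << 30, 8 << 30, 24 << 30} {
		reserve := overheadFor(free)
		fmt.Printf("free %5d MiB -> reserve %4d MiB, usable %5d MiB\n",
			free>>20, reserve>>20, (free-reserve)>>20)
	}
}
```

Compared with the previous flat 384MiB reservation, this scales the reserve with larger cards while keeping a 400MiB floor for smaller ones.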