From 5445aaa94e6f4c5626222177ed53ebd07225dd9a Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Tue, 23 Apr 2024 17:09:02 -0700
Subject: [PATCH] Add back memory escape valve

If we get our predictions wrong, this can be used to set a lower memory
limit as a workaround.  Recent multi-gpu refactoring accidentally removed
it, so this adds it back.
---
 llm/memory.go | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/llm/memory.go b/llm/memory.go
index 0dff54d3..dc620828 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -3,6 +3,8 @@ package llm
 import (
 	"fmt"
 	"log/slog"
+	"os"
+	"strconv"
 	"strings"
 
 	"github.com/ollama/ollama/api"
@@ -49,6 +51,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	for _, info := range gpus {
 		memoryAvailable += info.FreeMemory
 	}
+	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
+	if userLimit != "" {
+		avail, err := strconv.ParseUint(userLimit, 10, 64)
+		if err != nil {
+			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
+		} else {
+			slog.Info("user override memory limit", "OLLAMA_MAX_VRAM", avail, "actual", memoryAvailable)
+			memoryAvailable = avail
+		}
+	}
+
 	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
 
 	// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
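
For anyone trying the change out, the added block reduces to a small, self-contained pattern: read OLLAMA_MAX_VRAM, parse it with strconv.ParseUint, and substitute it for the detected free VRAM when it parses cleanly. A minimal standalone sketch of that behavior follows; the applyVRAMOverride helper and the 24 GiB figure are illustrative only and are not part of the patch, which inlines this logic in EstimateGPULayers.

package main

import (
	"fmt"
	"os"
	"strconv"
)

// applyVRAMOverride mirrors the patch's escape valve: if OLLAMA_MAX_VRAM is
// set to a valid unsigned integer (a byte count), it replaces the detected
// available memory; otherwise the detected value is kept unchanged.
// (applyVRAMOverride is a hypothetical helper used here for illustration.)
func applyVRAMOverride(detected uint64) uint64 {
	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
	if userLimit == "" {
		return detected
	}
	avail, err := strconv.ParseUint(userLimit, 10, 64)
	if err != nil {
		fmt.Printf("invalid OLLAMA_MAX_VRAM %q, ignoring: %v\n", userLimit, err)
		return detected
	}
	return avail
}

func main() {
	// Pretend the GPUs report 24 GiB free (illustrative value only).
	detected := uint64(24) << 30
	fmt.Println("memoryAvailable =", applyVRAMOverride(detected))
}

Assuming the value is interpreted the same way as gpu.GpuInfo.FreeMemory (raw bytes, as the comparison against memoryAvailable suggests), starting the server with something like OLLAMA_MAX_VRAM=8589934592 would cap the estimator at 8 GiB when the automatic prediction overshoots.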