Add back memory escape valve
If we get our predictions wrong, this can be used to set a lower memory limit as a workaround. The recent multi-GPU refactoring accidentally removed it, so this adds it back.
This commit is contained in:
parent
2ac3dd6853
commit
5445aaa94e
1 changed file with 13 additions and 0 deletions
|
@@ -3,6 +3,8 @@ package llm
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
|
@@ -49,6 +51,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
for _, info := range gpus {
|
for _, info := range gpus {
|
||||||
memoryAvailable += info.FreeMemory
|
memoryAvailable += info.FreeMemory
|
||||||
}
|
}
|
||||||
|
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
|
||||||
|
if userLimit != "" {
|
||||||
|
avail, err := strconv.ParseUint(userLimit, 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
|
||||||
|
} else {
|
||||||
|
slog.Info("user override memory limit", "OLLAMA_MAX_VRAM", avail, "actual", memoryAvailable)
|
||||||
|
memoryAvailable = avail
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
|
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
|
||||||
|
|
||||||
// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
|
// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
|
||||||
|
|
Loading…
Reference in a new issue