Add back memory escape valve

If our memory predictions are wrong, the OLLAMA_MAX_VRAM environment
variable can be used to set a lower memory limit as a workaround. The recent
multi-GPU refactoring accidentally removed it, so this commit adds it back.
This commit is contained in:
Daniel Hiltgen 2024-04-23 17:09:02 -07:00
parent 2ac3dd6853
commit 5445aaa94e

View file

@ -3,6 +3,8 @@ package llm
import ( import (
"fmt" "fmt"
"log/slog" "log/slog"
"os"
"strconv"
"strings" "strings"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
@ -49,6 +51,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
for _, info := range gpus { for _, info := range gpus {
memoryAvailable += info.FreeMemory memoryAvailable += info.FreeMemory
} }
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
if userLimit != "" {
avail, err := strconv.ParseUint(userLimit, 10, 64)
if err != nil {
slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
} else {
slog.Info("user override memory limit", "OLLAMA_MAX_VRAM", avail, "actual", memoryAvailable)
memoryAvailable = avail
}
}
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable)) slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
// TODO - this is probably wrong, first GPU vs secondaries will have different overheads // TODO - this is probably wrong, first GPU vs secondaries will have different overheads