From cc269ba0943ee1fa0bddcce8027d0a6d1b86fec5 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Mon, 22 Jul 2024 09:08:11 -0700
Subject: [PATCH] Remove no longer supported max vram var

The OLLAMA_MAX_VRAM env var was a temporary workaround for OOM
scenarios. With Concurrency this was no longer wired up, and the
simplistic value doesn't map to multi-GPU setups. Users can still set
`num_gpu` to limit memory usage to avoid OOM if we get our predictions
wrong.
---
 cmd/cmd.go                      |  1 -
 envconfig/config.go             | 13 -------------
 integration/concurrency_test.go |  4 ++--
 3 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index 2252a905..b761d018 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1344,7 +1344,6 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_TMPDIR"],
 				envVars["OLLAMA_FLASH_ATTENTION"],
 				envVars["OLLAMA_LLM_LIBRARY"],
-				envVars["OLLAMA_MAX_VRAM"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)
diff --git a/envconfig/config.go b/envconfig/config.go
index 62d661eb..0abc6968 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -43,8 +43,6 @@ var (
 	MaxRunners int
 	// Set via OLLAMA_MAX_QUEUE in the environment
 	MaxQueuedRequests int
-	// Set via OLLAMA_MAX_VRAM in the environment
-	MaxVRAM uint64
 	// Set via OLLAMA_MODELS in the environment
 	ModelsDir string
 	// Set via OLLAMA_NOHISTORY in the environment
@@ -89,7 +87,6 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
 		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
-		"OLLAMA_MAX_VRAM":          {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"},
 		"OLLAMA_MODELS":            {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
 		"OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
 		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
@@ -194,16 +191,6 @@ func LoadConfig() {
 
 	TmpDir = clean("OLLAMA_TMPDIR")
 
-	userLimit := clean("OLLAMA_MAX_VRAM")
-	if userLimit != "" {
-		avail, err := strconv.ParseUint(userLimit, 10, 64)
-		if err != nil {
-			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
-		} else {
-			MaxVRAM = avail
-		}
-	}
-
 	LLMLibrary = clean("OLLAMA_LLM_LIBRARY")
 
 	if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go
index d66ba9f0..8593285b 100644
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -69,7 +69,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
 	reqLimit := len(req)
 	iterLimit := 5
 
-	vram := os.Getenv("OLLAMA_MAX_VRAM")
+	vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
 	if vram != "" {
 		max, err := strconv.ParseUint(vram, 10, 64)
 		require.NoError(t, err)
@@ -106,7 +106,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
 
 // Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
 func TestMultiModelStress(t *testing.T) {
-	vram := os.Getenv("OLLAMA_MAX_VRAM")
+	vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
 	if vram == "" {
 		t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
 	}
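
Usage note (illustrative, not part of the patch): as the commit message says, users who relied on OLLAMA_MAX_VRAM can still cap GPU memory per request with the `num_gpu` option, which limits how many layers are offloaded to the GPU. Below is a minimal sketch of passing it through the Go client; it assumes the github.com/ollama/ollama/api package's ClientFromEnvironment and Generate helpers, and the model name is only a placeholder.

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	// Build a client from OLLAMA_HOST (defaults to the local server).
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.GenerateRequest{
		Model:  "llama3", // placeholder model name for illustration
		Prompt: "why is the sky blue?",
		Options: map[string]interface{}{
			// Cap GPU memory use by limiting the number of layers offloaded
			// to the GPU; remaining layers run on the CPU.
			"num_gpu": 20,
		},
	}

	// Print streamed response chunks as they arrive.
	if err := client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Print(resp.Response)
		return nil
	}); err != nil {
		log.Fatal(err)
	}
}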