From 5e8ff556cb23d80e41cf5c018775b10a431e31ba Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Wed, 8 May 2024 14:32:42 -0700
Subject: [PATCH] Support forced spreading for multi GPU

Our default behavior today is to try to fit the model into a single GPU
if possible. Some users would prefer the old behavior of always
spreading across multiple GPUs, even if the model can fit into one.
This change exposes that behavior as a tunable via the
OLLAMA_SCHED_SPREAD environment variable.
---
 envconfig/config.go | 12 ++++++++++++
 server/sched.go     | 10 ++++++----
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/envconfig/config.go b/envconfig/config.go
index 3a682c42..4d2150b7 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -53,6 +53,8 @@ var (
 	NumParallel int
 	// Set via OLLAMA_RUNNERS_DIR in the environment
 	RunnersDir string
+	// Set via OLLAMA_SCHED_SPREAD in the environment
+	SchedSpread bool
 	// Set via OLLAMA_TMPDIR in the environment
 	TmpDir string
 )
@@ -79,6 +81,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default 1)"},
 		"OLLAMA_ORIGINS":      {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
 		"OLLAMA_RUNNERS_DIR":  {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
+		"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
 		"OLLAMA_TMPDIR":       {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
 	}
 }
@@ -191,6 +194,15 @@ func LoadConfig() {
 		NoHistory = true
 	}
 
+	if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
+		s, err := strconv.ParseBool(spread)
+		if err == nil {
+			SchedSpread = s
+		} else {
+			SchedSpread = true
+		}
+	}
+
 	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
 		NoPrune = true
 	}
diff --git a/server/sched.go b/server/sched.go
index c36486f7..6bf5eb1f 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -558,10 +558,12 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
 	sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
 
 	// First attempt to fit the model into a single GPU
-	for _, g := range sgl {
-		if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
-			slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
-			return []gpu.GpuInfo{g}
+	if !envconfig.SchedSpread {
+		for _, g := range sgl {
+			if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+				slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
+				return []gpu.GpuInfo{g}
+			}
 		}
 	}
 
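
Note on semantics: the LoadConfig branch above is deliberately lenient.
An unset or empty OLLAMA_SCHED_SPREAD leaves spreading off, values that
strconv.ParseBool understands ("1", "true", "0", "false", ...) are taken
at face value, and any other non-empty value enables spreading. Below is
a minimal standalone sketch of that mapping; the helper name
schedSpreadFromEnv is hypothetical and not part of the patch:

	package main

	import (
		"fmt"
		"strconv"
	)

	// schedSpreadFromEnv mirrors the OLLAMA_SCHED_SPREAD parsing in
	// LoadConfig: empty means "off", parsable booleans are used as-is,
	// and anything else turns spreading on.
	func schedSpreadFromEnv(val string) bool {
		if val == "" {
			return false // unset: keep the single-GPU-first default
		}
		if s, err := strconv.ParseBool(val); err == nil {
			return s
		}
		return true // unparsable non-empty values enable spreading
	}

	func main() {
		for _, v := range []string{"", "1", "true", "0", "false", "yes"} {
			fmt.Printf("OLLAMA_SCHED_SPREAD=%q -> SchedSpread=%v\n", v, schedSpreadFromEnv(v))
		}
	}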
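
Usage: with this patch applied, spreading can be forced at server start,
e.g. OLLAMA_SCHED_SPREAD=1 ollama serve. Without the variable set,
pickBestFitGPUs still prefers a single GPU whenever PredictServerFit
reports that the model fits in one.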