Support forced spreading for multi-GPU
Our default behavior today is to try to fit the model into a single GPU if possible. Some users would prefer the old behavior of always spreading across multiple GPUs even when the model can fit into one. This change exposes that behavior as a tunable.
This commit is contained in:
parent
6fd04ca922
commit
5e8ff556cb
2 changed files with 18 additions and 4 deletions
|
@ -53,6 +53,8 @@ var (
|
||||||
NumParallel int
|
NumParallel int
|
||||||
// Set via OLLAMA_RUNNERS_DIR in the environment
|
// Set via OLLAMA_RUNNERS_DIR in the environment
|
||||||
RunnersDir string
|
RunnersDir string
|
||||||
|
// Set via OLLAMA_SCHED_SPREAD in the environment
|
||||||
|
SchedSpread bool
|
||||||
// Set via OLLAMA_TMPDIR in the environment
|
// Set via OLLAMA_TMPDIR in the environment
|
||||||
TmpDir string
|
TmpDir string
|
||||||
)
|
)
|
||||||
|
@ -79,6 +81,7 @@ func AsMap() map[string]EnvVar {
|
||||||
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default 1)"},
|
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default 1)"},
|
||||||
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
|
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
|
||||||
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
|
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
|
||||||
|
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
|
||||||
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
|
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -191,6 +194,15 @@ func LoadConfig() {
|
||||||
NoHistory = true
|
NoHistory = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
|
||||||
|
s, err := strconv.ParseBool(spread)
|
||||||
|
if err == nil {
|
||||||
|
SchedSpread = s
|
||||||
|
} else {
|
||||||
|
SchedSpread = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
|
if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
|
||||||
NoPrune = true
|
NoPrune = true
|
||||||
}
|
}
|
||||||
|
|
|
@ -558,12 +558,14 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
|
||||||
sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
|
sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
|
||||||
|
|
||||||
// First attempt to fit the model into a single GPU
|
// First attempt to fit the model into a single GPU
|
||||||
|
if !envconfig.SchedSpread {
|
||||||
for _, g := range sgl {
|
for _, g := range sgl {
|
||||||
if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
|
if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
|
||||||
slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
|
slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
|
||||||
return []gpu.GpuInfo{g}
|
return []gpu.GpuInfo{g}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TODO future refinements
|
// TODO future refinements
|
||||||
// - if multiple Libraries, see if any single GPU in any Library will fit
|
// - if multiple Libraries, see if any single GPU in any Library will fit
|
||||||
|
|
Loading…
Reference in a new issue