From 830fdd271536ee257db72c29c2be5b5629e58389 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Tue, 23 Jul 2024 15:14:28 -0700
Subject: [PATCH] Better explain multi-gpu behavior

---
 cmd/cmd.go  | 1 +
 docs/faq.md | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index b761d018..610fddcb 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1341,6 +1341,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_NUM_PARALLEL"],
 				envVars["OLLAMA_NOPRUNE"],
 				envVars["OLLAMA_ORIGINS"],
+				envVars["OLLAMA_SCHED_SPREAD"],
 				envVars["OLLAMA_TMPDIR"],
 				envVars["OLLAMA_FLASH_ATTENTION"],
 				envVars["OLLAMA_LLM_LIBRARY"],
diff --git a/docs/faq.md b/docs/faq.md
index da1848f7..16c80549 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -272,4 +272,8 @@ The following server settings may be used to adjust how Ollama handles concurren
 - `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default will auto-select either 4 or 1 based on available memory.
 - `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512
 
-Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM.
\ No newline at end of file
+Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM.
+
+## How does Ollama load models on multiple GPUs?
+
+Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transferred across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
\ No newline at end of file
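
The FAQ paragraph added above describes a simple placement rule: load the model on a single GPU when it fits, otherwise spread it across all available GPUs (the spread-always behavior is what the `OLLAMA_SCHED_SPREAD` setting surfaced in `cmd/cmd.go` is intended to force). The Go sketch below illustrates only that rule under stated assumptions; the `gpu` type, `pickGPUs`, and `forceSpread` are hypothetical names for illustration, not Ollama's actual scheduler code.

```go
package main

import "fmt"

// gpu is a stand-in for a detected device; only free VRAM matters here.
type gpu struct {
	id       int
	freeVRAM uint64 // bytes currently available
}

// pickGPUs applies the rule from the FAQ entry: prefer a single GPU that can
// hold the whole model (less traffic over the PCI bus during inference);
// otherwise, or when spreading is forced, split across every available GPU.
func pickGPUs(gpus []gpu, modelVRAM uint64, forceSpread bool) []gpu {
	if !forceSpread {
		for _, g := range gpus {
			if g.freeVRAM >= modelVRAM {
				return []gpu{g} // fits entirely on one device
			}
		}
	}
	return gpus // too big for any single GPU: spread across all of them
}

func main() {
	gpus := []gpu{{id: 0, freeVRAM: 24 << 30}, {id: 1, freeVRAM: 24 << 30}}

	// A ~16 GiB model fits on either card, so it lands on a single GPU.
	fmt.Println(len(pickGPUs(gpus, 16<<30, false)), "GPU(s)")

	// A ~40 GiB model does not fit on one card and is spread across both.
	fmt.Println(len(pickGPUs(gpus, 40<<30, false)), "GPU(s)")
}
```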