From 9929751cc8b415e7b83d5151742dad734e8b5efc Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Wed, 19 Jun 2024 13:35:38 -0700
Subject: [PATCH] Disable concurrency for AMD + Windows

Until ROCm v6.2 ships, we won't be able to get accurate free memory
reporting on Windows, which makes automatic concurrency too risky.
Users can still opt in, but they will need to pay attention to model
sizes; otherwise they may thrash/page VRAM or cause OOM crashes.

All other platforms and GPUs have accurate VRAM reporting wired up
now, so we can turn on concurrency by default.
---
 envconfig/config.go |  8 ++++----
 gpu/amd_windows.go  |  5 +++--
 gpu/types.go        |  5 +++++
 server/sched.go     | 36 ++++++++++++++++++++++++++++++++----
 4 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/envconfig/config.go b/envconfig/config.go
index cb456448..0f0f7f05 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -85,13 +85,13 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
-		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU (default 4)"},
+		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU (default auto)"},
 		"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
 		"OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"},
 		"OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
 		"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
 		"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
-		"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
+		"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default auto)"},
 		"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
 		"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
 		"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
@@ -129,8 +129,8 @@ func clean(key string) string {
 
 func init() {
 	// default values
-	NumParallel = 0
-	MaxRunners = 4
+	NumParallel = 0 // Autoselect
+	MaxRunners = 0  // Autoselect
 	MaxQueuedRequests = 512
 
 	LoadConfig()
diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go
index 21585277..8b6fabeb 100644
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@@ -115,8 +115,6 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			continue
 		}
 
-		// TODO revisit this once ROCm v6 is available on windows.
-		// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
 		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
 		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
 		gpuInfo := RocmGPUInfo{
@@ -126,6 +124,9 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 					TotalMemory: totalMemory,
 					FreeMemory: freeMemory,
 				},
+				// Free memory reporting on Windows is not reliable until we bump to ROCm v6.2
+				UnreliableFreeMemory: true,
+
 				ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
 				DependencyPath: libDir,
 				MinimumMemory: rocmMinimumMemory,
diff --git a/gpu/types.go b/gpu/types.go
index 9920db5f..2eaa9bae 100644
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -29,6 +29,11 @@ type GpuInfo struct {
 	// Extra environment variables specific to the GPU as list of [key,value]
 	EnvWorkarounds [][2]string `json:"envs,omitempty"`
 
+	// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
+	// that FreeMemory is best effort, and may over- or under-report actual memory usage.
+	// False indicates FreeMemory can generally be trusted on this GPU.
+	UnreliableFreeMemory bool
+
 	// GPU information
 	ID string `json:"gpu_id"` // string to use for selection of this specific GPU
 	Name string `json:"name"` // user friendly name if available
diff --git a/server/sched.go b/server/sched.go
index 31ef560f..de8c9d28 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -46,6 +46,16 @@ type Scheduler struct {
 	reschedDelay time.Duration
 }
 
+// Default automatic value for number of models we allow per GPU
+// Model will still need to fit in VRAM, but loading many small models
+// on a large GPU can cause stalling
+var defaultModelsPerGPU = 3
+
+// Default automatic value for parallel setting
+// Model will still need to fit in VRAM. If this setting won't fit,
+// we'll back off down to 1 to try to get it to fit
+var defaultParallel = 4
+
 var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
 
 func InitScheduler(ctx context.Context) *Scheduler {
@@ -100,7 +110,6 @@ func (s *Scheduler) Run(ctx context.Context) {
 }
 
 func (s *Scheduler) processPending(ctx context.Context) {
-	maxRunnerFactor := 1 // number of GPUs or 1
 	for {
 		select {
 		case <-ctx.Done():
@@ -143,7 +152,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						pending.useLoadedRunner(runner, s.finishedReqCh)
 						break
 					}
-				} else if envconfig.MaxRunners > 0 && loadedCount >= (maxRunnerFactor*envconfig.MaxRunners) {
+				} else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
 					slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 					runnerToExpire = s.findRunnerToUnload()
 				} else {
@@ -155,7 +164,26 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					} else {
 						gpus = s.getGpuFn()
 					}
-					maxRunnerFactor = max(len(gpus), 1)
+
+					if envconfig.MaxRunners <= 0 {
+						// No user-specified MaxRunners, so figure out what automatic setting to use.
+						// If all GPUs have reliable free memory reporting, use defaultModelsPerGPU * the number of GPUs;
+						// if any GPU has unreliable free memory reporting, use 1x the number of GPUs.
+						allReliable := true
+						for _, gpu := range gpus {
+							if gpu.UnreliableFreeMemory {
+								allReliable = false
+								break
+							}
+						}
+						if allReliable {
+							envconfig.MaxRunners = defaultModelsPerGPU * len(gpus)
+							slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners, "gpu_count", len(gpus))
+						} else {
+							slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
+							envconfig.MaxRunners = len(gpus)
+						}
+					}
 
 					// Load model for fitting
 					ggml, err := llm.LoadModel(pending.model.ModelPath)
@@ -647,7 +675,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 	var numParallelToTry []int
 	if *numParallel <= 0 {
 		// If no specific parallel setting was provided, try larger then smaller, always end with 1
-		numParallelToTry = append(numParallelToTry, 4, 1)
+		numParallelToTry = append(numParallelToTry, defaultParallel, 1)
 	} else {
 		numParallelToTry = []int{*numParallel}
 	}
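
Note (not part of the diff): the sketch below mirrors the automatic concurrency
selection this patch adds to Scheduler.processPending. It is a minimal,
standalone illustration; the gpuInfo struct is a stand-in for gpu.GpuInfo and
the helper name pickDefaultMaxRunners is hypothetical.

package main

import "fmt"

// gpuInfo is a minimal stand-in for gpu.GpuInfo with only the field this sketch needs.
type gpuInfo struct {
	ID                   string
	UnreliableFreeMemory bool
}

// defaultModelsPerGPU mirrors the new default in server/sched.go.
const defaultModelsPerGPU = 3

// pickDefaultMaxRunners mirrors the logic used when OLLAMA_MAX_LOADED_MODELS is
// unset (<= 0): scale with GPU count only if every GPU reports free memory
// reliably; otherwise fall back to one model per GPU.
func pickDefaultMaxRunners(gpus []gpuInfo) int {
	for _, g := range gpus {
		if g.UnreliableFreeMemory {
			// e.g. AMD on Windows before ROCm v6.2
			return len(gpus)
		}
	}
	return defaultModelsPerGPU * len(gpus)
}

func main() {
	reliable := []gpuInfo{{ID: "0"}, {ID: "1"}}
	mixed := []gpuInfo{{ID: "0"}, {ID: "1", UnreliableFreeMemory: true}}
	fmt.Println(pickDefaultMaxRunners(reliable)) // 6: concurrency on by default
	fmt.Println(pickDefaultMaxRunners(mixed))    // 2: effectively one model per GPU
}

Users who still want concurrency on AMD + Windows can opt in by setting
OLLAMA_MAX_LOADED_MODELS (and optionally OLLAMA_NUM_PARALLEL) to an explicit
value greater than zero, which bypasses the automatic selection above.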