diff --git a/envconfig/config.go b/envconfig/config.go index 0f0f7f05..c02c4878 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -85,13 +85,13 @@ func AsMap() map[string]EnvVar { "OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"}, "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"}, "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"}, - "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU (default auto)"}, + "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"}, "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"}, "OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"}, "OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"}, "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"}, "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"}, - "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default auto)"}, + "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"}, "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"}, "OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"}, diff --git a/server/sched.go b/server/sched.go index 87da1db4..71b535ae 100644 --- a/server/sched.go +++ b/server/sched.go @@ -23,7 +23,7 @@ type LlmRequest struct { ctx context.Context //nolint:containedctx model *Model opts api.Options - origNumCTX int // Track the initial ctx request + origNumCtx int // Track the initial ctx request sessionDuration time.Duration successCh chan *runnerRef errCh chan error @@ -118,8 +118,8 @@ func (s *Scheduler) processPending(ctx context.Context) { case pending := <-s.pendingReqCh: // Block other requests until we get this pending request running pending.schedAttempts++ - if pending.origNumCTX == 0 { - pending.origNumCTX = pending.opts.NumCtx + if pending.origNumCtx == 0 { + pending.origNumCtx = pending.opts.NumCtx } if pending.ctx.Err() != nil { @@ -135,7 +135,7 @@ func (s *Scheduler) processPending(ctx context.Context) { } // Keep NumCtx and numParallel in sync if numParallel > 1 { - pending.opts.NumCtx = pending.origNumCTX * numParallel + pending.opts.NumCtx = pending.origNumCtx * numParallel } for { @@ -197,7 +197,7 @@ func (s *Scheduler) processPending(ctx context.Context) { // simplifying assumption of defaultParallel when in CPU mode if numParallel <= 0 { numParallel = defaultParallel - pending.opts.NumCtx = pending.origNumCTX * numParallel + pending.opts.NumCtx = pending.origNumCtx * numParallel } if loadedCount == 0 { @@ -691,7 +691,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP // First attempt to fit the model into a single GPU for _, p := range numParallelToTry { - req.opts.NumCtx = req.origNumCTX * p + req.opts.NumCtx = req.origNumCtx * p if !envconfig.SchedSpread { for _, g := range sgl { if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { @@ -709,7 +709,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP // Now try all the GPUs for _, p := range numParallelToTry { - req.opts.NumCtx = req.origNumCTX * p + req.opts.NumCtx = req.origNumCtx * p if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM)) *numParallel = p