diff --git a/server/sched.go b/server/sched.go
index fa034d28..1102a690 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -46,6 +46,7 @@ type Scheduler struct {
 // TODO set this to zero after a release or two, to enable multiple models by default
 var loadedMax = 1          // Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners)
 var maxQueuedRequests = 10 // TODO configurable
+var numParallel = 1
 
 func InitScheduler(ctx context.Context) *Scheduler {
 	maxRunners := os.Getenv("OLLAMA_MAX_LOADED_MODELS")
@@ -57,6 +58,14 @@ func InitScheduler(ctx context.Context) *Scheduler {
 			loadedMax = m
 		}
 	}
+	if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
+		p, err := strconv.Atoi(onp)
+		if err != nil || p <= 0 {
+			slog.Error("invalid parallel setting, must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
+		} else {
+			numParallel = p
+		}
+	}
 
 	sched := &Scheduler{
 		pendingReqCh:  make(chan *LlmRequest, maxQueuedRequests),
@@ -81,6 +90,8 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options,
 		successCh: make(chan *runnerRef),
 		errCh:     make(chan error, 1),
 	}
+	// context split across parallel threads
+	opts.NumCtx = opts.NumCtx * numParallel
 	select {
 	case s.pendingReqCh <- req:
 	default:
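
For illustration, here is a minimal standalone sketch of the two behaviors the diff adds: parsing `OLLAMA_NUM_PARALLEL` with the same validation as `InitScheduler`, and scaling the context size as `GetRunner` does. The helper name `parseNumParallel` and the sample context size of 2048 are assumptions for this example, not part of the patch.

```go
package main

import (
	"fmt"
	"log/slog"
	"os"
	"strconv"
)

// parseNumParallel mirrors the validation in the diff above: values that are
// non-numeric or not greater than zero are rejected with an error log, and
// the default is kept. (Hypothetical helper; the patch does this inline.)
func parseNumParallel(defaultVal int) int {
	onp := os.Getenv("OLLAMA_NUM_PARALLEL")
	if onp == "" {
		return defaultVal
	}
	p, err := strconv.Atoi(onp)
	if err != nil || p <= 0 {
		slog.Error("invalid parallel setting, must be greater than zero",
			"OLLAMA_NUM_PARALLEL", onp, "error", err)
		return defaultVal
	}
	return p
}

func main() {
	numParallel := parseNumParallel(1)

	// Assumed per-request context size for the example.
	numCtx := 2048

	// As in GetRunner: the context window is multiplied by numParallel so
	// that each parallel slot still gets the requested context size.
	fmt.Println("effective NumCtx:", numCtx*numParallel)
}
```

Running this with `OLLAMA_NUM_PARALLEL=4` prints `effective NumCtx: 8192`, while an invalid value such as `OLLAMA_NUM_PARALLEL=0` logs the error and falls back to the default of 1.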