From e4ff73297db2f53f1ea4b603df5670c5bde6a944 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Mon, 8 Jul 2024 22:32:15 -0700
Subject: [PATCH] server: fix model reloads when setting `OLLAMA_NUM_PARALLEL` (#5560)

* server: fix unneeded model reloads when setting `OLLAMA_NUM_PARALLEL`

* remove whitespace change

* undo some changes
---
 server/sched.go | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/server/sched.go b/server/sched.go
index 9dff2ae0..48047bfe 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -133,10 +133,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				numParallel = 1
 				slog.Warn("multimodal models don't support parallel requests yet")
 			}
-			// Keep NumCtx and numParallel in sync
-			if numParallel > 1 {
-				pending.opts.NumCtx = pending.origNumCtx * numParallel
-			}
 
 			for {
 				cpus := s.getCpuFn()
@@ -234,9 +230,10 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						// simplifying assumption of defaultParallel when in CPU mode
 						if numParallel <= 0 {
 							numParallel = defaultParallel
-							pending.opts.NumCtx = pending.origNumCtx * numParallel
 						}
 
+						pending.opts.NumCtx = pending.origNumCtx * numParallel
+
 						if loadedCount == 0 {
 							slog.Debug("cpu mode with first model, loading")
 							s.loadFn(pending, ggml, gpus, numParallel)