server: fix model reloads when setting OLLAMA_NUM_PARALLEL (#5560)

* server: fix unneeded model reloads when setting `OLLAMA_NUM_PARALLEL`

* remove whitespace change

* undo some changes
This commit is contained in:
Jeffrey Morgan 2024-07-08 22:32:15 -07:00 committed by GitHub
parent 53da2c6965
commit e4ff73297d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -133,10 +133,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
numParallel = 1
slog.Warn("multimodal models don't support parallel requests yet")
}
// Keep NumCtx and numParallel in sync
if numParallel > 1 {
pending.opts.NumCtx = pending.origNumCtx * numParallel
}
for {
	cpus := s.getCpuFn()
@@ -234,9 +230,10 @@ func (s *Scheduler) processPending(ctx context.Context) {
// simplifying assumption of defaultParallel when in CPU mode
if numParallel <= 0 {
	numParallel = defaultParallel
pending.opts.NumCtx = pending.origNumCtx * numParallel
}
pending.opts.NumCtx = pending.origNumCtx * numParallel
if loadedCount == 0 {
	slog.Debug("cpu mode with first model, loading")
	s.loadFn(pending, ggml, gpus, numParallel)