server: fix model reloads when setting OLLAMA_NUM_PARALLEL
(#5560)
* server: fix unneeded model reloads when setting `OLLAMA_NUM_PARALLEL` * remove whitespace change * undo some changes
This commit is contained in:
parent
53da2c6965
commit
e4ff73297d
1 changed file with 2 additions and 5 deletions
|
@@ -133,10 +133,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				numParallel = 1
 				slog.Warn("multimodal models don't support parallel requests yet")
 			}
-			// Keep NumCtx and numParallel in sync
-			if numParallel > 1 {
-				pending.opts.NumCtx = pending.origNumCtx * numParallel
-			}

 			for {
 				cpus := s.getCpuFn()
|
@@ -234,9 +230,10 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				// simplifying assumption of defaultParallel when in CPU mode
 				if numParallel <= 0 {
 					numParallel = defaultParallel
-					pending.opts.NumCtx = pending.origNumCtx * numParallel
 				}

+				pending.opts.NumCtx = pending.origNumCtx * numParallel
+
 				if loadedCount == 0 {
 					slog.Debug("cpu mode with first model, loading")
 					s.loadFn(pending, ggml, gpus, numParallel)
|
Loading…
Reference in a new issue