diff --git a/server/sched.go b/server/sched.go index d40a45ad..f3d5c276 100644 --- a/server/sched.go +++ b/server/sched.go @@ -93,6 +93,9 @@ func InitScheduler(ctx context.Context) *Scheduler { // context must be canceled to decrement ref count and release the runner func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) { + // allocate a large enough kv cache for all parallel requests + opts.NumCtx = opts.NumCtx * numParallel + req := &LlmRequest{ ctx: c, model: model, @@ -101,8 +104,7 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, successCh: make(chan *runnerRef), errCh: make(chan error, 1), } - // context split across parallel threads - opts.NumCtx = opts.NumCtx * numParallel + select { case s.pendingReqCh <- req: default: