From 942c97923288770d97ca99c2867e964992843d14 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Sun, 5 May 2024 15:59:32 -0700
Subject: [PATCH] allocate a large enough kv cache for all parallel requests (#4162)

---
 server/sched.go | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/server/sched.go b/server/sched.go
index d40a45ad..f3d5c276 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -93,6 +93,9 @@ func InitScheduler(ctx context.Context) *Scheduler {
 
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
+	// allocate a large enough kv cache for all parallel requests
+	opts.NumCtx = opts.NumCtx * numParallel
+
 	req := &LlmRequest{
 		ctx:             c,
 		model:           model,
@@ -101,8 +104,7 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options,
 		successCh:       make(chan *runnerRef),
 		errCh:           make(chan error, 1),
 	}
-	// context split across parallel threads
-	opts.NumCtx = opts.NumCtx * numParallel
+
 	select {
 	case s.pendingReqCh <- req:
 	default: