allocate a large enough kv cache for all parallel requests (#4162)

This commit is contained in:
Jeffrey Morgan 2024-05-05 15:59:32 -07:00 committed by GitHub
parent 06164911dd
commit 942c979232
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -93,6 +93,9 @@ func InitScheduler(ctx context.Context) *Scheduler {
// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
// allocate a large enough kv cache for all parallel requests
opts.NumCtx = opts.NumCtx * numParallel
req := &LlmRequest{
ctx: c,
model: model,
@ -101,8 +104,7 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options,
successCh: make(chan *runnerRef),
errCh: make(chan error, 1),
}
// context split across parallel threads
opts.NumCtx = opts.NumCtx * numParallel
select {
case s.pendingReqCh <- req:
default: