allocate a large enough kv cache for all parallel requests (#4162)
This commit is contained in:
parent
06164911dd
commit
942c979232
1 changed files with 4 additions and 2 deletions
|
@ -93,6 +93,9 @@ func InitScheduler(ctx context.Context) *Scheduler {
|
|||
|
||||
// context must be canceled to decrement ref count and release the runner
|
||||
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
|
||||
// allocate a large enough kv cache for all parallel requests
|
||||
opts.NumCtx = opts.NumCtx * numParallel
|
||||
|
||||
req := &LlmRequest{
|
||||
ctx: c,
|
||||
model: model,
|
||||
|
@ -101,8 +104,7 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options,
|
|||
successCh: make(chan *runnerRef),
|
||||
errCh: make(chan error, 1),
|
||||
}
|
||||
// context split across parallel threads
|
||||
opts.NumCtx = opts.NumCtx * numParallel
|
||||
|
||||
select {
|
||||
case s.pendingReqCh <- req:
|
||||
default:
|
||||
|
|
Loading…
Reference in a new issue