allocate a large enough kv cache for all parallel requests (#4162)
This commit is contained in:
parent
06164911dd
commit
942c979232
1 changed file with 4 additions and 2 deletions
|
@@ -93,6 +93,9 @@ func InitScheduler(ctx context.Context) *Scheduler {
|
||||||
|
|
||||||
// context must be canceled to decrement ref count and release the runner
|
// context must be canceled to decrement ref count and release the runner
|
||||||
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
|
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
|
||||||
|
// allocate a large enough kv cache for all parallel requests
|
||||||
|
opts.NumCtx = opts.NumCtx * numParallel
|
||||||
|
|
||||||
req := &LlmRequest{
|
req := &LlmRequest{
|
||||||
ctx: c,
|
ctx: c,
|
||||||
model: model,
|
model: model,
|
||||||
|
@@ -101,8 +104,7 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options,
|
||||||
successCh: make(chan *runnerRef),
|
successCh: make(chan *runnerRef),
|
||||||
errCh: make(chan error, 1),
|
errCh: make(chan error, 1),
|
||||||
}
|
}
|
||||||
// context split across parallel threads
|
|
||||||
opts.NumCtx = opts.NumCtx * numParallel
|
|
||||||
select {
|
select {
|
||||||
case s.pendingReqCh <- req:
|
case s.pendingReqCh <- req:
|
||||||
default:
|
default:
|
||||||
|
|
Loading…
Reference in a new issue