Fix case for NumCtx

This commit is contained in:
Daniel Hiltgen 2024-07-01 09:43:59 -07:00
parent 3518aaef33
commit cff3f44f4a

View file

@@ -23,7 +23,7 @@ type LlmRequest struct {
ctx context.Context //nolint:containedctx ctx context.Context //nolint:containedctx
model *Model model *Model
opts api.Options opts api.Options
origNumCTX int // Track the initial ctx request origNumCtx int // Track the initial ctx request
sessionDuration time.Duration sessionDuration time.Duration
successCh chan *runnerRef successCh chan *runnerRef
errCh chan error errCh chan error
@@ -118,8 +118,8 @@ func (s *Scheduler) processPending(ctx context.Context) {
case pending := <-s.pendingReqCh: case pending := <-s.pendingReqCh:
// Block other requests until we get this pending request running // Block other requests until we get this pending request running
pending.schedAttempts++ pending.schedAttempts++
if pending.origNumCTX == 0 { if pending.origNumCtx == 0 {
pending.origNumCTX = pending.opts.NumCtx pending.origNumCtx = pending.opts.NumCtx
} }
if pending.ctx.Err() != nil { if pending.ctx.Err() != nil {
@@ -135,7 +135,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
} }
// Keep NumCtx and numParallel in sync // Keep NumCtx and numParallel in sync
if numParallel > 1 { if numParallel > 1 {
pending.opts.NumCtx = pending.origNumCTX * numParallel pending.opts.NumCtx = pending.origNumCtx * numParallel
} }
for { for {
@@ -197,7 +197,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
// simplifying assumption of defaultParallel when in CPU mode // simplifying assumption of defaultParallel when in CPU mode
if numParallel <= 0 { if numParallel <= 0 {
numParallel = defaultParallel numParallel = defaultParallel
pending.opts.NumCtx = pending.origNumCTX * numParallel pending.opts.NumCtx = pending.origNumCtx * numParallel
} }
if loadedCount == 0 { if loadedCount == 0 {
@@ -691,7 +691,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
// First attempt to fit the model into a single GPU // First attempt to fit the model into a single GPU
for _, p := range numParallelToTry { for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCTX * p req.opts.NumCtx = req.origNumCtx * p
if !envconfig.SchedSpread { if !envconfig.SchedSpread {
for _, g := range sgl { for _, g := range sgl {
if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
@@ -709,7 +709,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
// Now try all the GPUs // Now try all the GPUs
for _, p := range numParallelToTry { for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCTX * p req.opts.NumCtx = req.origNumCtx * p
if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM)) slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
*numParallel = p *numParallel = p