fix: parallel queueing race condition caused silent failure (#1445)
* fix: queued request failures - increase parallel requests to 2 so a queued request can complete; queueing is managed in ollama
* log stream errors
parent 9e1406e4ed
commit bbe41ce41a

1 changed file with 29 additions and 25 deletions
llm/llama.go | 54
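The failure this addresses shows up when two generate requests arrive back to back: ollama queues the second request, but because the llama.cpp server was started with only a single slot, the queued request could fail silently once it was dispatched. The sketch below (not part of the commit) shows one way to exercise that situation against a local server; it assumes the default ollama endpoint http://localhost:11434/api/generate, and the model name and prompt are placeholders.

package main

import (
    "bytes"
    "fmt"
    "net/http"
    "sync"
)

func main() {
    // Fire two generate requests at once: ollama serves one immediately and
    // queues the other. With --parallel 2 on the llama.cpp server, the queued
    // request has a slot available when ollama dispatches it.
    var wg sync.WaitGroup
    for i := 0; i < 2; i++ {
        wg.Add(1)
        go func(i int) {
            defer wg.Done()
            body := []byte(`{"model": "llama2", "prompt": "why is the sky blue?"}`)
            resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
            if err != nil {
                fmt.Println("request", i, "error:", err)
                return
            }
            defer resp.Body.Close()
            fmt.Println("request", i, "status:", resp.Status)
        }(i)
    }
    wg.Wait()
}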
@@ -341,6 +341,7 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
         "--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
         "--batch-size", fmt.Sprintf("%d", opts.NumBatch),
         "--n-gpu-layers", fmt.Sprintf("%d", numGPU),
+        "--parallel", "2",
         "--embedding",
     }

@@ -631,34 +632,37 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
                 continue
             }

-            if evt, ok := bytes.CutPrefix(line, []byte("data: ")); ok {
-                var p prediction
-                if err := json.Unmarshal(evt, &p); err != nil {
-                    return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
-                }
+            evt, ok := bytes.CutPrefix(line, []byte("data: "))
+            if !ok {
+                return fmt.Errorf("error parsing llm response stream: %s", line)
+            }

-                if p.Content != "" {
-                    fn(PredictResult{
-                        Model:     predict.Model,
-                        CreatedAt: time.Now().UTC(),
-                        Content:   p.Content,
-                    })
-                }
+            var p prediction
+            if err := json.Unmarshal(evt, &p); err != nil {
+                return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
+            }

-                if p.Stop {
-                    fn(PredictResult{
-                        Model:         predict.Model,
-                        CreatedAt:     time.Now().UTC(),
-                        TotalDuration: time.Since(predict.CheckpointStart),
+            if p.Content != "" {
+                fn(PredictResult{
+                    Model:     predict.Model,
+                    CreatedAt: time.Now().UTC(),
+                    Content:   p.Content,
+                })
+            }

-                        Done:               true,
-                        PromptEvalCount:    p.Timings.PromptN,
-                        PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
-                        EvalCount:          p.Timings.PredictedN,
-                        EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
-                    })
-                    return nil
-                }
+            if p.Stop {
+                fn(PredictResult{
+                    Model:         predict.Model,
+                    CreatedAt:     time.Now().UTC(),
+                    TotalDuration: time.Since(predict.CheckpointStart),
+
+                    Done:               true,
+                    PromptEvalCount:    p.Timings.PromptN,
+                    PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
+                    EvalCount:          p.Timings.PredictedN,
+                    EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
+                })
+                return nil
             }
         }
     }
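The second hunk is what surfaces stream errors: a line that does not carry the expected "data: " prefix now returns an error instead of being skipped silently, so a failure on a queued request is reported rather than lost. Below is a small, self-contained sketch (not from the repository) of that parsing pattern; the parseLine helper and the trimmed-down prediction struct are illustrative stand-ins for the real code in llm/llama.go.

package main

import (
    "bytes"
    "encoding/json"
    "fmt"
)

// prediction mirrors a small subset of the fields decoded from the llama.cpp stream.
type prediction struct {
    Content string `json:"content"`
    Stop    bool   `json:"stop"`
}

// parseLine fails loudly on lines that are not "data: " events instead of
// silently ignoring them, matching the behavior introduced by this commit.
func parseLine(line []byte) (*prediction, error) {
    evt, ok := bytes.CutPrefix(line, []byte("data: "))
    if !ok {
        return nil, fmt.Errorf("error parsing llm response stream: %s", line)
    }

    var p prediction
    if err := json.Unmarshal(evt, &p); err != nil {
        return nil, fmt.Errorf("error unmarshaling llm prediction response: %v", err)
    }
    return &p, nil
}

func main() {
    lines := [][]byte{
        []byte(`data: {"content":"hello","stop":false}`),
        []byte(`error: slot unavailable`), // previously skipped, now surfaced
    }
    for _, line := range lines {
        p, err := parseLine(line)
        if err != nil {
            fmt.Println(err)
            continue
        }
        fmt.Printf("content=%q stop=%v\n", p.Content, p.Stop)
    }
}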