fix: parallel queueing race condition caused silent failure (#1445)

* fix: queued request failures - increase parallel requests to 2 to complete queued requests; queueing is managed in ollama
* log stream errors

parent 9e1406e4ed
commit bbe41ce41a

1 changed file with 29 additions and 25 deletions
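The first hunk adds llama.cpp's --parallel flag to the server arguments so a second slot is available for the request that ollama has queued behind the active one. A minimal sketch of how such an argument list could be assembled and handed to the runner (the options struct, the serverArgs helper, and the binary path are illustrative stand-ins, not the real ollama code):

package main

import (
	"fmt"
	"os/exec"
)

// options stands in for the subset of ollama's request options used here.
type options struct {
	NumCtx   int
	NumBatch int
}

// serverArgs builds the llama.cpp server flags. "--parallel", "2" reserves two
// slots: one for the active request and one so a queued request can be served
// instead of failing silently.
func serverArgs(opts options, numGPU int) []string {
	return []string{
		"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
		"--n-gpu-layers", fmt.Sprintf("%d", numGPU),
		"--parallel", "2",
		"--embedding",
	}
}

func main() {
	args := serverArgs(options{NumCtx: 2048, NumBatch: 512}, 35)
	// The binary path is a placeholder; ollama resolves its own runner binary.
	cmd := exec.Command("/path/to/llama-server", args...)
	fmt.Println(cmd.String())
}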
@@ -341,6 +341,7 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
 		"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
 		"--n-gpu-layers", fmt.Sprintf("%d", numGPU),
+		"--parallel", "2",
 		"--embedding",
 	}
 
@@ -631,7 +632,11 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
 				continue
 			}
 
-			if evt, ok := bytes.CutPrefix(line, []byte("data: ")); ok {
+			evt, ok := bytes.CutPrefix(line, []byte("data: "))
+			if !ok {
+				return fmt.Errorf("error parsing llm response stream: %s", line)
+			}
+
 			var p prediction
 			if err := json.Unmarshal(evt, &p); err != nil {
 				return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
@@ -661,7 +666,6 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
-				}
 			}
 		}
 	}
 
 	if err := scanner.Err(); err != nil {
 		if strings.Contains(err.Error(), "unexpected EOF") {
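The remaining hunks change the stream parser to fail loudly: a line that does not start with "data: " is now returned as an error instead of being skipped, so problems in the llama.cpp event stream surface to the caller. A self-contained sketch of that pattern, assuming a trimmed-down prediction struct and a fake input stream rather than the real HTTP response body:

package main

import (
	"bufio"
	"bytes"
	"encoding/json"
	"fmt"
	"strings"
)

// prediction is a trimmed stand-in for the fields ollama decodes per event.
type prediction struct {
	Content string `json:"content"`
	Stop    bool   `json:"stop"`
}

// readStream scans server-sent events and fails loudly on anything that is
// not a well-formed "data: {...}" line, instead of dropping it silently.
func readStream(sc *bufio.Scanner, fn func(prediction)) error {
	for sc.Scan() {
		line := sc.Bytes()
		if len(line) == 0 {
			continue
		}

		evt, ok := bytes.CutPrefix(line, []byte("data: "))
		if !ok {
			return fmt.Errorf("error parsing llm response stream: %s", line)
		}

		var p prediction
		if err := json.Unmarshal(evt, &p); err != nil {
			return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
		}
		fn(p)
	}
	return sc.Err()
}

func main() {
	input := "data: {\"content\":\"hello\"}\n\ndata: {\"content\":\"\",\"stop\":true}\n"
	sc := bufio.NewScanner(strings.NewReader(input))
	if err := readStream(sc, func(p prediction) { fmt.Printf("%+v\n", p) }); err != nil {
		fmt.Println("stream error:", err)
	}
}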