retry on concurrent request failure (#1483)

- remove parallel
This commit is contained in:
Bruce MacDonald 2023-12-12 12:14:35 -05:00 committed by GitHub
parent 5314fc9b63
commit c0960e29b5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -412,10 +412,6 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
params := append(params, "--port", strconv.Itoa(port))
if runner.Type == "gguf" {
params = append(params, "--parallel", "2")
}
ctx, cancel := context.WithCancel(context.Background())
cmd := exec.CommandContext(
ctx,
@ -549,6 +545,8 @@ type prediction struct {
}
// maxBufferSize caps the bufio.Scanner buffer used to read the streamed
// llm response line by line (see the scanner.Buffer call below).
const maxBufferSize = 512 * format.KiloByte
// maxRetries bounds how many times Predict reattempts a request that
// failed with a retryable condition (e.g. "slot unavailable").
const maxRetries = 3
// retryDelay is how long Predict sleeps before each retry attempt.
const retryDelay = 1 * time.Second
type PredictOpts struct {
Prompt string
@ -570,6 +568,11 @@ type PredictResult struct {
EvalDuration time.Duration
}
// isRetryable reports whether a response line indicates a transient
// failure worth retrying — the runner emits "slot unavailable" when a
// concurrent request cannot currently be served.
func isRetryable(line []byte) bool {
	return bytes.Contains(line, []byte("slot unavailable"))
}
func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
imageData := llm.ImageData
if len(predict.Images) > 0 {
@ -607,6 +610,11 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
request["grammar"] = jsonGrammar
}
for retries := 0; retries < maxRetries; retries++ {
if retries > 0 {
time.Sleep(retryDelay) // wait before retrying
}
// Handling JSON marshaling with special characters unescaped.
buffer := &bytes.Buffer{}
enc := json.NewEncoder(buffer)
@ -642,6 +650,8 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
// increase the buffer size to avoid running out of space
buf := make([]byte, 0, maxBufferSize)
scanner.Buffer(buf, maxBufferSize)
retryNeeded := false
for scanner.Scan() {
select {
case <-ctx.Done():
@ -653,6 +663,11 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
continue
}
if isRetryable(line) {
retryNeeded = true
break
}
evt, ok := bytes.CutPrefix(line, []byte("data: "))
if !ok {
return fmt.Errorf("error parsing llm response stream: %s", line)
@ -698,7 +713,13 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
return fmt.Errorf("error reading llm response: %v", err)
}
return nil
if !retryNeeded {
return nil // success
}
}
// should never reach here ideally
return fmt.Errorf("max retries exceeded")
}
type TokenizeRequest struct {