diff --git a/llm/server.go b/llm/server.go index 206f9e39..54fad92c 100644 --- a/llm/server.go +++ b/llm/server.go @@ -699,10 +699,9 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu } defer s.sem.Release(1) - // only allow maximum 10 "context shifts" to avoid infinite generation + // put an upper limit on num_predict to avoid the model running on forever if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx { req.Options.NumPredict = 10 * s.options.NumCtx - slog.Debug("setting token limit to 10x num_ctx", "num_ctx", s.options.NumCtx, "num_predict", req.Options.NumPredict) } request := map[string]any{