bugfix: truncate completion max_tokens to fit context length by default

Andrei Betlen 2023-07-09 18:13:29 -04:00
parent 6f70cc4b7d
commit a86bfdf0a5


@@ -824,18 +824,14 @@ class Llama:
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)
 
-        if max_tokens <= 0:
-            # Unlimited, depending on n_ctx.
-            if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)):
-                raise ValueError(
-                    f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
-                )
-            else:
-                max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens)
-        elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
-            raise ValueError(
-                f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}"
-            )
+        if len(prompt_tokens) >= llama_cpp.llama_n_ctx(self.ctx):
+            raise ValueError(
+                f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
+            )
+
+        if max_tokens <= 0:
+            # Unlimited, depending on n_ctx.
+            max_tokens = llama_cpp.llama_n_ctx(self.ctx) - len(prompt_tokens)
+
         # Truncate max_tokens if requested tokens would exceed the context window
         max_tokens = (
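
For reference, a minimal standalone sketch of the resulting behavior. The hunk above is cut off before the clamping expression, so the helper below (its name resolve_max_tokens, its arguments, and the final min-based clamp) is an assumption based on the commit title and the visible comment, not the library's actual code.

# Sketch only: resolve_max_tokens and its parameters are hypothetical stand-ins
# for values the real method reads via llama_cpp.llama_n_ctx(self.ctx) and the
# tokenized prompt; the final clamp is inferred from the commit title.
def resolve_max_tokens(max_tokens: int, prompt_len: int, n_ctx: int) -> int:
    # The prompt alone must still fit inside the context window.
    if prompt_len >= n_ctx:
        raise ValueError(f"Requested tokens exceed context window of {n_ctx}")
    # max_tokens <= 0 now means "use whatever context remains" rather than erroring.
    if max_tokens <= 0:
        return n_ctx - prompt_len
    # Otherwise truncate the request so prompt + completion still fit in n_ctx.
    return min(max_tokens, n_ctx - prompt_len)

assert resolve_max_tokens(-1, prompt_len=100, n_ctx=512) == 412      # default: fill remaining context
assert resolve_max_tokens(10_000, prompt_len=100, n_ctx=512) == 412  # oversized request is truncated
assert resolve_max_tokens(16, prompt_len=100, n_ctx=512) == 16       # small request passes through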