Add support for stream parameter. Closes #1

2023-03-28 04:03:57 -04:00 · 2023-03-28 04:03:57 -04:00 · 3dbb3fd3f6
commit 3dbb3fd3f6
parent 30fc0f3866
2 changed files with 129 additions and 33 deletions
--- a/examples/high_level_api_streaming.py
+++ b/examples/high_level_api_streaming.py
@ -0,0 +1,20 @@
+import json
+import argparse
+
+from llama_cpp import Llama
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-m", "--model", type=str, default=".//models/...")  # NOTE(review): default looks like a placeholder path; pass -m explicitly
+args = parser.parse_args()
+
+llm = Llama(model_path=args.model)
+
+stream = llm(
+    "Question: What are the names of the planets in the solar system? Answer: ",
+    max_tokens=48,
+    stop=["Q:", "\n"],
+    stream=True,  # with stream=True the call returns an iterator of chunks instead of one response
+)
+
+for output in stream:  # each chunk is a completion dict carrying the newly generated text
+    print(json.dumps(output, indent=2))
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@ -88,7 +88,7 @@ class Llama:
            True,
        )
        if n_tokens < 0:
-            raise RuntimeError(f"Failed to tokenize: text=\"{text}\" n_tokens={n_tokens}")
+            raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}')
        return list(tokens[:n_tokens])

    def detokenize(self, tokens: List[int]) -> bytes:
@ -105,7 +105,6 @@ class Llama:
            output += llama_cpp.llama_token_to_str(self.ctx, token)
        return output

-
    def _eval(self, tokens: List[int], n_past):
        rc = llama_cpp.llama_eval(
            self.ctx,
@ -137,12 +136,12 @@ class Llama:
                top_p=top_p,
                top_k=top_k,
                temp=temp,
-                repeat_penalty=repeat_penalty
+                repeat_penalty=repeat_penalty,
            )
            yield token
            self._eval([token], len(past_tokens) + i)

-    def __call__(
+    def _call(
        self,
        prompt: str,
        suffix: Optional[str] = None,
@ -154,34 +153,11 @@ class Llama:
        stop: List[str] = [],
        repeat_penalty: float = 1.1,
        top_k: int = 40,
+        stream: bool = False,
    ):
-        """Generate text from a prompt.
-
-        Args:
-            prompt: The prompt to generate text from.
-            suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
-            temperature: The temperature to use for sampling.
-            top_p: The top-p value to use for sampling.
-            logprobs: The number of logprobs to return. If None, no logprobs are returned.
-            echo: Whether to echo the prompt.
-            stop: A list of strings to stop generation when encountered.
-            repeat_penalty: The penalty to apply to repeated tokens.
-            top_k: The top-k value to use for sampling.
-
-        Raises:
-            ValueError: If the requested tokens exceed the context window.
-            RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
-
-        Returns:
-            Response object containing the generated text.
-        """
        completion_id = f"cmpl-{str(uuid.uuid4())}"
        created = int(time.time())
-        text = b""
        completion_tokens = []
-        last_n_tokens = deque([0] * self.last_n, maxlen=self.last_n)
-
        prompt_tokens = self.tokenize(prompt.encode("utf-8"))

        if len(prompt_tokens) + max_tokens > llama_cpp.llama_n_ctx(self.ctx):
@ -198,14 +174,15 @@ class Llama:
            stop = [s.encode("utf-8") for s in stop]

        finish_reason = None
-        for token in self._generate(prompt_tokens, max_tokens, top_p, top_k, temperature, repeat_penalty):
+        for token in self._generate(
+            prompt_tokens, max_tokens, top_p, top_k, temperature, repeat_penalty
+        ):
            if token == llama_cpp.llama_token_eos():
                finish_reason = "stop"
                break
-            text += self.detokenize([token])
-            last_n_tokens.append(token)
            completion_tokens.append(token)

+            text = self.detokenize(completion_tokens)
            any_stop = [s for s in stop if s in text]
            if len(any_stop) > 0:
                first_stop = any_stop[0]
@ -213,9 +190,55 @@ class Llama:
                finish_reason = "stop"
                break

+            if stream:
+                start = len(self.detokenize(completion_tokens[:-1]))
+                longest = 0
+                for s in stop:
+                    for i in range(len(s), 0, -1):
+                        if s[-i:] == text[-i:]:
+                            if i > longest:
+                                longest = i
+                            break
+                yield {
+                    "id": completion_id,
+                    "object": "text_completion",
+                    "created": created,
+                    "model": self.model_path,
+                    "choices": [
+                        {
+                            "text": text[start : len(text) - longest].decode("utf-8"),
+                            "index": 0,
+                            "logprobs": None,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+
        if finish_reason is None:
            finish_reason = "length"

+        if stream:
+            if finish_reason == "stop":
+                start = len(self.detokenize(completion_tokens[:-1]))
+                text = text[start:].decode("utf-8")
+            else:
+                text = ""
+            yield {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": created,
+                "model": self.model_path,
+                "choices": [
+                    {
+                        "text": text,
+                        "index": 0,
+                        "logprobs": None,
+                        "finish_reason": finish_reason,
+                    }
+                ],
+            }
+            return
+
        text = text.decode("utf-8")

        if echo:
@ -229,7 +252,7 @@ class Llama:
                self.ctx,
            )[:logprobs]

-        return {
+        yield {
            "id": completion_id,
            "object": "text_completion",
            "created": created,
@ -249,5 +272,58 @@ class Llama:
            },
        }

+    def __call__(
+        self,
+        prompt: str,
+        suffix: Optional[str] = None,
+        max_tokens: int = 16,
+        temperature: float = 0.8,
+        top_p: float = 0.95,
+        logprobs: Optional[int] = None,
+        echo: bool = False,
+        stop: List[str] = [],
+        repeat_penalty: float = 1.1,
+        top_k: int = 40,
+        stream: bool = False,
+    ):
+        """Generate text from a prompt.
+
+        Args:
+            prompt: The prompt to generate text from.
+            suffix: A suffix to append to the generated text. If None, no suffix is appended.
+            max_tokens: The maximum number of tokens to generate.
+            temperature: The temperature to use for sampling.
+            top_p: The top-p value to use for sampling.
+            logprobs: The number of logprobs to return. If None, no logprobs are returned.
+            echo: Whether to echo the prompt.
+            stop: A list of strings to stop generation when encountered.
+            repeat_penalty: The penalty to apply to repeated tokens.
+            top_k: The top-k value to use for sampling.
+            stream: If True, return a generator of partial response chunks instead of a single response.
+
+        Raises:
+            ValueError: If the requested tokens exceed the context window.
+            RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
+
+        Returns:
+            Response object containing the generated text, or a generator of response chunks if stream is True.
+        """
+        call = self._call(
+            prompt=prompt,
+            suffix=suffix,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            logprobs=logprobs,
+            echo=echo,
+            stop=stop,
+            repeat_penalty=repeat_penalty,
+            top_k=top_k,
+            stream=stream,
+        )
+        if stream:
+            return call  # _call is a generator, so nothing runs (and no error is raised) until the caller iterates
+        return next(call)  # non-streaming: take the first item _call yields, which is the complete response
+
    def __del__(self):
        llama_cpp.llama_free(self.ctx)