Merge branch 'main' of github.com:abetlen/llama_cpp_python into main
commit 4abb8c9386
2 changed files with 13 additions and 9 deletions
llama_cpp/llama.py

@@ -950,8 +950,7 @@ class Llama:
             if stream:
                 remaining_tokens = completion_tokens[returned_tokens:]
-                prev_tokens = completion_tokens[:returned_tokens]
-                remaining_text = self.detokenize(completion_tokens, prev_tokens)
+                remaining_text = self.detokenize(remaining_tokens)
                 remaining_length = len(remaining_text)

                 # We want to avoid yielding any characters from
                 # the generated text if they are part of a stop
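Note: this hunk reverts streaming to detokenizing only the not-yet-yielded slice of `completion_tokens`, dropping the `prev_tokens` context argument. A minimal sketch of that bookkeeping, using toy byte-string tokens and a stand-in detokenizer rather than the real llama.cpp vocabulary:

    completion_tokens = [b"Hel", b"lo", b" wor", b"ld"]  # toy tokens
    returned_tokens = 2  # tokens already yielded to the client

    def detokenize(tokens):
        # Stand-in for Llama.detokenize(), which returns bytes.
        return b"".join(tokens)

    remaining_tokens = completion_tokens[returned_tokens:]
    remaining_text = detokenize(remaining_tokens)
    print(remaining_text, len(remaining_text))  # b' world' 6 -- only the unsent suffix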
@@ -973,13 +972,13 @@ class Llama:
                     for token in remaining_tokens:
                         if token == self.token_bos():
                             continue
-                        token_end_position += len(remaining_text)
+                        token_end_position += len(self.detokenize([token]))
                         # Check if stop sequence is in the token
                         if token_end_position > (
                             remaining_length - first_stop_position
                         ):
                             break
-                        token_str = remaining_text.decode(
+                        token_str = self.detokenize([token]).decode(
                             "utf-8", errors="ignore"
                         )
                         text_offset = len(prompt) + len(
@@ -1004,7 +1003,11 @@ class Llama:
                         }
                         top_logprob.update({token_str: current_logprobs[int(token)]})
                         logprobs_or_none = {
-                            "tokens": [token_str],
+                            "tokens": [
+                                self.detokenize([token]).decode(
+                                    "utf-8", errors="ignore"
+                                )
+                            ],
                             "text_offset": [text_offset],
                             "token_logprobs": [current_logprobs[int(token)]],
                             "top_logprobs": [top_logprob],
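Note: with this hunk the "tokens" entry is decoded from the token itself instead of reusing `token_str`. For orientation, a sketch of the resulting OpenAI-style logprobs record, with illustrative values rather than output from a real run:

    logprobs_or_none = {
        "tokens": ["Hello"],        # decoded text of the token being yielded
        "text_offset": [12],        # offset within prompt + generated text
        "token_logprobs": [-0.25],  # log-probability of the sampled token
        "top_logprobs": [{"Hello": -0.25, "Hi": -1.9}],  # top alternatives
    }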
@@ -1017,7 +1020,9 @@ class Llama:
                             "model": model_name,
                             "choices": [
                                 {
-                                    "text": token_str,
+                                    "text": self.detokenize([token]).decode(
+                                        "utf-8", errors="ignore"
+                                    ),
                                     "index": 0,
                                     "logprobs": logprobs_or_none,
                                     "finish_reason": None,
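Note: the surrounding code yields one OpenAI-compatible completion chunk per decoded token. A sketch of the whole object, assuming the fields visible in this hunk plus the usual completion-chunk envelope; `id`, `created`, and all values are illustrative:

    model_name = "llama-2-7b"  # illustrative
    logprobs_or_none = None    # or a record like the one sketched above
    chunk = {
        "id": "cmpl-xyz",             # illustrative completion id
        "object": "text_completion",
        "created": 1700000000,        # illustrative unix timestamp
        "model": model_name,
        "choices": [
            {
                "text": "Hello",        # per-token detokenized text
                "index": 0,
                "logprobs": logprobs_or_none,
                "finish_reason": None,  # non-null only on the final chunk
            }
        ],
    }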
@@ -1029,7 +1034,7 @@ class Llama:
                         decode_success = False
                         for i in range(1, len(remaining_tokens) + 1):
                             try:
-                                bs = remaining_text
+                                bs = self.detokenize(remaining_tokens[:i])
                                 ts = bs.decode("utf-8")
                                 decode_success = True
                                 break
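Note: `bs.decode("utf-8")` raises `UnicodeDecodeError` when a multi-byte character is split across token boundaries, so this loop grows the byte prefix one token at a time until it decodes cleanly. A self-contained sketch with toy byte tokens (the real code gets its bytes from `self.detokenize`):

    # "你" is the three UTF-8 bytes e4 bd a0; here it arrives split across tokens.
    remaining_tokens = [b"\xe4", b"\xbd", b"\xa0", b"!"]

    def detokenize(tokens):
        return b"".join(tokens)  # stand-in for Llama.detokenize

    decode_success = False
    for i in range(1, len(remaining_tokens) + 1):
        try:
            bs = detokenize(remaining_tokens[:i])
            ts = bs.decode("utf-8")  # fails for i < 3: incomplete character
            decode_success = True
            break
        except UnicodeError:
            pass  # wait for more tokens before emitting text

    if decode_success:
        print(ts)  # 你 -- decodes once all three bytes of the character are present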
@@ -1065,7 +1070,6 @@ class Llama:

             if len(completion_tokens) >= max_tokens:
                 text = self.detokenize(completion_tokens)
-
                 finish_reason = "length"
                 break

vendor/llama.cpp (vendored submodule)

@@ -1 +1 @@
-Subproject commit b08f22c882a1443e6b97081f3ce718a4d1a741f8
+Subproject commit 8e6a9d2de0096af7120606c74ee2f26684e87b41