From ce1277549012a33e5c2360f42bf53aaf1b95e528 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 6 Feb 2024 18:50:56 -0500
Subject: [PATCH 1/3] Update llama.cpp

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index b08f22c..213d143 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit b08f22c882a1443e6b97081f3ce718a4d1a741f8
+Subproject commit 213d1439fadefe182f69c5f7e8dd3b4b6572ebcb

From dfc1b173414b550f8f5be1b94430af16b53a63cb Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 8 Feb 2024 23:38:12 -0500
Subject: [PATCH 2/3] Update llama.cpp

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 213d143..8e6a9d2 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 213d1439fadefe182f69c5f7e8dd3b4b6572ebcb
+Subproject commit 8e6a9d2de0096af7120606c74ee2f26684e87b41

From e16f06e6eb555947f4404c20732921c8ea76c4f7 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Fri, 9 Feb 2024 02:02:13 -0500
Subject: [PATCH 3/3] fix: revert _create_completions.

---
 llama_cpp/llama.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index bad75df..f445fb0 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -948,8 +948,7 @@ class Llama:
 
             if stream:
                 remaining_tokens = completion_tokens[returned_tokens:]
-                prev_tokens = completion_tokens[:returned_tokens]
-                remaining_text = self.detokenize(completion_tokens, prev_tokens)
+                remaining_text = self.detokenize(remaining_tokens)
                 remaining_length = len(remaining_text)
 
                 # We want to avoid yielding any characters from
@@ -971,13 +970,13 @@ class Llama:
                     for token in remaining_tokens:
                         if token == self.token_bos():
                             continue
-                        token_end_position += len(remaining_text)
+                        token_end_position += len(self.detokenize([token]))
                         # Check if stop sequence is in the token
                         if token_end_position > (
                             remaining_length - first_stop_position
                         ):
                             break
-                        token_str = remaining_text.decode(
+                        token_str = self.detokenize([token]).decode(
                             "utf-8", errors="ignore"
                         )
                         text_offset = len(prompt) + len(
@@ -1002,7 +1001,11 @@ class Llama:
                         }
                         top_logprob.update({token_str: current_logprobs[int(token)]})
                         logprobs_or_none = {
-                            "tokens": [token_str],
+                            "tokens": [
+                                self.detokenize([token]).decode(
+                                    "utf-8", errors="ignore"
+                                )
+                            ],
                             "text_offset": [text_offset],
                             "token_logprobs": [current_logprobs[int(token)]],
                             "top_logprobs": [top_logprob],
@@ -1015,7 +1018,9 @@ class Llama:
                             "model": model_name,
                             "choices": [
                                 {
-                                    "text": token_str,
+                                    "text": self.detokenize([token]).decode(
+                                        "utf-8", errors="ignore"
+                                    ),
                                     "index": 0,
                                     "logprobs": logprobs_or_none,
                                     "finish_reason": None,
@@ -1027,7 +1032,7 @@ class Llama:
                         decode_success = False
                         for i in range(1, len(remaining_tokens) + 1):
                             try:
-                                bs = remaining_text
+                                bs = self.detokenize(remaining_tokens[:i])
                                 ts = bs.decode("utf-8")
                                 decode_success = True
                                 break
@@ -1063,7 +1068,6 @@ class Llama:
 
             if len(completion_tokens) >= max_tokens:
                 text = self.detokenize(completion_tokens)
-                finish_reason = "length"
                 break
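
Note on PATCH 3/3 (not part of the patch itself): after this revert the streaming
completion path once again detokenizes each newly generated token on its own via
self.detokenize([token]) instead of passing the previously returned tokens as
detokenization context. A minimal sketch of exercising that path, assuming
llama-cpp-python is installed and a local GGUF model is available (the model path
below is a placeholder):

    from llama_cpp import Llama

    llm = Llama(model_path="./models/example.gguf")  # placeholder path

    # Stream a completion; each chunk's "text" is produced by detokenizing
    # the newly generated token(s) on their own, as in the reverted code.
    for chunk in llm.create_completion(
        "Q: Name the planets in the solar system. A:",
        max_tokens=32,
        stream=True,
    ):
        print(chunk["choices"][0]["text"], end="", flush=True)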