Don't detect off tokens, detect off detokenized UTF-8

2023-04-28 13:16:18 +02:00 · 2023-04-28 13:16:18 +02:00 · eed61289b6
commit eed61289b6
parent 3a98747026
1 changed files with 11 additions and 7 deletions
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -459,12 +459,6 @@ class Llama:
                finish_reason = "stop"
                break

-            # Contains multi-byte UTF8
-            for num,pattern in [(2, 192), (3, 224), (4, 240)]:
-                # Bitwise AND check
-                if (pattern & token == pattern):
-                    multibyte_fix = num - 1
-
            if self.cache and len(completion_tokens) == 0:
                if prompt_tokens not in self.cache:
                    if self.verbose:
@@ -473,12 +467,22 @@ class Llama:

            completion_tokens.append(token)

+            all_text = self.detokenize(completion_tokens)
+
+            # Contains multi-byte UTF8
+            for k,char in enumerate(all_text[-3:]):
+                k = 3 - k
+                char = int.from_bytes(char, "big")
+                for num,pattern in [(2, 192), (3, 224), (4, 240)]:
+                    # Bitwise AND check
+                    if (num > k and pattern & char == pattern):
+                        multibyte_fix = num - k
+
            # Stop incomplete bytes from passing
            if (multibyte_fix > 0):
                multibyte_fix -= 1
                continue

-            all_text = self.detokenize(completion_tokens)
            any_stop = [s for s in stop_sequences if s in all_text]
            if len(any_stop) > 0:
                first_stop = any_stop[0]