fix: detokenization case where first token does not start with a leading space (#1375)

* Fix detokenization edge case where llama output does not start with a space. See this notebook: https://colab.research.google.com/drive/1Ooz11nFPk19zyJdMDx42CeesU8aWZMdI#scrollTo=oKpHw5PZ30uC * Update _internals.py: compare against the bytes literal b' ' instead of the str ' '. --------- Co-authored-by: Andrei <abetlen@gmail.com>
2024-05-04 17:14:59 +03:00 · 2024-05-04 17:14:59 +03:00 · e0d7674e62
commit e0d7674e62
parent 1f56c648c3
1 changed files with 2 additions and 2 deletions
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@ -203,7 +203,7 @@ class _LlamaModel:
        # NOTE: Llama1 models automatically added a space at the start of the prompt
        # this line removes a leading space if the first token is a beginning of sentence token
        return (
-            output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
+            output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b' ' else output
        )

    # Extra
@ -812,4 +812,4 @@ class _LlamaSamplingContext:
    def accept(self, ctx_main: _LlamaContext, id: int, apply_grammar: bool):
        if apply_grammar and self.grammar is not None:
            ctx_main.grammar_accept_token(self.grammar, id)
-        self.prev.append(id)
+        self.prev.append(id)