fix: detokenization case where first token does not start with a leading space (#1375)

* Fix detokenization edge case where the llama output does not start with a space

See this notebook:
https://colab.research.google.com/drive/1Ooz11nFPk19zyJdMDx42CeesU8aWZMdI#scrollTo=oKpHw5PZ30uC

* Update _internals.py

Fix the comparison to use the bytes literal b' ' instead of the str ' '

---------

Co-authored-by: Andrei <abetlen@gmail.com>
This commit is contained in:
Noam Gat 2024-05-04 17:14:59 +03:00 committed by GitHub
parent 1f56c648c3
commit e0d7674e62
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -203,7 +203,7 @@ class _LlamaModel:
# NOTE: Llama1 models automatically added a space at the start of the prompt
# this line removes a leading space if the first token is a beginning of sentence token
return (
output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b' ' else output
)
# Extra
@ -812,4 +812,4 @@ class _LlamaSamplingContext:
def accept(self, ctx_main: _LlamaContext, id: int, apply_grammar: bool):
if apply_grammar and self.grammar is not None:
ctx_main.grammar_accept_token(self.grammar, id)
self.prev.append(id)
self.prev.append(id)