fix: detokenization case where first token does not start with a leading space (#1375)
* Fix tokenization edge case where llama output does not start with a space.
  See this notebook: https://colab.research.google.com/drive/1Ooz11nFPk19zyJdMDx42CeesU8aWZMdI#scrollTo=oKpHw5PZ30uC
* Update _internals.py: compare against b' ' instead of (str) ' '.
---------
Co-authored-by: Andrei <abetlen@gmail.com>
This commit is contained in:
parent
1f56c648c3
commit
e0d7674e62
1 changed file with 2 additions and 2 deletions
|
@@ -203,7 +203,7 @@ class _LlamaModel:
|
|||
# NOTE: Llama1 models automatically added a space at the start of the prompt
|
||||
# this line removes a leading space if the first token is a beginning of sentence token
|
||||
return (
|
||||
output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
|
||||
output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b' ' else output
|
||||
)
|
||||
|
||||
# Extra
|
||||
|
@@ -812,4 +812,4 @@ class _LlamaSamplingContext:
|
|||
def accept(self, ctx_main: _LlamaContext, id: int, apply_grammar: bool):
|
||||
if apply_grammar and self.grammar is not None:
|
||||
ctx_main.grammar_accept_token(self.grammar, id)
|
||||
self.prev.append(id)
|
||||
self.prev.append(id)
|
||||
|
|
Loading…
Reference in a new issue