diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6eb04cd..8b5fbec 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,4 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
-- Added first version of the changelog
\ No newline at end of file
+- Added first version of the changelog
+
+### Fixed
+
+- Performance bug in stop sequence check slowing down streaming.
\ No newline at end of file
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 012bb86..d7dc625 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -775,20 +775,22 @@ class Llama:
                 break
 
             if stream:
+                remaining_tokens = completion_tokens[returned_tokens:]
+                remaining_text = self.detokenize(remaining_tokens)
+                remaining_length = len(remaining_text)
+
                 # We want to avoid yielding any characters from
                 # the generated text if they are part of a stop
                 # sequence.
                 first_stop_position = 0
                 for s in stop_sequences:
-                    for i in range(len(s), 0, -1):
-                        if all_text.endswith(s[:i]):
+                    for i in range(min(len(s), remaining_length), 0, -1):
+                        if remaining_text.endswith(s[:i]):
                             if i > first_stop_position:
                                 first_stop_position = i
                             break
 
                 token_end_position = 0
-                remaining_tokens = completion_tokens[returned_tokens:]
-                remaining_length = len(self.detokenize(remaining_tokens))
                 for token in remaining_tokens:
                     token_end_position += len(self.detokenize([token]))
                     # Check if stop sequence is in the token
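
Below is a minimal, standalone sketch of the partial stop-sequence check this patch optimizes. The helper name `find_first_stop_position` and the plain-`str` inputs are illustrative assumptions; in the patch the logic runs inline in `Llama._create_completion` over detokenized output. The two changes it mirrors are bounding the suffix scan by `min(len(s), remaining_length)` and scanning only the not-yet-returned text rather than the full generation.

# Sketch of the patched stop-sequence check (hypothetical helper; the
# real code runs this inline over detokenized text, not plain strings).

def find_first_stop_position(remaining_text: str, stop_sequences: list[str]) -> int:
    """Length of the longest stop-sequence prefix that remaining_text ends
    with; that many trailing characters are held back from the stream
    because they may turn out to be the start of a stop sequence."""
    first_stop_position = 0
    for s in stop_sequences:
        # A prefix longer than the remaining text can never match, so bound
        # the scan by min(len(s), len(remaining_text)) as the patch does,
        # instead of always starting the countdown at len(s).
        for i in range(min(len(s), len(remaining_text)), 0, -1):
            if remaining_text.endswith(s[:i]):
                if i > first_stop_position:
                    first_stop_position = i
                break
    return first_stop_position

# "hello\nUse" ends with "\nUse", the first 4 characters of "\nUser:",
# so the last 4 characters are withheld until more tokens arrive.
assert find_first_stop_position("hello\nUse", ["\nUser:"]) == 4
assert find_first_stop_position("hello", ["\nUser:"]) == 0

Scanning only the remaining text also avoids re-matching against the entire (ever-growing) generation on every streamed token, which is where the slowdown came from.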