From 97c6372350c57a4fffb6072cb299e5a9bd8b38dc Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 4 May 2023 21:58:27 -0400
Subject: [PATCH] Rewind model to longest prefix.

---
 llama_cpp/llama.py | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 8cd77ee..7a8c25b 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -390,18 +390,28 @@ class Llama:
         """
         assert self.ctx is not None
 
-        if (
-            reset
-            and len(self.eval_tokens) > 0
-            and tuple(self.eval_tokens) == tuple(tokens[: len(self.eval_tokens)])
-        ):
-            if self.verbose:
-                print("Llama.generate: cache hit", file=sys.stderr)
-            reset = False
-            tokens = tokens[len(self.eval_tokens) :]
+        if reset and len(self.eval_tokens) > 0:
+            longest_prefix = 0
+            for a, b in zip(self.eval_tokens, tokens[:-1]):
+                if a == b:
+                    longest_prefix += 1
+                else:
+                    break
+            if longest_prefix > 0:
+                if self.verbose:
+                    print("Llama.generate: prefix-match hit", file=sys.stderr)
+                reset = False
+                tokens = tokens[longest_prefix:]
+                for _ in range(len(self.eval_tokens) - longest_prefix):
+                    self.eval_tokens.pop()
+                    try:
+                        self.eval_logits.pop()
+                    except IndexError:
+                        pass
 
         if reset:
             self.reset()
+
         while True:
             self.eval(tokens)
             token = self.sample(
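
---

Note (editorial sketch, not part of the patch): the hunk above rewinds the
token/logit caches to the longest prefix shared with the new prompt, so only
the unseen suffix is re-evaluated. Below is a minimal standalone model of the
same technique, assuming deque-backed caches like `Llama.eval_tokens` and
`Llama.eval_logits`; the helper name and sample values are hypothetical, and
the `longest_prefix > 0` guard (which lets the patch fall back to a full
reset) is omitted for brevity:

    from collections import deque

    def longest_prefix_rewind(eval_tokens, eval_logits, tokens):
        # Count leading tokens of the new prompt that match the cache.
        # Like the patch, compare against tokens[:-1] so the final prompt
        # token is always re-evaluated: sampling needs a fresh logit.
        longest_prefix = 0
        for a, b in zip(eval_tokens, tokens[:-1]):
            if a == b:
                longest_prefix += 1
            else:
                break
        # Rewind the caches past the shared prefix instead of resetting.
        for _ in range(len(eval_tokens) - longest_prefix):
            eval_tokens.pop()
            try:
                eval_logits.pop()
            except IndexError:
                pass  # the logits cache may be shorter than the token cache
        # Only the unseen suffix still needs to go through eval().
        return tokens[longest_prefix:]

    # Hypothetical caches standing in for Llama.eval_tokens / eval_logits.
    eval_tokens = deque([1, 2, 3, 4, 5])
    eval_logits = deque([[0.1], [0.2], [0.3], [0.4], [0.5]])

    suffix = longest_prefix_rewind(eval_tokens, eval_logits, [1, 2, 3, 9, 10])
    print(list(eval_tokens))  # [1, 2, 3] -- rewound to the shared prefix
    print(suffix)             # [9, 10]   -- only these are re-evaluated

Compared with the replaced exact-prefix check, this matches any shared
prefix, so editing the tail of a long prompt no longer discards the whole
cache.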