From f74b90ed6767957ac0eb1b5364196a22e10166de Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Fri, 26 May 2023 03:03:01 -0400
Subject: [PATCH] Fix streaming hang on last token when cache is on.

---
 llama_cpp/llama.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 82246d1..f4b2d49 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -848,11 +848,6 @@ class Llama:
                 finish_reason = "length"
                 break
 
-        if self.cache:
-            if self.verbose:
-                print("Llama._create_completion: cache save", file=sys.stderr)
-            self.cache[prompt_tokens + completion_tokens] = self.save_state()
-
         if self.verbose:
             llama_cpp.llama_print_timings(self.ctx)
 
@@ -941,8 +936,17 @@ class Llama:
                     }
                 ],
             }
+            if self.cache:
+                if self.verbose:
+                    print("Llama._create_completion: cache save", file=sys.stderr)
+                self.cache[prompt_tokens + completion_tokens] = self.save_state()
             return
 
+        if self.cache:
+            if self.verbose:
+                print("Llama._create_completion: cache save", file=sys.stderr)
+            self.cache[prompt_tokens + completion_tokens] = self.save_state()
+
         text_str = text.decode("utf-8", errors="ignore")
 
         if echo:
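
Note (not part of the patch): a minimal sketch of the streaming-with-cache
scenario this commit addresses. The diff shows that the cache save (which
calls the expensive save_state()) used to run before the final streamed
chunk was yielded, which is what made the stream appear to hang on the last
token; it now runs after that chunk in the streaming path, and just before
the result is assembled in the non-streaming path. The model path and
prompt below are placeholders, and the sketch assumes the high-level
llama-cpp-python API of this era (Llama, LlamaCache, Llama.set_cache).

    from llama_cpp import Llama, LlamaCache

    # Placeholder model path; any local ggml model file works here.
    llm = Llama(model_path="./models/7B/ggml-model.bin", verbose=True)

    # Enable the in-memory state cache whose save_state() call is
    # relocated by this patch.
    llm.set_cache(LlamaCache())

    # Stream a completion; before this fix the consumer waited on the
    # last chunk while the cache save ran first.
    for chunk in llm(
        "Q: Name the planets in the solar system. A:",
        max_tokens=32,
        stream=True,
    ):
        print(chunk["choices"][0]["text"], end="", flush=True)
    print()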