From f74b90ed6767957ac0eb1b5364196a22e10166de Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Fri, 26 May 2023 03:03:01 -0400
Subject: [PATCH] Fix streaming hang on last token when cache is on.

---
 llama_cpp/llama.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 82246d1..f4b2d49 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -848,11 +848,6 @@ class Llama:
                 finish_reason = "length"
                 break
 
-        if self.cache:
-            if self.verbose:
-                print("Llama._create_completion: cache save", file=sys.stderr)
-            self.cache[prompt_tokens + completion_tokens] = self.save_state()
-
         if self.verbose:
             llama_cpp.llama_print_timings(self.ctx)
 
@@ -941,8 +936,17 @@ class Llama:
                     }
                 ],
             }
+            if self.cache:
+                if self.verbose:
+                    print("Llama._create_completion: cache save", file=sys.stderr)
+                self.cache[prompt_tokens + completion_tokens] = self.save_state()
             return
 
+        if self.cache:
+            if self.verbose:
+                print("Llama._create_completion: cache save", file=sys.stderr)
+            self.cache[prompt_tokens + completion_tokens] = self.save_state()
+
         text_str = text.decode("utf-8", errors="ignore")
 
         if echo:
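
Note (not part of the patch): a minimal sketch of the streaming-with-cache
scenario this commit addresses. The diff shows that the cache save (which
calls the expensive save_state()) used to run before the final streamed
chunk was yielded, which is what made the stream appear to hang on the last
token; it now runs after that chunk in the streaming path, and just before
the result is assembled in the non-streaming path. The model path and
prompt below are placeholders, and the sketch assumes the high-level
llama-cpp-python API of this era (Llama, LlamaCache, Llama.set_cache).

    from llama_cpp import Llama, LlamaCache

    # Placeholder model path; any local ggml model file works here.
    llm = Llama(model_path="./models/7B/ggml-model.bin", verbose=True)

    # Enable the in-memory state cache whose save_state() call is
    # relocated by this patch.
    llm.set_cache(LlamaCache())

    # Stream a completion; before this fix the consumer waited on the
    # last chunk while the cache save ran first.
    for chunk in llm(
        "Q: Name the planets in the solar system. A:",
        max_tokens=32,
        stream=True,
    ):
        print(chunk["choices"][0]["text"], end="", flush=True)
    print()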