Fix streaming hang on last token when cache is on.

This commit is contained in:
Andrei Betlen 2023-05-26 03:03:01 -04:00
parent 5be8354e11
commit f74b90ed67


@@ -848,11 +848,6 @@ class Llama:
                 finish_reason = "length"
                 break

-        if self.cache:
-            if self.verbose:
-                print("Llama._create_completion: cache save", file=sys.stderr)
-            self.cache[prompt_tokens + completion_tokens] = self.save_state()
-
         if self.verbose:
             llama_cpp.llama_print_timings(self.ctx)

@@ -941,8 +936,17 @@ class Llama:
                     }
                 ],
             }
+            if self.cache:
+                if self.verbose:
+                    print("Llama._create_completion: cache save", file=sys.stderr)
+                self.cache[prompt_tokens + completion_tokens] = self.save_state()
             return

+        if self.cache:
+            if self.verbose:
+                print("Llama._create_completion: cache save", file=sys.stderr)
+            self.cache[prompt_tokens + completion_tokens] = self.save_state()
+
         text_str = text.decode("utf-8", errors="ignore")

         if echo:
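
The two hunks move the cache save from right after the sampling loop to after the final streamed chunk (and, for non-streaming calls, to just before the full response is assembled). Below is a minimal sketch of why the ordering matters; it uses plain Python generators rather than the real Llama._create_completion, and the time.sleep is only a stand-in for the cost of save_state() copying the KV cache:

    import time

    def stream_old(tokens):
        # Old ordering: state is saved after the sampling loop but *before*
        # the final chunk is yielded, so the consumer stalls on the last token.
        for tok in tokens[:-1]:
            yield tok
        time.sleep(2)      # stand-in for self.save_state() copying the KV cache
        yield tokens[-1]   # last chunk only arrives after the save completes

    def stream_fixed(tokens):
        # New ordering: every chunk is yielded first, the save runs last,
        # so the consumer sees the final token immediately.
        for tok in tokens:
            yield tok
        time.sleep(2)      # cache save now happens after the last yield

    for tok in stream_fixed(["Hello", ",", " world"]):
        print(tok, flush=True)

In both versions the generator still performs the save before it finishes, so the caller's loop only exits once the save is done; the difference is that the fixed ordering delivers the last token before that pause instead of appearing to hang on it.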