Fix streaming hang on last token when cache is on.

Andrei Betlen 2023-05-26 03:03:01 -04:00
parent 5be8354e11
commit f74b90ed67


@@ -848,11 +848,6 @@ class Llama:
                     finish_reason = "length"
                     break
 
-        if self.cache:
-            if self.verbose:
-                print("Llama._create_completion: cache save", file=sys.stderr)
-            self.cache[prompt_tokens + completion_tokens] = self.save_state()
-
         if self.verbose:
             llama_cpp.llama_print_timings(self.ctx)
 
@@ -941,8 +936,17 @@ class Llama:
                     }
                 ],
             }
+            if self.cache:
+                if self.verbose:
+                    print("Llama._create_completion: cache save", file=sys.stderr)
+                self.cache[prompt_tokens + completion_tokens] = self.save_state()
             return
 
+        if self.cache:
+            if self.verbose:
+                print("Llama._create_completion: cache save", file=sys.stderr)
+            self.cache[prompt_tokens + completion_tokens] = self.save_state()
+
         text_str = text.decode("utf-8", errors="ignore")
 
         if echo:
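
The hang on the last token is consistent with the old ordering: in streaming mode, `save_state()` ran after the generation loop but before the generator yielded its final chunk, so the client waited on the expensive state snapshot before receiving the last token. The patch moves the save to just after the final yield, right before `return`. Below is a minimal sketch of that ordering, not the library's actual code; `slow_state_save` and `stream_tokens` are hypothetical names, with `slow_state_save` standing in for `Llama.save_state()`.

import time
from typing import Dict, Iterator, List


def slow_state_save() -> None:
    """Stand-in for an expensive snapshot such as Llama.save_state()."""
    time.sleep(2)


def stream_tokens(tokens: List[str]) -> Iterator[Dict]:
    """Yield one chunk per token, then the final chunk, then save state."""
    for tok in tokens:
        yield {"choices": [{"text": tok, "finish_reason": None}]}
    # Deliver the last chunk before doing any expensive bookkeeping;
    # saving first (the old ordering) would stall the consumer here.
    yield {"choices": [{"text": "", "finish_reason": "stop"}]}
    slow_state_save()
    return


if __name__ == "__main__":
    for chunk in stream_tokens(["Hello", ",", " world"]):
        print(chunk["choices"][0]["text"], end="", flush=True)
    print()

With this ordering the consumer prints the final chunk immediately and only then pays for the save when it asks for the next (nonexistent) item, mirroring where the patch places the cache save relative to `return`.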