diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8ea8d90..da75478 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,7 +12,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - (build-system) Migrate from scikit-build to scikit-build-core
 
 ### Fixed
+
 - Truncate max_tokens in create_completion so requested tokens doesn't exceed context size.
+- Temporarily disable cache for completion requests
 
 ## [v0.1.59]
 
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 4b6ce8c..02fe774 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -831,7 +831,9 @@ class Llama:
                 "logprobs is not supported for models created with logits_all=False"
             )
 
-        if self.cache:
+        # Temporarily disable usage of the cache
+        # See: https://github.com/abetlen/llama-cpp-python/issues/348#issuecomment-1583072408
+        if self.cache and False:
             try:
                 cache_item = self.cache[prompt_tokens]
                 cache_prefix_len = Llama.longest_token_prefix(
@@ -1069,14 +1071,14 @@ class Llama:
                     }
                 ],
             }
-            if self.cache:
+            if self.cache and False:
                 if self.verbose:
                     print("Llama._create_completion: cache save", file=sys.stderr)
                 self.cache[prompt_tokens + completion_tokens] = self.save_state()
                 print("Llama._create_completion: cache saved", file=sys.stderr)
             return
 
-        if self.cache:
+        if self.cache and False:
             if self.verbose:
                 print("Llama._create_completion: cache save", file=sys.stderr)
             self.cache[prompt_tokens + completion_tokens] = self.save_state()
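Note on the guard pattern used above: appending `and False` to each `if self.cache:` check short-circuits the condition, so the cache lookup and save branches are never entered while the surrounding completion logic and the `cache` attribute stay intact. Below is a minimal standalone sketch of that behaviour; `Runner` and `run` are hypothetical stand-ins for illustration, not llama-cpp-python code.

# Minimal sketch of the `and False` kill-switch guard; `Runner`/`run` are
# hypothetical and not part of the library.
class Runner:
    def __init__(self, cache=None):
        self.cache = cache  # callers can still attach a cache object

    def run(self, prompt_tokens):
        # `and False` makes this branch unreachable regardless of self.cache,
        # mirroring the temporary disable in the diff above.
        if self.cache and False:
            return self.cache[tuple(prompt_tokens)]
        return "computed without cache"

runner = Runner(cache={})
assert runner.run([1, 2, 3]) == "computed without cache"  # cache path skipped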