From e3ea35454739a1c70df9d436b630e306fcab8691 Mon Sep 17 00:00:00 2001
From: Mug <>
Date: Wed, 5 Apr 2023 14:23:01 +0200
Subject: [PATCH 01/43] Allow local llama library usage

---
 llama_cpp/llama_cpp.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 41055bd..8105f18 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -18,9 +18,12 @@ def _load_shared_library(lib_base_name):
 
     # Construct the paths to the possible shared library names
     _base_path = pathlib.Path(__file__).parent.resolve()
+    _local_path = pathlib.Path.cwd()
     # Searching for the library in the current directory under the name "libllama" (default name
     # for llamacpp) and "llama" (default name for this repo)
     _lib_paths = [
+        _local_path / f"./lib{lib_base_name}{lib_ext}",
+        _local_path / f"./{lib_base_name}{lib_ext}",
         _base_path / f"lib{lib_base_name}{lib_ext}",
         _base_path / f"{lib_base_name}{lib_ext}"
     ]

From c65a621b6b3c7e368cf7d1a4b1354e5a28524a19 Mon Sep 17 00:00:00 2001
From: jm12138 <2286040843@qq.com>
Date: Mon, 10 Apr 2023 10:28:24 +0000
Subject: [PATCH 02/43] Add UTF-8 Encoding in read_text.

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 2706b8d..fef3711 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from skbuild import setup
 from pathlib import Path
 
 this_directory = Path(__file__).parent
-long_description = (this_directory / "README.md").read_text()
+long_description = (this_directory / "README.md").read_text(encoding='UTF-8')
 
 setup(
     name="llama_cpp_python",

From 76131d5bb812ad3a84b73134d8356285be4eb297 Mon Sep 17 00:00:00 2001
From: Mug <>
Date: Mon, 10 Apr 2023 17:00:35 +0200
Subject: [PATCH 03/43] Use environment variable for library override

---
 llama_cpp/llama_cpp.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 8105f18..e937148 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -18,16 +18,16 @@ def _load_shared_library(lib_base_name):
 
     # Construct the paths to the possible shared library names
     _base_path = pathlib.Path(__file__).parent.resolve()
-    _local_path = pathlib.Path.cwd()
     # Searching for the library in the current directory under the name "libllama" (default name
     # for llamacpp) and "llama" (default name for this repo)
     _lib_paths = [
-        _local_path / f"./lib{lib_base_name}{lib_ext}",
-        _local_path / f"./{lib_base_name}{lib_ext}",
         _base_path / f"lib{lib_base_name}{lib_ext}",
         _base_path / f"{lib_base_name}{lib_ext}"
     ]
 
+    if ("LLAMA_LIB" in os.environ):
+        _lib_paths = [pathlib.Path(os.environ["LLAMA_LIB"]).resolve()]
+
     # Add the library directory to the DLL search path on Windows (if needed)
     if sys.platform == "win32" and sys.version_info >= (3, 8):
         os.add_dll_directory(str(_base_path))

From cf339c9b3c4bc3588b313a07713af3dd5c3e3724 Mon Sep 17 00:00:00 2001
From: Mug <>
Date: Mon, 10 Apr 2023 17:06:58 +0200
Subject: [PATCH 04/43] Better custom library debugging

---
 llama_cpp/llama_cpp.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index ab18d98..de37ff9 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -26,6 +26,7 @@ def _load_shared_library(lib_base_name):
     ]
 
     if ("LLAMA_LIB" in os.environ):
+        lib_base_name = os.environ["LLAMA_LIB"]
         _lib_paths = [pathlib.Path(os.environ["LLAMA_LIB"]).resolve()]
 
     # Add the library directory to the DLL search path on Windows (if needed)

From ee71ce8ab73edd07f4c136208d1871e512123f56 Mon Sep 17 00:00:00 2001
From: Mug <>
Date: Mon, 10 Apr 2023 17:12:25 +0200
Subject: [PATCH 05/43] Make Windows users happy (hopefully)

---
 llama_cpp/llama_cpp.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index de37ff9..0c0581f 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -27,7 +27,9 @@ def _load_shared_library(lib_base_name):
 
     if ("LLAMA_LIB" in os.environ):
         lib_base_name = os.environ["LLAMA_LIB"]
-        _lib_paths = [pathlib.Path(os.environ["LLAMA_LIB"]).resolve()]
+        _lib = pathlib.Path(lib_base_name)
+        _base_path = _lib.parent.resolve()
+        _lib_paths = [_lib.resolve()]
 
     # Add the library directory to the DLL search path on Windows (if needed)
     if sys.platform == "win32" and sys.version_info >= (3, 8):

From 2559e5af9b049737c9f83a96eb3218ed82281846 Mon Sep 17 00:00:00 2001
From: Mug <>
Date: Mon, 10 Apr 2023 17:27:17 +0200
Subject: [PATCH 06/43] Changed the environment variable name to
 "LLAMA_CPP_LIB"

---
 llama_cpp/llama_cpp.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 0c0581f..8a5869c 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -25,8 +25,8 @@ def _load_shared_library(lib_base_name):
         _base_path / f"{lib_base_name}{lib_ext}"
     ]
 
-    if ("LLAMA_LIB" in os.environ):
-        lib_base_name = os.environ["LLAMA_LIB"]
+    if ("LLAMA_CPP_LIB" in os.environ):
+        lib_base_name = os.environ["LLAMA_CPP_LIB"]
         _lib = pathlib.Path(lib_base_name)
         _base_path = _lib.parent.resolve()
         _lib_paths = [_lib.resolve()]

From adfd9f681c756859f8e60c7f19ba7bd7fb6a67a1 Mon Sep 17 00:00:00 2001
From: jm12138 <2286040843@qq.com>
Date: Mon, 10 Apr 2023 15:33:31 +0000
Subject: [PATCH 07/43] Matched the other encode calls

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index fef3711..33c2e26 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from skbuild import setup
 from pathlib import Path
 
 this_directory = Path(__file__).parent
-long_description = (this_directory / "README.md").read_text(encoding='UTF-8')
+long_description = (this_directory / "README.md").read_text(encoding="utf-8")
 
 setup(
     name="llama_cpp_python",

From ffb1e8025104eea18d5043e7b5cb2d39b2e04340 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Mon, 10 Apr 2023 11:37:41 -0400
Subject: [PATCH 08/43] Bump version

---
 pyproject.toml | 2 +-
 setup.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 89c8271..2b0b35b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp_python"
-version = "0.1.30"
+version = "0.1.31"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen <abetlen@gmail.com>"]
 license = "MIT"
diff --git a/setup.py b/setup.py
index 33c2e26..a7c6262 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
     description="A Python wrapper for llama.cpp",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="0.1.30",
+    version="0.1.31",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",

From 5247e32d9e9ef9d33950da07865c535ae6df988b Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Mon, 10 Apr 2023 12:56:23 -0400
Subject: [PATCH 09/43] Update llama.cpp

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 180b693..684da25 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 180b693a47b6b825288ef9f2c39d24b6eea4eea6
+Subproject commit 684da25926e5c505f725b4f10b5485b218fa1fc7

From 3727ba4d9e932dc90eead9b8210fb5498670cbbc Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Mon, 10 Apr 2023 12:56:48 -0400
Subject: [PATCH 10/43] Bump version

---
 pyproject.toml | 2 +-
 setup.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2b0b35b..50fe7e7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp_python"
-version = "0.1.31"
+version = "0.1.32"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen <abetlen@gmail.com>"]
 license = "MIT"
diff --git a/setup.py b/setup.py
index a7c6262..3ce6001 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
     description="A Python wrapper for llama.cpp",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="0.1.31",
+    version="0.1.32",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",

From 213cc5c34082490f7aa88d27aa24d5eae2a39ab9 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Tue, 11 Apr 2023 11:54:31 -0400
Subject: [PATCH 11/43] Remove async from function signature to avoid blocking
 the server

---
 llama_cpp/server/__main__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py
index 44ee1f0..80cbe01 100644
--- a/llama_cpp/server/__main__.py
+++ b/llama_cpp/server/__main__.py
@@ -196,7 +196,7 @@ CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatComplet
     "/v1/chat/completions",
     response_model=CreateChatCompletionResponse,
 )
-async def create_chat_completion(
+def create_chat_completion(
     request: CreateChatCompletionRequest,
 ) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
     completion_or_chunks = llama.create_chat_completion(

From 9f1e56559434e51268564532e870298fb0e27d80 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Tue, 11 Apr 2023 11:59:03 -0400
Subject: [PATCH 12/43] Update llama.cpp

---
 llama_cpp/llama_cpp.py | 47 +++++++++++++++++++++++++++++++++++-------
 vendor/llama.cpp       |  2 +-
 2 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 8a5869c..0f2b4d5 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -1,9 +1,21 @@
 import sys
 import os
 import ctypes
-from ctypes import c_int, c_float, c_char_p, c_void_p, c_bool, POINTER, Structure, Array, c_uint8, c_size_t
+from ctypes import (
+    c_int,
+    c_float,
+    c_char_p,
+    c_void_p,
+    c_bool,
+    POINTER,
+    Structure,
+    Array,
+    c_uint8,
+    c_size_t,
+)
 import pathlib
 
+
 # Load the library
 def _load_shared_library(lib_base_name):
     # Determine the file extension based on the platform
@@ -22,10 +34,10 @@ def _load_shared_library(lib_base_name):
     # for llamacpp) and "llama" (default name for this repo)
     _lib_paths = [
         _base_path / f"lib{lib_base_name}{lib_ext}",
-        _base_path / f"{lib_base_name}{lib_ext}"
+        _base_path / f"{lib_base_name}{lib_ext}",
     ]
 
-    if ("LLAMA_CPP_LIB" in os.environ):
+    if "LLAMA_CPP_LIB" in os.environ:
         lib_base_name = os.environ["LLAMA_CPP_LIB"]
         _lib = pathlib.Path(lib_base_name)
         _base_path = _lib.parent.resolve()
@@ -43,7 +55,10 @@ def _load_shared_library(lib_base_name):
             except Exception as e:
                 raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}")
 
-    raise FileNotFoundError(f"Shared library with base name '{lib_base_name}' not found")
+    raise FileNotFoundError(
+        f"Shared library with base name '{lib_base_name}' not found"
+    )
+
 
 # Specify the base name of the shared library to load
 _lib_base_name = "llama"
@@ -95,6 +110,10 @@ class llama_context_params(Structure):
 
 llama_context_params_p = POINTER(llama_context_params)
 
+LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0)
+LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1)  # except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2)  # except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3)  # except 1d tensors
 
 # Functions
 
@@ -106,18 +125,23 @@ def llama_context_default_params() -> llama_context_params:
 _lib.llama_context_default_params.argtypes = []
 _lib.llama_context_default_params.restype = llama_context_params
 
+
 def llama_mmap_supported() -> c_bool:
     return _lib.llama_mmap_supported()
 
+
 _lib.llama_mmap_supported.argtypes = []
 _lib.llama_mmap_supported.restype = c_bool
 
+
 def llama_mlock_supported() -> c_bool:
     return _lib.llama_mlock_supported()
 
+
 _lib.llama_mlock_supported.argtypes = []
 _lib.llama_mlock_supported.restype = c_bool
 
+
 # Various functions for loading a ggml llama model.
 # Allocate (almost) all memory needed for the model.
 # Return NULL on failure
@@ -142,42 +166,49 @@ _lib.llama_free.restype = None
 
 # TODO: not great API - very likely to change
 # Returns 0 on success
-def llama_model_quantize(
-    fname_inp: bytes, fname_out: bytes, itype: c_int
-) -> c_int:
+def llama_model_quantize(fname_inp: bytes, fname_out: bytes, itype: c_int) -> c_int:
     return _lib.llama_model_quantize(fname_inp, fname_out, itype)
 
 
 _lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int]
 _lib.llama_model_quantize.restype = c_int
 
+
 # Returns the KV cache that will contain the context for the
 # ongoing prediction with the model.
 def llama_get_kv_cache(ctx: llama_context_p):
     return _lib.llama_get_kv_cache(ctx)
 
+
 _lib.llama_get_kv_cache.argtypes = [llama_context_p]
 _lib.llama_get_kv_cache.restype = POINTER(c_uint8)
 
+
 # Returns the size of the KV cache
 def llama_get_kv_cache_size(ctx: llama_context_p) -> c_size_t:
     return _lib.llama_get_kv_cache_size(ctx)
 
+
 _lib.llama_get_kv_cache_size.argtypes = [llama_context_p]
 _lib.llama_get_kv_cache_size.restype = c_size_t
 
+
 # Returns the number of tokens in the KV cache
 def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int:
     return _lib.llama_get_kv_cache_token_count(ctx)
 
+
 _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p]
 _lib.llama_get_kv_cache_token_count.restype = c_int
 
 
 # Sets the KV cache containing the current context for the model
-def llama_set_kv_cache(ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int):
+def llama_set_kv_cache(
+    ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int
+):
     return _lib.llama_set_kv_cache(ctx, kv_cache, n_size, n_token_count)
 
+
 _lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, c_int]
 _lib.llama_set_kv_cache.restype = None
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 684da25..3e6e70d 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 684da25926e5c505f725b4f10b5485b218fa1fc7
+Subproject commit 3e6e70d8e8917b5bd14c7c9f9b89a585f1ff0b31

From 2a60eb820f40ada7d0ac83fd3c0f23ef15052a07 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Tue, 11 Apr 2023 23:53:46 -0400
Subject: [PATCH 13/43] Update llama.cpp

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 3e6e70d..8b67998 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 3e6e70d8e8917b5bd14c7c9f9b89a585f1ff0b31
+Subproject commit 8b679987cdce292ff36bd741f6715e4927e26f9b

From b3805bb9ccc2a33d68b568cd00e10f89a0f9506b Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Wed, 12 Apr 2023 14:05:11 -0400
Subject: [PATCH 14/43] Implement logprobs parameter for text completion.
 Closes #2

---
 llama_cpp/llama.py           | 125 ++++++++++++++++++++++++++++++-----
 llama_cpp/server/__main__.py |   2 +
 2 files changed, 111 insertions(+), 16 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 2d76ec4..3e13776 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -2,6 +2,7 @@ import os
 import sys
 import uuid
 import time
+import math
 import multiprocessing
 from typing import List, Optional, Union, Generator, Sequence, Iterator
 from collections import deque
@@ -76,6 +77,9 @@ class Llama:
         )
         self.tokens_consumed = 0
         self.n_batch = min(n_ctx, n_batch)
+        self.n_tokens = 0
+        self.n_past = 0
+        self.all_logits: List[List[float]] = []  # TODO: Use an array instead of a list.
 
         self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
 
@@ -136,6 +140,9 @@ class Llama:
             [llama_cpp.llama_token(0)] * self.last_n_tokens_size
         )
         self.tokens_consumed = 0
+        self.n_tokens = 0
+        self.n_past = 0
+        self.all_logits = []
 
     def eval(self, tokens: Sequence[llama_cpp.llama_token]):
         """Evaluate a list of tokens.
@@ -147,18 +154,31 @@ class Llama:
         n_ctx = int(llama_cpp.llama_n_ctx(self.ctx))
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
-            n_past = min(n_ctx - len(batch), self.tokens_consumed)
+            self.n_past = min(n_ctx - len(batch), self.tokens_consumed)
+            self.n_tokens = len(batch)
             return_code = llama_cpp.llama_eval(
                 ctx=self.ctx,
                 tokens=(llama_cpp.llama_token * len(batch))(*batch),
-                n_tokens=llama_cpp.c_int(len(batch)),
-                n_past=llama_cpp.c_int(n_past),
+                n_tokens=llama_cpp.c_int(self.n_tokens),
+                n_past=llama_cpp.c_int(self.n_past),
                 n_threads=llama_cpp.c_int(self.n_threads),
             )
             if int(return_code) != 0:
                 raise RuntimeError(f"llama_eval returned {return_code}")
             self.last_n_tokens_data.extend(batch)
             self.tokens_consumed += len(batch)
+            if self.params.logits_all:
+                self.all_logits.extend(self._logits())
+
+    def _logits(self) -> List[List[float]]:
+        """Return the logits from the last call to llama_eval."""
+        assert self.ctx is not None
+        n_vocab = llama_cpp.llama_n_vocab(self.ctx)
+        cols = int(n_vocab)
+        rows = self.n_tokens if self.params.logits_all else 1
+        logits_view = llama_cpp.llama_get_logits(self.ctx)
+        logits = [[logits_view[i * cols + j] for j in range(cols)] for i in range(rows)]
+        return logits
 
     def sample(
         self,
@@ -327,14 +347,55 @@ class Llama:
         else:
             stop_sequences = []
 
-        finish_reason = None
-        for token in self.generate(
-            prompt_tokens,
-            top_k=top_k,
-            top_p=top_p,
-            temp=temperature,
-            repeat_penalty=repeat_penalty,
-        ):
+        text_offset = 0
+        text_offsets: List[int] = []
+        token_logprobs: List[float] = []
+        tokens: List[str] = []
+        top_logprobs: List[Dict[str, float]] = []
+
+        self.reset()
+        self.eval(prompt_tokens)
+
+        if logprobs is not None and self.params.logits_all is False:
+            raise ValueError(
+                "logprobs is not supported for models created with logits_all=False"
+            )
+
+        if logprobs is not None:
+            token_strs = [
+                self.detokenize([token]).decode("utf-8") for token in prompt_tokens
+            ]
+            logprobs_all = [
+                [Llama.logit_to_logprob(logit) for logit in row]
+                for row in self.all_logits
+            ]
+            for token, token_str, logprobs_token in zip(
+                prompt_tokens, token_strs, logprobs_all
+            ):
+                text_offsets.append(text_offset)
+                text_offset += len(token_str)
+                tokens.append(token_str)
+                sorted_logprobs = list(
+                    sorted(
+                        zip(logprobs_token, range(len(logprobs_token))), reverse=True
+                    )
+                )
+                token_logprobs.append(sorted_logprobs[int(token)][0])
+                top_logprob = {
+                    self.detokenize([llama_cpp.llama_token(i)]).decode("utf-8"): logprob
+                    for logprob, i in sorted_logprobs[:logprobs]
+                }
+                top_logprob.update({token_str: sorted_logprobs[int(token)][0]})
+                top_logprobs.append(top_logprob)
+
+        finish_reason = "length"
+        while True:
+            token = self.sample(
+                top_k=top_k,
+                top_p=top_p,
+                temp=temperature,
+                repeat_penalty=repeat_penalty,
+            )
             if token == llama_cpp.llama_token_eos():
                 text = self.detokenize(completion_tokens)
                 finish_reason = "stop"
@@ -377,13 +438,35 @@ class Llama:
                         }
                     ],
                 }
+
+            if logprobs is not None:
+                # TODO: Confirm whether this should happen before or after
+                # next eval.
+                token_str = self.detokenize([token]).decode("utf-8")
+                text_offsets.append(text_offset)
+                text_offset += len(token_str)
+                tokens.append(token_str)
+                logprobs_token = [
+                    Llama.logit_to_logprob(logit) for logit in self.all_logits[-1]
+                ]
+                sorted_logprobs = list(
+                    sorted(
+                        zip(logprobs_token, range(len(logprobs_token))), reverse=True
+                    )
+                )
+                token_logprobs.append(sorted_logprobs[int(token)][0])
+                top_logprob = {
+                    self.detokenize([llama_cpp.llama_token(i)]).decode("utf-8"): logprob
+                    for logprob, i in sorted_logprobs[:logprobs]
+                }
+                top_logprob.update({token_str: logprobs_token[int(token)]})
+                top_logprobs.append(top_logprob)
+
             if len(completion_tokens) >= max_tokens:
                 text = self.detokenize(completion_tokens)
                 finish_reason = "length"
                 break
-
-        if finish_reason is None:
-            finish_reason = "length"
+            self.eval([token])
 
         if stream:
             yield {
@@ -410,8 +493,14 @@ class Llama:
         if suffix is not None:
             text = text + suffix
 
+        logprobs_or_none: Optional[CompletionLogprobs] = None
         if logprobs is not None:
-            raise NotImplementedError("logprobs not implemented")
+            logprobs_or_none = {
+                "tokens": tokens,
+                "text_offset": text_offsets,
+                "token_logprobs": token_logprobs,
+                "top_logprobs": top_logprobs,
+            }
 
         if self.verbose:
             llama_cpp.llama_print_timings(self.ctx)
@@ -425,7 +514,7 @@ class Llama:
                 {
                     "text": text,
                     "index": 0,
-                    "logprobs": None,
+                    "logprobs": logprobs_or_none,
                     "finish_reason": finish_reason,
                 }
             ],
@@ -704,3 +793,7 @@ class Llama:
     def token_bos() -> llama_cpp.llama_token:
         """Return the beginning-of-sequence token."""
         return llama_cpp.llama_token_bos()
+
+    @staticmethod
+    def logit_to_logprob(x: float) -> float:
+        return math.log(1.0 + math.exp(x))
diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py
index 80cbe01..49a00b2 100644
--- a/llama_cpp/server/__main__.py
+++ b/llama_cpp/server/__main__.py
@@ -33,6 +33,7 @@ class Settings(BaseSettings):
     use_mlock: bool = False  # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out...
     embedding: bool = True
     last_n_tokens_size: int = 64
+    logits_all: bool = False
 
 
 app = FastAPI(
@@ -52,6 +53,7 @@ llama = llama_cpp.Llama(
     f16_kv=settings.f16_kv,
     use_mlock=settings.use_mlock,
     embedding=settings.embedding,
+    logits_all=settings.logits_all,
     n_threads=settings.n_threads,
     n_batch=settings.n_batch,
     n_ctx=settings.n_ctx,

From 6cf58765388b85329769cb78405c5e5ff74dc414 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Wed, 12 Apr 2023 14:06:04 -0400
Subject: [PATCH 15/43] Deprecate generate method

---
 llama_cpp/llama.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 3e13776..69f7680 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -3,6 +3,7 @@ import sys
 import uuid
 import time
 import math
+import warnings
 import multiprocessing
 from typing import List, Optional, Union, Generator, Sequence, Iterator
 from collections import deque
@@ -239,6 +240,11 @@ class Llama:
         Yields:
             The generated tokens.
         """
+        warnings.warn(
+            "Llama.generate is deprecated and will be removed in v0.2.0",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         assert self.ctx is not None
         self.reset()
         while True:

From 2f9b6490059f57e15b13de71db3cc19381ed33ef Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Wed, 12 Apr 2023 14:06:22 -0400
Subject: [PATCH 16/43] Style fix

---
 llama_cpp/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 69f7680..45e09d1 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -330,7 +330,7 @@ class Llama:
         repeat_penalty: float = 1.1,
         top_k: int = 40,
         stream: bool = False,
-    ) -> Union[Iterator[Completion], Iterator[CompletionChunk],]:
+    ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]:
         assert self.ctx is not None
         completion_id = f"cmpl-{str(uuid.uuid4())}"
         created = int(time.time())

From c854c2564b8cd97c87702480c106a86ca1828d31 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Wed, 12 Apr 2023 14:07:14 -0400
Subject: [PATCH 17/43] Don't serialize stateful parameters

---
 llama_cpp/llama.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 45e09d1..c545420 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -763,8 +763,6 @@ class Llama:
             use_mlock=self.params.use_mlock,
             embedding=self.params.embedding,
             last_n_tokens_size=self.last_n_tokens_size,
-            last_n_tokens_data=self.last_n_tokens_data,
-            tokens_consumed=self.tokens_consumed,
             n_batch=self.n_batch,
             n_threads=self.n_threads,
         )
@@ -786,9 +784,6 @@ class Llama:
             last_n_tokens_size=state["last_n_tokens_size"],
             verbose=state["verbose"],
         )
-        self.last_n_tokens_data = state["last_n_tokens_data"]
-        self.tokens_consumed = state["tokens_consumed"]
-
 
     @staticmethod
     def token_eos() -> llama_cpp.llama_token:

From 005c78d26c00ae5d7e10166993909a0e2ff4af8d Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Wed, 12 Apr 2023 14:29:00 -0400
Subject: [PATCH 18/43] Update llama.cpp

---
 llama_cpp/llama_cpp.py | 1 +
 vendor/llama.cpp       | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 0f2b4d5..811f69a 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -114,6 +114,7 @@ LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0)
 LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3)  # except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int(4)  # tok_embeddings.weight and output.weight are F16
 
 # Functions
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 8b67998..e7f6997 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 8b679987cdce292ff36bd741f6715e4927e26f9b
+Subproject commit e7f6997f897a18b6372a6460e25c5f89e1469f1d

From 19598ac4e88619b67514b07b2df93e1b9a039df1 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Wed, 12 Apr 2023 19:07:53 -0400
Subject: [PATCH 19/43] Fix threading bug. Closes #62

---
 llama_cpp/server/__main__.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py
index 49a00b2..4360506 100644
--- a/llama_cpp/server/__main__.py
+++ b/llama_cpp/server/__main__.py
@@ -13,12 +13,13 @@ Then visit http://localhost:8000/docs to see the interactive API docs.
 """
 import os
 import json
+from threading import Lock
 from typing import List, Optional, Literal, Union, Iterator, Dict
 from typing_extensions import TypedDict
 
 import llama_cpp
 
-from fastapi import FastAPI
+from fastapi import Depends, FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict
 from sse_starlette.sse import EventSourceResponse
@@ -59,6 +60,13 @@ llama = llama_cpp.Llama(
     n_ctx=settings.n_ctx,
     last_n_tokens_size=settings.last_n_tokens_size,
 )
+llama_lock = Lock()
+
+
+def get_llama():
+    with llama_lock:
+        yield llama
+
 
 
 class CreateCompletionRequest(BaseModel):
@@ -101,7 +109,7 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
     "/v1/completions",
     response_model=CreateCompletionResponse,
 )
-def create_completion(request: CreateCompletionRequest):
+def create_completion(request: CreateCompletionRequest, llama: llama_cpp.Llama=Depends(get_llama)):
     if isinstance(request.prompt, list):
         request.prompt = "".join(request.prompt)
 
@@ -146,7 +154,7 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)
     "/v1/embeddings",
     response_model=CreateEmbeddingResponse,
 )
-def create_embedding(request: CreateEmbeddingRequest):
+def create_embedding(request: CreateEmbeddingRequest, llama: llama_cpp.Llama=Depends(get_llama)):
     return llama.create_embedding(**request.dict(exclude={"model", "user"}))
 
 
@@ -200,6 +208,7 @@ CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatComplet
 )
 def create_chat_completion(
     request: CreateChatCompletionRequest,
+    llama: llama_cpp.Llama=Depends(get_llama),
 ) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
     completion_or_chunks = llama.create_chat_completion(
         **request.dict(

From 0daf16defcc353de715e29e6103e4c7a2422ee58 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Wed, 12 Apr 2023 19:08:11 -0400
Subject: [PATCH 20/43] Enable logprobs on completion endpoint

---
 llama_cpp/server/__main__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py
index 4360506..8b9614e 100644
--- a/llama_cpp/server/__main__.py
+++ b/llama_cpp/server/__main__.py
@@ -118,7 +118,6 @@ def create_completion(request: CreateCompletionRequest, llama: llama_cpp.Llama=D
             exclude={
                 "model",
                 "n",
-                "logprobs",
                 "frequency_penalty",
                 "presence_penalty",
                 "best_of",

From 4f5f99ef2ae4aa6a8e8d636e67eb8aca7fc81184 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Wed, 12 Apr 2023 22:40:12 -0400
Subject: [PATCH 21/43] Formatting

---
 llama_cpp/server/__main__.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py
index 8b9614e..c54d91b 100644
--- a/llama_cpp/server/__main__.py
+++ b/llama_cpp/server/__main__.py
@@ -109,7 +109,9 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
     "/v1/completions",
     response_model=CreateCompletionResponse,
 )
-def create_completion(request: CreateCompletionRequest, llama: llama_cpp.Llama=Depends(get_llama)):
+def create_completion(
+    request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama)
+):
     if isinstance(request.prompt, list):
         request.prompt = "".join(request.prompt)
 
@@ -153,7 +155,9 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)
     "/v1/embeddings",
     response_model=CreateEmbeddingResponse,
 )
-def create_embedding(request: CreateEmbeddingRequest, llama: llama_cpp.Llama=Depends(get_llama)):
+def create_embedding(
+    request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
+):
     return llama.create_embedding(**request.dict(exclude={"model", "user"}))
 
 
@@ -207,7 +211,7 @@ CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatComplet
 )
 def create_chat_completion(
     request: CreateChatCompletionRequest,
-    llama: llama_cpp.Llama=Depends(get_llama),
+    llama: llama_cpp.Llama = Depends(get_llama),
 ) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
     completion_or_chunks = llama.create_chat_completion(
         **request.dict(

From 22fa5a621fa2f8249943e0a52dd8c8a21e9baca0 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Thu, 13 Apr 2023 00:19:55 -0400
Subject: [PATCH 22/43] Revert "Deprecate generate method"

This reverts commit 6cf58765388b85329769cb78405c5e5ff74dc414.
---
 llama_cpp/llama.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index c545420..67fefe5 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -3,7 +3,6 @@ import sys
 import uuid
 import time
 import math
-import warnings
 import multiprocessing
 from typing import List, Optional, Union, Generator, Sequence, Iterator
 from collections import deque
@@ -240,11 +239,6 @@ class Llama:
         Yields:
             The generated tokens.
         """
-        warnings.warn(
-            "Llama.generate is deprecated and will be removed in v0.2.0",
-            DeprecationWarning,
-            stacklevel=2,
-        )
         assert self.ctx is not None
         self.reset()
         while True:

From 6595ad84bfd5360ac22e311f91eef3d78bdb65f2 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Thu, 13 Apr 2023 00:28:00 -0400
Subject: [PATCH 23/43] Add field to disable resetting between generations

---
 llama_cpp/llama.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 67fefe5..db9a337 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -218,6 +218,7 @@ class Llama:
         top_p: float,
         temp: float,
         repeat_penalty: float,
+        reset: bool = True,
     ) -> Generator[
         llama_cpp.llama_token, Optional[Sequence[llama_cpp.llama_token]], None
     ]:
@@ -235,12 +236,14 @@ class Llama:
             top_p: The top-p sampling parameter.
             temp: The temperature parameter.
             repeat_penalty: The repeat penalty parameter.
+            reset: Whether to reset the model state.
 
         Yields:
             The generated tokens.
         """
         assert self.ctx is not None
-        self.reset()
+        if reset:
+            self.reset()
         while True:
             self.eval(tokens)
             token = self.sample(

From 7dc0838fff5954957f4f0b585831ff8c6732d370 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Thu, 13 Apr 2023 00:35:05 -0400
Subject: [PATCH 24/43] Bump version

---
 pyproject.toml | 2 +-
 setup.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 50fe7e7..a0b6df3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp_python"
-version = "0.1.32"
+version = "0.1.33"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen <abetlen@gmail.com>"]
 license = "MIT"
diff --git a/setup.py b/setup.py
index 3ce6001..1648f64 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
     description="A Python wrapper for llama.cpp",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="0.1.32",
+    version="0.1.33",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",

From 26cc4ee029704976db08a5c67ab812200fcf2c9e Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Fri, 14 Apr 2023 09:59:08 -0400
Subject: [PATCH 25/43] Fix signature for stop parameter

---
 llama_cpp/llama.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index db9a337..ae25137 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -323,7 +323,7 @@ class Llama:
         top_p: float = 0.95,
         logprobs: Optional[int] = None,
         echo: bool = False,
-        stop: List[str] = [],
+        stop: Optional[List[str]] = [],
         repeat_penalty: float = 1.1,
         top_k: int = 40,
         stream: bool = False,
@@ -336,6 +336,7 @@ class Llama:
         prompt_tokens = self.tokenize(b" " + prompt.encode("utf-8"))
         text = b""
         returned_characters = 0
+        stop = stop if not None else []
 
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)
@@ -537,7 +538,7 @@ class Llama:
         top_p: float = 0.95,
         logprobs: Optional[int] = None,
         echo: bool = False,
-        stop: List[str] = [],
+        stop: Optional[List[str]] = [],
         repeat_penalty: float = 1.1,
         top_k: int = 40,
         stream: bool = False,
@@ -592,7 +593,7 @@ class Llama:
         top_p: float = 0.95,
         logprobs: Optional[int] = None,
         echo: bool = False,
-        stop: List[str] = [],
+        stop: Optional[List[str]] = [],
         repeat_penalty: float = 1.1,
         top_k: int = 40,
         stream: bool = False,
@@ -698,7 +699,7 @@ class Llama:
         top_p: float = 0.95,
         top_k: int = 40,
         stream: bool = False,
-        stop: List[str] = [],
+        stop: Optional[List[str]] = [],
         max_tokens: int = 128,
         repeat_penalty: float = 1.1,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
@@ -717,6 +718,7 @@ class Llama:
         Returns:
             Generated chat completion or a stream of chat completion chunks.
         """
+        stop = stop if not None else []
         instructions = """Complete the following chat conversation between the user and the assistant. System messages should be strictly followed as additional instructions."""
         chat_history = "\n".join(
             f'{message["role"]} {message.get("user", "")}: {message["content"]}'

From 6153baab2d2ac7a2c6ce9caa60474d84cf78dca6 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Fri, 14 Apr 2023 09:59:33 -0400
Subject: [PATCH 26/43] Clean up logprobs implementation

---
 llama_cpp/llama.py | 106 +++++++++++++++++----------------------------
 1 file changed, 39 insertions(+), 67 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index ae25137..ecfd2f4 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -351,55 +351,19 @@ class Llama:
         else:
             stop_sequences = []
 
-        text_offset = 0
-        text_offsets: List[int] = []
-        token_logprobs: List[float] = []
-        tokens: List[str] = []
-        top_logprobs: List[Dict[str, float]] = []
-
-        self.reset()
-        self.eval(prompt_tokens)
-
         if logprobs is not None and self.params.logits_all is False:
             raise ValueError(
                 "logprobs is not supported for models created with logits_all=False"
             )
 
-        if logprobs is not None:
-            token_strs = [
-                self.detokenize([token]).decode("utf-8") for token in prompt_tokens
-            ]
-            logprobs_all = [
-                [Llama.logit_to_logprob(logit) for logit in row]
-                for row in self.all_logits
-            ]
-            for token, token_str, logprobs_token in zip(
-                prompt_tokens, token_strs, logprobs_all
-            ):
-                text_offsets.append(text_offset)
-                text_offset += len(token_str)
-                tokens.append(token_str)
-                sorted_logprobs = list(
-                    sorted(
-                        zip(logprobs_token, range(len(logprobs_token))), reverse=True
-                    )
-                )
-                token_logprobs.append(sorted_logprobs[int(token)][0])
-                top_logprob = {
-                    self.detokenize([llama_cpp.llama_token(i)]).decode("utf-8"): logprob
-                    for logprob, i in sorted_logprobs[:logprobs]
-                }
-                top_logprob.update({token_str: sorted_logprobs[int(token)][0]})
-                top_logprobs.append(top_logprob)
-
         finish_reason = "length"
-        while True:
-            token = self.sample(
-                top_k=top_k,
-                top_p=top_p,
-                temp=temperature,
-                repeat_penalty=repeat_penalty,
-            )
+        for token in self.generate(
+            prompt_tokens,
+            top_k=top_k,
+            top_p=top_p,
+            temp=temperature,
+            repeat_penalty=repeat_penalty,
+        ):
             if token == llama_cpp.llama_token_eos():
                 text = self.detokenize(completion_tokens)
                 finish_reason = "stop"
@@ -443,34 +407,10 @@ class Llama:
                     ],
                 }
 
-            if logprobs is not None:
-                # TODO: Confirm wether this should happen before or after
-                # next eval.
-                token_str = self.detokenize([token]).decode("utf-8")
-                text_offsets.append(text_offset)
-                text_offset += len(token_str)
-                tokens.append(token_str)
-                logprobs_token = [
-                    Llama.logit_to_logprob(logit) for logit in self.all_logits[-1]
-                ]
-                sorted_logprobs = list(
-                    sorted(
-                        zip(logprobs_token, range(len(logprobs_token))), reverse=True
-                    )
-                )
-                token_logprobs.append(sorted_logprobs[int(token)][0])
-                top_logprob = {
-                    self.detokenize([llama_cpp.llama_token(i)]).decode("utf-8"): logprob
-                    for logprob, i in sorted_logprobs[:logprobs]
-                }
-                top_logprob.update({token_str: logprobs_token[int(token)]})
-                top_logprobs.append(top_logprob)
-
             if len(completion_tokens) >= max_tokens:
                 text = self.detokenize(completion_tokens)
                 finish_reason = "length"
                 break
-            self.eval([token])
 
         if stream:
             yield {
@@ -499,6 +439,38 @@ class Llama:
 
         logprobs_or_none: Optional[CompletionLogprobs] = None
         if logprobs is not None:
+            text_offset = 0
+            text_offsets: List[int] = []
+            token_logprobs: List[float] = []
+            tokens: List[str] = []
+            top_logprobs: List[Dict[str, float]] = []
+
+            all_tokens = prompt_tokens + completion_tokens
+            all_token_strs = [
+                self.detokenize([token]).decode("utf-8") for token in all_tokens
+            ]
+            all_logprobs = [
+                [Llama.logit_to_logprob(logit) for logit in row]
+                for row in self.all_logits
+            ]
+            for token, token_str, logprobs_token in zip(
+                all_tokens, all_token_strs, all_logprobs
+            ):
+                text_offsets.append(text_offset)
+                text_offset += len(token_str)
+                tokens.append(token_str)
+                sorted_logprobs = list(
+                    sorted(
+                        zip(logprobs_token, range(len(logprobs_token))), reverse=True
+                    )
+                )
+                token_logprobs.append(sorted_logprobs[int(token)][0])
+                top_logprob = {
+                    self.detokenize([llama_cpp.llama_token(i)]).decode("utf-8"): logprob
+                    for logprob, i in sorted_logprobs[:logprobs]
+                }
+                top_logprob.update({token_str: sorted_logprobs[int(token)][0]})
+                top_logprobs.append(top_logprob)
             logprobs_or_none = {
                 "tokens": tokens,
                 "text_offset": text_offsets,

From 6c7cec0c65373d2892dbb23581af27ab407669d9 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Fri, 14 Apr 2023 10:01:15 -0400
Subject: [PATCH 27/43] Fix completion request

---
 llama_cpp/server/__main__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py
index c54d91b..7fc3c57 100644
--- a/llama_cpp/server/__main__.py
+++ b/llama_cpp/server/__main__.py
@@ -76,7 +76,7 @@ class CreateCompletionRequest(BaseModel):
     temperature: float = 0.8
     top_p: float = 0.95
     echo: bool = False
-    stop: List[str] = []
+    stop: Optional[List[str]] = []
     stream: bool = False
 
     # ignored or currently unsupported
@@ -173,7 +173,7 @@ class CreateChatCompletionRequest(BaseModel):
     temperature: float = 0.8
     top_p: float = 0.95
     stream: bool = False
-    stop: List[str] = []
+    stop: Optional[List[str]] = []
     max_tokens: int = 128
 
     # ignored or currently unsupported

From 9c8c2c37dce2326e3272beabd0c6460a4a4a9a3f Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Fri, 14 Apr 2023 10:01:57 -0400
Subject: [PATCH 28/43] Update llama.cpp

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index e7f6997..a32f7ac 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit e7f6997f897a18b6372a6460e25c5f89e1469f1d
+Subproject commit a32f7acc9f54dba1c728cb1e596bd00bf3b4eb5f

From 6e298d8fca1ee5f25239e54aa5f3eed2eee4651e Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Fri, 14 Apr 2023 22:21:19 -0400
Subject: [PATCH 29/43] Set kv cache size to f16 by default

---
 llama_cpp/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index ecfd2f4..cd737c5 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -21,7 +21,7 @@ class Llama:
         n_ctx: int = 512,
         n_parts: int = -1,
         seed: int = 1337,
-        f16_kv: bool = False,
+        f16_kv: bool = True,
         logits_all: bool = False,
         vocab_only: bool = False,
         use_mmap: bool = True,

From 25b646c2fb1e510bf9133f1ee379cf778e99df6f Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Fri, 14 Apr 2023 23:32:05 -0400
Subject: [PATCH 30/43] Update llama.cpp

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index a32f7ac..c85e03d 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit a32f7acc9f54dba1c728cb1e596bd00bf3b4eb5f
+Subproject commit c85e03d12e4b8af22cb13aa9c618dcd5935862fd

From ac7068a4699f3a45c555072a9698d9de497aa88c Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Fri, 14 Apr 2023 23:33:00 -0400
Subject: [PATCH 31/43] Track generated tokens internally

---
 llama_cpp/llama.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index cd737c5..3ff94a6 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -76,6 +76,7 @@ class Llama:
             maxlen=self.last_n_tokens_size,
         )
         self.tokens_consumed = 0
+        self.tokens: List[llama_cpp.llama_token] = []
         self.n_batch = min(n_ctx, n_batch)
         self.n_tokens = 0
         self.n_past = 0
@@ -140,6 +141,7 @@ class Llama:
             [llama_cpp.llama_token(0)] * self.last_n_tokens_size
         )
         self.tokens_consumed = 0
+        self.tokens.clear()
         self.n_tokens = 0
         self.n_past = 0
         self.all_logits = []
@@ -165,6 +167,7 @@ class Llama:
             )
             if int(return_code) != 0:
                 raise RuntimeError(f"llama_eval returned {return_code}")
+            self.tokens.extend(batch)
             self.last_n_tokens_data.extend(batch)
             self.tokens_consumed += len(batch)
             if self.params.logits_all:

From e90e122f2a6970edb7d10d1b95e5e97932ef8e18 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Fri, 14 Apr 2023 23:33:18 -0400
Subject: [PATCH 32/43] Use clear

---
 llama_cpp/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 3ff94a6..93c6288 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -144,7 +144,7 @@ class Llama:
         self.tokens.clear()
         self.n_tokens = 0
         self.n_past = 0
-        self.all_logits = []
+        self.all_logits.clear()
 
     def eval(self, tokens: Sequence[llama_cpp.llama_token]):
         """Evaluate a list of tokens.

From d7de0e8014d9b18cd7e1ede07c3ea786a532767e Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Sat, 15 Apr 2023 00:08:04 -0400
Subject: [PATCH 33/43] Bugfix

---
 llama_cpp/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 93c6288..0754a8d 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -339,7 +339,7 @@ class Llama:
         prompt_tokens = self.tokenize(b" " + prompt.encode("utf-8"))
         text = b""
         returned_characters = 0
-        stop = stop if not None else []
+        stop = stop if stop is not None else []
 
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)

From 3cd67c7bd730a721dcea915042ad8568afe76111 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Sat, 15 Apr 2023 11:39:21 -0400
Subject: [PATCH 34/43] Add type annotations

---
 llama_cpp/llama.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 0754a8d..54a2f4a 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -332,13 +332,15 @@ class Llama:
         stream: bool = False,
     ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]:
         assert self.ctx is not None
-        completion_id = f"cmpl-{str(uuid.uuid4())}"
-        created = int(time.time())
+        completion_id: str = f"cmpl-{str(uuid.uuid4())}"
+        created: int = int(time.time())
         completion_tokens: List[llama_cpp.llama_token] = []
         # Add blank space to start of prompt to match OG llama tokenizer
-        prompt_tokens = self.tokenize(b" " + prompt.encode("utf-8"))
-        text = b""
-        returned_characters = 0
+        prompt_tokens: List[llama_cpp.llama_token] = self.tokenize(
+            b" " + prompt.encode("utf-8")
+        )
+        text: bytes = b""
+        returned_characters: int = 0
         stop = stop if stop is not None else []
 
         if self.verbose:

From 02f9fb82fbfe8b33dba3f39a81625837dca34e02 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Sat, 15 Apr 2023 11:39:52 -0400
Subject: [PATCH 35/43] Bugfix

---
 llama_cpp/llama.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 54a2f4a..e570236 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -695,10 +695,7 @@ class Llama:
         Returns:
             Generated chat completion or a stream of chat completion chunks.
         """
-        stop = stop if not None else []
-        instructions = """Complete the following chat conversation between the user and the assistant. System messages should be strictly followed as additional instructions."""
-        chat_history = "\n".join(
-            f'{message["role"]} {message.get("user", "")}: {message["content"]}'
+        stop = stop if stop is not None else []
             for message in messages
         )
         PROMPT = f" \n\n### Instructions:{instructions}\n\n### Inputs:{chat_history}\n\n### Response:\nassistant: "

From 62087514c641d2ee93b1797df3388af6d60f8c6d Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Sat, 15 Apr 2023 11:58:19 -0400
Subject: [PATCH 36/43] Update chat prompt

---
 llama_cpp/llama.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index e570236..578dcb6 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -696,10 +696,12 @@ class Llama:
             Generated chat completion or a stream of chat completion chunks.
         """
         stop = stop if stop is not None else []
+        chat_history = "".join(
+            f'### {"Human" if message["role"] == "user" else "Assistant"}:{message["content"]}'
             for message in messages
         )
-        PROMPT = f" \n\n### Instructions:{instructions}\n\n### Inputs:{chat_history}\n\n### Response:\nassistant: "
-        PROMPT_STOP = ["###", "\nuser: ", "\nassistant: ", "\nsystem: "]
+        PROMPT = chat_history + "### Assistant:"
+        PROMPT_STOP = ["### Assistant:", "### Human:", "\n"]
         completion_or_chunks = self(
             prompt=PROMPT,
             stop=PROMPT_STOP + stop,

From 83b2be6dc4e88154a72f221420823702bae6a1bc Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Sat, 15 Apr 2023 11:58:43 -0400
Subject: [PATCH 37/43] Update chat parameters

---
 llama_cpp/llama.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 578dcb6..63c7b53 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -672,12 +672,12 @@ class Llama:
     def create_chat_completion(
         self,
         messages: List[ChatCompletionMessage],
-        temperature: float = 0.8,
+        temperature: float = 0.2,
         top_p: float = 0.95,
         top_k: int = 40,
         stream: bool = False,
         stop: Optional[List[str]] = [],
-        max_tokens: int = 128,
+        max_tokens: int = 256,
         repeat_penalty: float = 1.1,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         """Generate a chat completion from a list of messages.

From a6372a7ae5c32cdae7cded800dd988cd12b828fd Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Sat, 15 Apr 2023 12:02:48 -0400
Subject: [PATCH 38/43] Update stop sequences for chat

---
 llama_cpp/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 63c7b53..121f91d 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -701,7 +701,7 @@ class Llama:
             for message in messages
         )
         PROMPT = chat_history + "### Assistant:"
-        PROMPT_STOP = ["### Assistant:", "### Human:", "\n"]
+        PROMPT_STOP = ["### Assistant:", "### Human:"]
         completion_or_chunks = self(
             prompt=PROMPT,
             stop=PROMPT_STOP + stop,

From 92c077136d1f0b029f8907a79eae009a750005e2 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Sat, 15 Apr 2023 12:03:09 -0400
Subject: [PATCH 39/43] Add experimental cache

---
 llama_cpp/llama.py           | 69 +++++++++++++++++++++++++++++++++---
 llama_cpp/server/__main__.py |  5 ++-
 2 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 121f91d..b92801c 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -11,6 +11,15 @@ from . import llama_cpp
 from .llama_types import *
 
 
+class LlamaCache:
+    """Cache for a llama.cpp model.
+
+    NOTE: This implementation currently only tells the Llama class to avoid reprocessing bytes and continue from the last
+    completion. It does not actually cache the results."""
+
+    pass
+
+
 class Llama:
     """High-level Python wrapper for a llama.cpp model."""
 
@@ -82,6 +91,14 @@ class Llama:
         self.n_past = 0
         self.all_logits: List[List[float]] = []  # TODO: Use an array instead of a list.
 
+        ### HACK: This is a hack to work around the fact that the llama.cpp API does not yet support
+        ###       saving and restoring state, this allows us to continue a completion if the last
+        ###       completion_bytes is a prefix to the prompt passed in. However this is actually incorrect
+        ###       because it does not take into account stop tokens which have been processed by the model.
+        self._completion_bytes: List[bytes] = []
+        self._cache: Optional[LlamaCache] = None
+        ###
+
         self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
 
         if not os.path.exists(model_path):
@@ -135,6 +152,14 @@ class Llama:
             output += llama_cpp.llama_token_to_str(self.ctx, token)
         return output
 
+    def set_cache(self, cache: Optional[LlamaCache]):
+        """Set the cache.
+
+        Args:
+            cache: The cache to set.
+        """
+        self._cache = cache
+
     def reset(self):
         """Reset the model state."""
         self.last_n_tokens_data.extend(
@@ -245,6 +270,17 @@ class Llama:
             The generated tokens.
         """
         assert self.ctx is not None
+        ### HACK
+        if (
+            reset
+            and self._cache
+            and len(self.tokens) > 0
+            and self.tokens == tokens[: len(self.tokens)]
+        ):
+            if self.verbose:
+                print("generate cache hit", file=sys.stderr)
+            reset = False
+        ###
         if reset:
             self.reset()
         while True:
@@ -361,6 +397,21 @@ class Llama:
                 "logprobs is not supported for models created with logits_all=False"
             )
 
+        ### HACK
+        reset: bool = True
+        _prompt: bytes = prompt.encode("utf-8")
+        _completion: bytes = b"".join(self._completion_bytes)
+        if len(_completion) and self._cache and _prompt.startswith(_completion):
+            if self.verbose:
+                print("completion cache hit", file=sys.stderr)
+            reset = False
+            _prompt = _prompt[len(_completion) :]
+            prompt_tokens = self.tokenize(b" " + _prompt)
+            self._completion_bytes.append(_prompt)
+        else:
+            self._completion_bytes = [prompt.encode("utf-8")]
+        ###
+
         finish_reason = "length"
         for token in self.generate(
             prompt_tokens,
@@ -368,6 +419,7 @@ class Llama:
             top_p=top_p,
             temp=temperature,
             repeat_penalty=repeat_penalty,
+            reset=reset,
         ):
             if token == llama_cpp.llama_token_eos():
                 text = self.detokenize(completion_tokens)
@@ -397,6 +449,9 @@ class Llama:
                             break
                 text = all_text[: len(all_text) - longest]
                 returned_characters += len(text[start:])
+                ### HACK
+                self._completion_bytes.append(text[start:])
+                ###
                 yield {
                     "id": completion_id,
                     "object": "text_completion",
@@ -418,6 +473,9 @@ class Llama:
                 break
 
         if stream:
+            ### HACK
+            self._completion_bytes.append(text[returned_characters:])
+            ###
             yield {
                 "id": completion_id,
                 "object": "text_completion",
@@ -434,13 +492,16 @@ class Llama:
             }
             return
 
-        text = text.decode("utf-8")
+        ### HACK
+        self._completion_bytes.append(text)
+        ###
+        text_str = text.decode("utf-8")
 
         if echo:
-            text = prompt + text
+            text_str = prompt + text_str
 
         if suffix is not None:
-            text = text + suffix
+            text_str = text_str + suffix
 
         logprobs_or_none: Optional[CompletionLogprobs] = None
         if logprobs is not None:
@@ -493,7 +554,7 @@ class Llama:
             "model": self.model_path,
             "choices": [
                 {
-                    "text": text,
+                    "text": text_str,
                     "index": 0,
                     "logprobs": logprobs_or_none,
                     "finish_reason": finish_reason,
diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py
index 7fc3c57..48481c6 100644
--- a/llama_cpp/server/__main__.py
+++ b/llama_cpp/server/__main__.py
@@ -35,6 +35,7 @@ class Settings(BaseSettings):
     embedding: bool = True
     last_n_tokens_size: int = 64
     logits_all: bool = False
+    cache: bool = False  # WARNING: This is an experimental feature
 
 
 app = FastAPI(
@@ -60,6 +61,9 @@ llama = llama_cpp.Llama(
     n_ctx=settings.n_ctx,
     last_n_tokens_size=settings.last_n_tokens_size,
 )
+if settings.cache:
+    cache = llama_cpp.LlamaCache()
+    llama.set_cache(cache)
 llama_lock = Lock()
 
 
@@ -68,7 +72,6 @@ def get_llama():
         yield llama
 
 
-
 class CreateCompletionRequest(BaseModel):
     prompt: Union[str, List[str]]
     suffix: Optional[str] = Field(None)

From 887f3b73ac16976d63c699adcb399ad63054ee74 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Sat, 15 Apr 2023 12:16:05 -0400
Subject: [PATCH 40/43] Update llama.cpp

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index c85e03d..e95b655 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit c85e03d12e4b8af22cb13aa9c618dcd5935862fd
+Subproject commit e95b6554b493e71a0275764342e09bd5784a7026

From 89856ef00d377d0b63ce91fb3c5d184dcbfa9124 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Sat, 15 Apr 2023 17:32:53 -0400
Subject: [PATCH 41/43] Bugfix: only eval new tokens

---
 llama_cpp/llama.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index b92801c..edd2eef 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -280,6 +280,7 @@ class Llama:
             if self.verbose:
                 print("generate cache hit", file=sys.stderr)
             reset = False
+            tokens = tokens[len(self.tokens) :]
         ###
         if reset:
             self.reset()

From e38485a66d0a92100815cccba3ba81439debdb6c Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Sat, 15 Apr 2023 20:27:55 -0400
Subject: [PATCH 42/43] Bump version.

---
 pyproject.toml | 2 +-
 setup.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a0b6df3..aeb5579 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp_python"
-version = "0.1.33"
+version = "0.1.34"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen <abetlen@gmail.com>"]
 license = "MIT"
diff --git a/setup.py b/setup.py
index 1648f64..b0ff844 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
     description="A Python wrapper for llama.cpp",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="0.1.33",
+    version="0.1.34",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",

From b2a24bddacc7b10d1ba8a0dff1d8b5fae9bfbad3 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Sat, 15 Apr 2023 22:31:14 -0400
Subject: [PATCH 43/43] Update docs

---
 docs/index.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/index.md b/docs/index.md
index 4055155..5424e26 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -104,10 +104,13 @@ python3 setup.py develop
             - create_completion
             - __call__
             - create_chat_completion
+            - set_cache
             - token_bos
             - token_eos
         show_root_heading: true
 
+::: llama_cpp.LlamaCache
+
 ::: llama_cpp.llama_cpp
     options:
         show_if_no_docstring: true