Merge branch 'main' into fix-state-pickle

2023-06-23 15:13:07 -04:00 · 2023-06-23 15:13:07 -04:00 · 877ca6d016
commit 877ca6d016
parent 10b0cb727b b6f9388436
11 changed files with 234 additions and 127 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -7,6 +7,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.1.65]
+
+### Added
+
+- (llama.cpp) Fix struct misalignment bug
+
+## [0.1.64]
+
+### Added
+
+- (llama.cpp) Update llama.cpp
+- Fix docs for seed. Set -1 for random.
+
+## [0.1.63]
+
+### Added
+
+- (llama.cpp) Add full gpu utilisation in CUDA
+- (llama.cpp) Add get_vocab
+- (llama.cpp) Add low_vram parameter
+- (server) Add logit_bias parameter
+
 ## [0.1.62]

 ### Fixed
--- a/README.md
+++ b/README.md
@ -17,6 +17,7 @@ This package provides:

 Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python).

+
 ## Installation from PyPI (recommended)

 Install from PyPI (requires a c compiler):
@ -25,7 +26,7 @@ Install from PyPI (requires a c compiler):
 pip install llama-cpp-python
 ```

-The above command will attempt to install the package and build build `llama.cpp` from source.
+The above command will attempt to install the package and build `llama.cpp` from source.
 This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system.

 If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different  compiler options, please add the following flags to ensure that the package is rebuilt correctly:
@ -70,6 +71,8 @@ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable befor
 CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python
 ```

+Detailed MacOS Metal GPU install documentation is available at [docs/macos_install.md](docs/macos_install.md)
+
 ## High-level API

 The high-level API provides a simple managed interface through the `Llama` class.
--- a/docs/macos_install.md
+++ b/docs/macos_install.md
@ -0,0 +1,59 @@
+
+# llama-cpp-python - MacOS Install with Metal GPU
+
+
+**(1) Make sure you have xcode installed... at least the command line parts**
+```
+# check the path of your xcode install 
+xcode-select -p
+
+# xcode installed returns
+# /Applications/Xcode-beta.app/Contents/Developer
+
+# if xcode is missing then install it... it takes ages;
+xcode-select --install
+```
+
+**(2) Install the conda version for MacOS that supports Metal GPU**
+```
+wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
+bash Miniforge3-MacOSX-arm64.sh
+```
+
+**(3) Make a conda environment**
+```
+conda create -n llama python=3.9.16
+conda activate llama
+```
+
+**(4) Install the LATEST llama-cpp-python.. which, as of just today, happily supports MacOS Metal GPU**  
+    *(you need xcode installed in order for pip to build/compile the C++ code)*
+```
+pip uninstall llama-cpp-python -y
+CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir
+pip install 'llama-cpp-python[server]'
+
+# you should now have llama-cpp-python v0.1.62 installed
+llama-cpp-python         0.1.62      
+
+```
+
+**(5) Download a v3 ggml model**
+ - **ggmlv3**
+ - file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0
+
+https://huggingface.co/TheBloke/open-llama-7b-open-instruct-GGML
+
+
+**(6) run the llama-cpp-python API server with MacOS Metal GPU support**
+```
+# config your ggml model path
+# make sure it is ggml v3
+# make sure it is q4_0
+export MODEL=[path to your llama.cpp ggml models]/[ggml-model-name]q4_0.bin
+python3 -m llama_cpp.server --model $MODEL  --n_gpu_layers 1
+```
+
+***Note:** If you omit the `--n_gpu_layers 1` then CPU will be used*
+
+
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@ -221,6 +221,7 @@ class Llama:
        last_n_tokens_size: int = 64,
        lora_base: Optional[str] = None,
        lora_path: Optional[str] = None,
+        low_vram: bool = False,
        verbose: bool = True,
    ):
        """Load a llama.cpp model from `model_path`.
@ -229,7 +230,7 @@ class Llama:
            model_path: Path to the model.
            n_ctx: Maximum context size.
            n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
-            seed: Random seed. 0 for random.
+            seed: Random seed. -1 for random.
            f16_kv: Use half-precision for key/value cache.
            logits_all: Return logits for all tokens, not just the last token.
            vocab_only: Only load the vocabulary no weights.
@ -262,6 +263,7 @@ class Llama:
        self.params.use_mmap = use_mmap if lora_path is None else False
        self.params.use_mlock = use_mlock
        self.params.embedding = embedding
+        self.params.low_vram = low_vram

        self.last_n_tokens_size = last_n_tokens_size
        self.n_batch = min(n_ctx, n_batch)
@ -814,7 +816,7 @@ class Llama:
            llama_cpp.llama_reset_timings(self.ctx)

        if len(prompt_tokens) > self._n_ctx:
-            raise ValueError(f"Requested tokens exceed context window of {self._n_ctx}")
+            raise ValueError(f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}")

        # Truncate max_tokens if requested tokens would exceed the context window
        max_tokens = (
@ -1380,6 +1382,7 @@ class Llama:
        mirostat_tau: float = 5.0,
        mirostat_eta: float = 0.1,
        model: Optional[str] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
        """Generate a chat completion from a list of messages.

@ -1421,6 +1424,7 @@ class Llama:
            mirostat_tau=mirostat_tau,
            mirostat_eta=mirostat_eta,
            model=model,
+            logits_processor=logits_processor,
        )
        if stream:
            chunks: Iterator[CompletionChunk] = completion_or_chunks  # type: ignore
@ -1447,6 +1451,7 @@ class Llama:
            use_mmap=self.params.use_mmap,
            use_mlock=self.params.use_mlock,
            embedding=self.params.embedding,
+            low_vram=self.params.low_vram,
            last_n_tokens_size=self.last_n_tokens_size,
            n_batch=self.n_batch,
            n_threads=self.n_threads,
@ -1470,6 +1475,7 @@ class Llama:
            use_mmap=state["use_mmap"],
            use_mlock=state["use_mlock"],
            embedding=state["embedding"],
+            low_vram=state["low_vram"],
            n_threads=state["n_threads"],
            n_batch=state["n_batch"],
            last_n_tokens_size=state["last_n_tokens_size"],
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@ -150,45 +150,43 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)


 # struct llama_context_params {
+#     int seed;                              // RNG seed, -1 for random
 #     int n_ctx;                             // text context
 #     int n_batch;                           // prompt processing batch size
 #     int n_gpu_layers;                      // number of layers to store in VRAM
 #     int main_gpu;                          // the GPU that is used for scratch and small tensors
 #     float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
-#     int seed;                              // RNG seed, -1 for random
+#     // called with a progress value between 0 and 1, pass NULL to disable
+#     llama_progress_callback progress_callback;
+#     // context pointer passed to the progress callback
+#     void * progress_callback_user_data;

+#     // Keep the booleans together to avoid misalignment during copy-by-value.
+#     bool low_vram;   // if true, reduce VRAM usage at the cost of performance
 #     bool f16_kv;     // use fp16 for KV cache
 #     bool logits_all; // the llama_eval() call computes all logits, not just the last one
 #     bool vocab_only; // only load the vocabulary, no weights
 #     bool use_mmap;   // use mmap if possible
 #     bool use_mlock;  // force system to keep model in RAM
 #     bool embedding;  // embedding mode only
-
-
-#     // called with a progress value between 0 and 1, pass NULL to disable
-#     llama_progress_callback progress_callback;
-#     // context pointer passed to the progress callback
-#     void * progress_callback_user_data;
 # };
 class llama_context_params(Structure):
    _fields_ = [
+        ("seed", c_int),
        ("n_ctx", c_int),
        ("n_batch", c_int),
        ("n_gpu_layers", c_int),
        ("main_gpu", c_int),
        ("tensor_split", c_float * LLAMA_MAX_DEVICES.value),
-        ("seed", c_int),
+        ("progress_callback", llama_progress_callback),
+        ("progress_callback_user_data", c_void_p),
+        ("low_vram", c_bool),
        ("f16_kv", c_bool),
-        (
-            "logits_all",
-            c_bool,
-        ),
+        ("logits_all", c_bool),
        ("vocab_only", c_bool),
        ("use_mmap", c_bool),
        ("use_mlock", c_bool),
        ("embedding", c_bool),
-        ("progress_callback", llama_progress_callback),
-        ("progress_callback_user_data", c_void_p),
    ]


@ -555,6 +553,26 @@ _lib.llama_n_embd.argtypes = [llama_context_p]
 _lib.llama_n_embd.restype = c_int


+# // Get the vocabulary as output parameters.
+# // Returns number of results.
+# LLAMA_API int llama_get_vocab(
+#         const struct llama_context * ctx,
+#                         const char * * strings,
+#                                 float * scores,
+#                                 int   capacity);
+def llama_get_vocab(
+    ctx: llama_context_p,
+    strings,  # type: Array[c_char_p] # type: ignore
+    scores,  # type: Array[c_float] # type: ignore
+    capacity: c_int,
+) -> int:
+    """Get the vocabulary through the `strings` and `scores` output arrays.
+
+    Thin wrapper over the C function `llama_get_vocab`.
+
+    Args:
+        ctx: The llama context to read the vocabulary from.
+        strings: ctypes array of `c_char_p` that receives the token strings.
+        scores: ctypes array of `c_float` that receives the token scores.
+        capacity: Passed to the C function as its `capacity` argument
+            (presumably the number of slots in the arrays - confirm against
+            llama.h).
+
+    Returns:
+        The number of results, as reported by the C library.
+    """
+    return _lib.llama_get_vocab(ctx, strings, scores, capacity)
+
+
+# The C signature takes `const char **` and `float *` output parameters, so
+# both must be declared as pointers. ctypes accepts array instances of the
+# pointed-to type wherever a POINTER argtype is declared.
+_lib.llama_get_vocab.argtypes = [
+    llama_context_p,
+    ctypes.POINTER(c_char_p),
+    ctypes.POINTER(c_float),
+    c_int,
+]
+_lib.llama_get_vocab.restype = c_int
+
+
 # Token logits obtained from the last call to llama_eval()
 # The logits for the last token are stored in the last row
 # Can be mutated in order to change the probabilities of the next token
@ -596,7 +614,7 @@ _lib.llama_token_to_str.restype = c_char_p
 # Special tokens


-# LLAMA_API llama_token llama_token_bos();
+# LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
 def llama_token_bos() -> int:
    return _lib.llama_token_bos()

@ -605,7 +623,7 @@ _lib.llama_token_bos.argtypes = []
 _lib.llama_token_bos.restype = llama_token


-# LLAMA_API llama_token llama_token_eos();
+# LLAMA_API llama_token llama_token_eos(); // end-of-sentence
 def llama_token_eos() -> int:
    return _lib.llama_token_eos()

@ -614,7 +632,7 @@ _lib.llama_token_eos.argtypes = []
 _lib.llama_token_eos.restype = llama_token


-# LLAMA_API llama_token llama_token_nl();
+# LLAMA_API llama_token llama_token_nl(); // next-line
 def llama_token_nl() -> int:
    return _lib.llama_token_nl()

--- a/llama_cpp/server/main.py
+++ b/llama_cpp/server/main.py
@ -46,5 +46,5 @@ if __name__ == "__main__":
    app = create_app(settings=settings)

    uvicorn.run(
-        app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000))
+        app, host=os.getenv("HOST", settings.host), port=int(os.getenv("PORT", settings.port))
    )
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@ -30,6 +30,9 @@ class Settings(BaseSettings):
        ge=0,
        description="The number of layers to put on the GPU. The rest will be on the CPU.",
    )
+    seed: int = Field(
+        default=1337, description="Random seed. -1 for random."
+    )
    n_batch: int = Field(
        default=512, ge=1, description="The batch size to use per eval."
    )
@ -48,6 +51,10 @@ class Settings(BaseSettings):
        description="Use mmap.",
    )
    embedding: bool = Field(default=True, description="Whether to use embeddings.")
+    low_vram: bool = Field(
+        default=False,
+        description="Whether to use less VRAM. This will reduce performance.",
+    )
    last_n_tokens_size: int = Field(
        default=64,
        ge=0,
@ -72,6 +79,12 @@ class Settings(BaseSettings):
    verbose: bool = Field(
        default=True, description="Whether to print debug information."
    )
+    host: str = Field(
+        default="localhost", description="Listen address"
+    )
+    port: int = Field(
+        default=8000, description="Listen port"
+    )


 router = APIRouter()
@ -99,6 +112,7 @@ def create_app(settings: Optional[Settings] = None):
    llama = llama_cpp.Llama(
        model_path=settings.model,
        n_gpu_layers=settings.n_gpu_layers,
+        seed=settings.seed,
        f16_kv=settings.f16_kv,
        use_mlock=settings.use_mlock,
        use_mmap=settings.use_mmap,
@ -113,8 +127,12 @@ def create_app(settings: Optional[Settings] = None):
    )
    if settings.cache:
        if settings.cache_type == "disk":
+            if settings.verbose:
+                print(f"Using disk cache with size {settings.cache_size}")
            cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size)
        else:
+            if settings.verbose:
+                print(f"Using ram cache with size {settings.cache_size}")
            cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size)

        cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size)
@ -249,18 +267,19 @@ class CreateCompletionRequest(BaseModel):
    )
    presence_penalty: Optional[float] = presence_penalty_field
    frequency_penalty: Optional[float] = frequency_penalty_field
+    logit_bias: Optional[Dict[str, float]] = Field(None)
+    logprobs: Optional[int] = Field(None)

    # ignored or currently unsupported
    model: Optional[str] = model_field
    n: Optional[int] = 1
-    logprobs: Optional[int] = Field(None)
    best_of: Optional[int] = 1
-    logit_bias: Optional[Dict[str, float]] = Field(None)
    user: Optional[str] = Field(None)

    # llama.cpp specific parameters
    top_k: int = top_k_field
    repeat_penalty: float = repeat_penalty_field
+    logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)

    class Config:
        schema_extra = {
@ -274,6 +293,39 @@ class CreateCompletionRequest(BaseModel):
 CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)


+def make_logit_bias_processor(
+    llama: llama_cpp.Llama,
+    logit_bias: Dict[str, float],
+    logit_bias_type: Optional[Literal["input_ids", "tokens"]],
+):
+    """Build a logits processor that adds a fixed bias to selected scores.
+
+    Args:
+        llama: Model whose `tokenize` is used when `logit_bias_type` is
+            "tokens"; not touched otherwise.
+        logit_bias: Mapping of key to bias value. Keys are integer ids
+            written as strings ("input_ids") or text to tokenize ("tokens").
+        logit_bias_type: How to interpret the keys; `None` is treated as
+            "input_ids".
+
+    Returns:
+        A callable `(input_ids, scores) -> new_scores` that returns a new
+        list where the score at each index is offset by the bias registered
+        for that index (0.0 when none is registered).
+    """
+    mode = "input_ids" if logit_bias_type is None else logit_bias_type
+
+    bias_by_id: Dict[int, float] = {}
+    if mode == "input_ids":
+        # Keys are strings; convert each one to its integer id.
+        for raw_id, bias in logit_bias.items():
+            bias_by_id[int(raw_id)] = bias
+    elif mode == "tokens":
+        # Every id produced by tokenizing the key gets the same bias.
+        for text, bias in logit_bias.items():
+            encoded = text.encode('utf-8')
+            for token_id in llama.tokenize(encoded, add_bos=False):
+                bias_by_id[token_id] = bias
+
+    def logit_bias_processor(
+        input_ids: List[int],
+        scores: List[float],
+    ) -> List[float]:
+        # The index of a score in `scores` is used as the lookup id.
+        return [
+            score + bias_by_id.get(token_id, 0.0)
+            for token_id, score in enumerate(scores)
+        ]
+
+    return logit_bias_processor
+
+
@router.post(
    "/v1/completions",
    response_model=CreateCompletionResponse,
@ -291,9 +343,16 @@ async def create_completion(
        "n",
        "best_of",
        "logit_bias",
+        "logit_bias_type",
        "user",
    }
    kwargs = body.dict(exclude=exclude)
+
+    if body.logit_bias is not None:
+        kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([
+            make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type),
+        ])
+
    if body.stream:
        send_chan, recv_chan = anyio.create_memory_object_stream(10)

@ -372,16 +431,17 @@ class CreateChatCompletionRequest(BaseModel):
    stream: bool = stream_field
    presence_penalty: Optional[float] = presence_penalty_field
    frequency_penalty: Optional[float] = frequency_penalty_field
+    logit_bias: Optional[Dict[str, float]] = Field(None)

    # ignored or currently unsupported
    model: Optional[str] = model_field
    n: Optional[int] = 1
-    logit_bias: Optional[Dict[str, float]] = Field(None)
    user: Optional[str] = Field(None)

    # llama.cpp specific parameters
    top_k: int = top_k_field
    repeat_penalty: float = repeat_penalty_field
+    logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)

    class Config:
        schema_extra = {
@ -413,9 +473,16 @@ async def create_chat_completion(
    exclude = {
        "n",
        "logit_bias",
+        "logit_bias_type",
        "user",
    }
    kwargs = body.dict(exclude=exclude)
+
+    if body.logit_bias is not None:
+        kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([
+            make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type),
+        ])
+
    if body.stream:
        send_chan, recv_chan = anyio.create_memory_object_stream(10)

--- a/poetry.lock
+++ b/poetry.lock
@ -1,10 +1,9 @@
-# This file is automatically @generated by Poetry and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.

 [[package]]
 name = "anyio"
 version = "3.6.2"
 description = "High level compatibility layer for multiple asynchronous event loop implementations"
-category = "main"
 optional = false
 python-versions = ">=3.6.2"
 files = [
@ -25,7 +24,6 @@ trio = ["trio (>=0.16,<0.22)"]
 name = "black"
 version = "23.3.0"
 description = "The uncompromising code formatter."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -75,7 +73,6 @@ uvloop = ["uvloop (>=0.15.2)"]
 name = "bleach"
 version = "6.0.0"
 description = "An easy safelist-based HTML-sanitizing tool."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -94,7 +91,6 @@ css = ["tinycss2 (>=1.1.0,<1.2)"]
 name = "certifi"
 version = "2023.5.7"
 description = "Python package for providing Mozilla's CA Bundle."
-category = "dev"
 optional = false
 python-versions = ">=3.6"
 files = [
@ -106,7 +102,6 @@ files = [
 name = "cffi"
 version = "1.15.1"
 description = "Foreign Function Interface for Python calling C code."
-category = "dev"
 optional = false
 python-versions = "*"
 files = [
@ -183,7 +178,6 @@ pycparser = "*"
 name = "charset-normalizer"
 version = "3.1.0"
 description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
-category = "dev"
 optional = false
 python-versions = ">=3.7.0"
 files = [
@ -268,7 +262,6 @@ files = [
 name = "click"
 version = "8.1.3"
 description = "Composable command line interface toolkit"
-category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -283,7 +276,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""}
 name = "colorama"
 version = "0.4.6"
 description = "Cross-platform colored terminal text."
-category = "main"
 optional = false
 python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
 files = [
@ -295,7 +287,6 @@ files = [
 name = "cryptography"
 version = "40.0.2"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
-category = "dev"
 optional = false
 python-versions = ">=3.6"
 files = [
@ -333,11 +324,21 @@ test = ["iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-co
 test-randomorder = ["pytest-randomly"]
 tox = ["tox"]

+[[package]]
+name = "diskcache"
+version = "5.6.1"
+description = "Disk Cache -- Disk and file backed persistent cache."
+optional = false
+python-versions = ">=3"
+files = [
+    {file = "diskcache-5.6.1-py3-none-any.whl", hash = "sha256:558c6a2d5d7c721bb00e40711803d6804850c9f76c426ed81ecc627fe9d2ce2d"},
+    {file = "diskcache-5.6.1.tar.gz", hash = "sha256:e4c978532feff5814c4cc00fe1e11e40501985946643d73220d41ee7737c72c3"},
+]
+
 [[package]]
 name = "distro"
 version = "1.8.0"
 description = "Distro - an OS platform information API"
-category = "dev"
 optional = false
 python-versions = ">=3.6"
 files = [
@ -349,7 +350,6 @@ files = [
 name = "docutils"
 version = "0.20"
 description = "Docutils -- Python Documentation Utilities"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -361,7 +361,6 @@ files = [
 name = "exceptiongroup"
 version = "1.1.1"
 description = "Backport of PEP 654 (exception groups)"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -374,31 +373,26 @@ test = ["pytest (>=6)"]

 [[package]]
 name = "fastapi"
-version = "0.96.0"
+version = "0.97.0"
 description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production"
-category = "main"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "fastapi-0.96.0-py3-none-any.whl", hash = "sha256:b8e11fe81e81eab4e1504209917338e0b80f783878a42c2b99467e5e1019a1e9"},
-    {file = "fastapi-0.96.0.tar.gz", hash = "sha256:71232d47c2787446991c81c41c249f8a16238d52d779c0e6b43927d3773dbe3c"},
+    {file = "fastapi-0.97.0-py3-none-any.whl", hash = "sha256:95d757511c596409930bd20673358d4a4d709004edb85c5d24d6ffc48fabcbf2"},
+    {file = "fastapi-0.97.0.tar.gz", hash = "sha256:b53248ee45f64f19bb7600953696e3edf94b0f7de94df1e5433fc5c6136fa986"},
 ]

 [package.dependencies]
-pydantic = ">=1.6.2,<1.7 || >1.7,<1.7.1 || >1.7.1,<1.7.2 || >1.7.2,<1.7.3 || >1.7.3,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0"
+pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0"
 starlette = ">=0.27.0,<0.28.0"

 [package.extras]
 all = ["email-validator (>=1.1.1)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "python-multipart (>=0.0.5)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"]
-dev = ["pre-commit (>=2.17.0,<3.0.0)", "ruff (==0.0.138)", "uvicorn[standard] (>=0.12.0,<0.21.0)"]
-doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-markdownextradata-plugin (>=0.1.7,<0.3.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pyyaml (>=5.3.1,<7.0.0)", "typer-cli (>=0.0.13,<0.0.14)", "typer[all] (>=0.6.1,<0.8.0)"]
-test = ["anyio[trio] (>=3.2.1,<4.0.0)", "black (==23.1.0)", "coverage[toml] (>=6.5.0,<8.0)", "databases[sqlite] (>=0.3.2,<0.7.0)", "email-validator (>=1.1.1,<2.0.0)", "flask (>=1.1.2,<3.0.0)", "httpx (>=0.23.0,<0.24.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.982)", "orjson (>=3.2.1,<4.0.0)", "passlib[bcrypt] (>=1.7.2,<2.0.0)", "peewee (>=3.13.3,<4.0.0)", "pytest (>=7.1.3,<8.0.0)", "python-jose[cryptography] (>=3.3.0,<4.0.0)", "python-multipart (>=0.0.5,<0.0.7)", "pyyaml (>=5.3.1,<7.0.0)", "ruff (==0.0.138)", "sqlalchemy (>=1.3.18,<1.4.43)", "types-orjson (==3.6.2)", "types-ujson (==5.7.0.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0,<6.0.0)"]

 [[package]]
 name = "ghp-import"
 version = "2.1.0"
 description = "Copy your docs directly to the gh-pages branch."
-category = "dev"
 optional = false
 python-versions = "*"
 files = [
@ -416,7 +410,6 @@ dev = ["flake8", "markdown", "twine", "wheel"]
 name = "griffe"
 version = "0.27.3"
 description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -431,7 +424,6 @@ colorama = ">=0.4"
 name = "h11"
 version = "0.14.0"
 description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
-category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -443,7 +435,6 @@ files = [
 name = "httpcore"
 version = "0.17.0"
 description = "A minimal low-level HTTP client."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -455,17 +446,16 @@ files = [
 anyio = ">=3.0,<5.0"
 certifi = "*"
 h11 = ">=0.13,<0.15"
-sniffio = ">=1.0.0,<2.0.0"
+sniffio = "==1.*"

 [package.extras]
 http2 = ["h2 (>=3,<5)"]
-socks = ["socksio (>=1.0.0,<2.0.0)"]
+socks = ["socksio (==1.*)"]

 [[package]]
 name = "httpx"
 version = "0.24.1"
 description = "The next generation HTTP client."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -481,15 +471,14 @@ sniffio = "*"

 [package.extras]
 brotli = ["brotli", "brotlicffi"]
-cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<14)"]
+cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
 http2 = ["h2 (>=3,<5)"]
-socks = ["socksio (>=1.0.0,<2.0.0)"]
+socks = ["socksio (==1.*)"]

 [[package]]
 name = "idna"
 version = "3.4"
 description = "Internationalized Domain Names in Applications (IDNA)"
-category = "main"
 optional = false
 python-versions = ">=3.5"
 files = [
@ -501,7 +490,6 @@ files = [
 name = "importlib-metadata"
 version = "6.6.0"
 description = "Read metadata from Python packages"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -521,7 +509,6 @@ testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packag
 name = "importlib-resources"
 version = "5.12.0"
 description = "Read resources from Python packages"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -540,7 +527,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec
 name = "iniconfig"
 version = "2.0.0"
 description = "brain-dead simple config-ini parsing"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -552,7 +538,6 @@ files = [
 name = "jaraco-classes"
 version = "3.2.3"
 description = "Utility functions for Python class constructs"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -571,7 +556,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec
 name = "jeepney"
 version = "0.8.0"
 description = "Low-level, pure Python DBus protocol wrapper."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -587,7 +571,6 @@ trio = ["async_generator", "trio"]
 name = "jinja2"
 version = "3.1.2"
 description = "A very fast and expressive template engine."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -605,7 +588,6 @@ i18n = ["Babel (>=2.7)"]
 name = "keyring"
 version = "23.13.1"
 description = "Store and access your passwords safely."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -630,7 +612,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec
 name = "markdown"
 version = "3.3.7"
 description = "Python implementation of Markdown."
-category = "dev"
 optional = false
 python-versions = ">=3.6"
 files = [
@ -648,7 +629,6 @@ testing = ["coverage", "pyyaml"]
 name = "markdown-it-py"
 version = "2.2.0"
 description = "Python port of markdown-it. Markdown parsing, done right!"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -673,7 +653,6 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]
 name = "markupsafe"
 version = "2.1.2"
 description = "Safely add untrusted strings to HTML/XML markup."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -733,7 +712,6 @@ files = [
 name = "mdurl"
 version = "0.1.2"
 description = "Markdown URL utilities"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -745,7 +723,6 @@ files = [
 name = "mergedeep"
 version = "1.3.4"
 description = "A deep merge function for 🐍."
-category = "dev"
 optional = false
 python-versions = ">=3.6"
 files = [
@ -757,7 +734,6 @@ files = [
 name = "mkdocs"
 version = "1.4.3"
 description = "Project documentation with Markdown."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -786,7 +762,6 @@ min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-imp
 name = "mkdocs-autorefs"
 version = "0.4.1"
 description = "Automatically link across pages in MkDocs."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -800,14 +775,13 @@ mkdocs = ">=1.1"

 [[package]]
 name = "mkdocs-material"
-version = "9.1.15"
+version = "9.1.16"
 description = "Documentation that simply works"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "mkdocs_material-9.1.15-py3-none-any.whl", hash = "sha256:b49e12869ab464558e2dd3c5792da5b748a7e0c48ee83b4d05715f98125a7a39"},
-    {file = "mkdocs_material-9.1.15.tar.gz", hash = "sha256:8513ab847c9a541ed3d11a3a7eed556caf72991ee786c31c5aac6691a121088a"},
+    {file = "mkdocs_material-9.1.16-py3-none-any.whl", hash = "sha256:f9e62558a6b01ffac314423cbc223d970c25fbc78999860226245b64e64d6751"},
+    {file = "mkdocs_material-9.1.16.tar.gz", hash = "sha256:1021bfea20f00a9423530c8c2ae9be3c78b80f5a527b3f822e6de3d872e5ab79"},
 ]

 [package.dependencies]
@ -825,7 +799,6 @@ requests = ">=2.26"
 name = "mkdocs-material-extensions"
 version = "1.1.1"
 description = "Extension pack for Python Markdown and MkDocs Material."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -837,7 +810,6 @@ files = [
 name = "mkdocstrings"
 version = "0.22.0"
 description = "Automatic documentation from sources, for MkDocs."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -865,7 +837,6 @@ python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"]
 name = "mkdocstrings-python"
 version = "0.10.1"
 description = "A Python handler for mkdocstrings."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -881,7 +852,6 @@ mkdocstrings = ">=0.20"
 name = "more-itertools"
 version = "9.1.0"
 description = "More routines for operating on iterables, beyond itertools"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -893,7 +863,6 @@ files = [
 name = "mypy-extensions"
 version = "1.0.0"
 description = "Type system extensions for programs checked with the mypy type checker."
-category = "dev"
 optional = false
 python-versions = ">=3.5"
 files = [
@ -905,7 +874,6 @@ files = [
 name = "numpy"
 version = "1.24.3"
 description = "Fundamental package for array computing in Python"
-category = "main"
 optional = false
 python-versions = ">=3.8"
 files = [
@ -943,7 +911,6 @@ files = [
 name = "packaging"
 version = "23.1"
 description = "Core utilities for Python packages"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -955,7 +922,6 @@ files = [
 name = "pathspec"
 version = "0.11.1"
 description = "Utility library for gitignore style pattern matching of file paths."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -967,7 +933,6 @@ files = [
 name = "pkginfo"
 version = "1.9.6"
 description = "Query metadata from sdists / bdists / installed packages."
-category = "dev"
 optional = false
 python-versions = ">=3.6"
 files = [
@ -982,7 +947,6 @@ testing = ["pytest", "pytest-cov"]
 name = "platformdirs"
 version = "3.5.0"
 description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -998,7 +962,6 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-
 name = "pluggy"
 version = "1.0.0"
 description = "plugin and hook calling mechanisms for python"
-category = "dev"
 optional = false
 python-versions = ">=3.6"
 files = [
@ -1014,7 +977,6 @@ testing = ["pytest", "pytest-benchmark"]
 name = "pycparser"
 version = "2.21"
 description = "C parser in Python"
-category = "dev"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 files = [
@ -1026,7 +988,6 @@ files = [
 name = "pydantic"
 version = "1.10.7"
 description = "Data validation and settings management using python type hints"
-category = "main"
 optional = true
 python-versions = ">=3.7"
 files = [
@ -1079,7 +1040,6 @@ email = ["email-validator (>=1.0.3)"]
 name = "pygments"
 version = "2.15.1"
 description = "Pygments is a syntax highlighting package written in Python."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -1094,7 +1054,6 @@ plugins = ["importlib-metadata"]
 name = "pymdown-extensions"
 version = "9.11"
 description = "Extension pack for Python Markdown."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -1108,14 +1067,13 @@ pyyaml = "*"

 [[package]]
 name = "pytest"
-version = "7.3.1"
+version = "7.3.2"
 description = "pytest: simple powerful testing with Python"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"},
-    {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"},
+    {file = "pytest-7.3.2-py3-none-any.whl", hash = "sha256:cdcbd012c9312258922f8cd3f1b62a6580fdced17db6014896053d47cddf9295"},
+    {file = "pytest-7.3.2.tar.gz", hash = "sha256:ee990a3cc55ba808b80795a79944756f315c67c12b56abd3ac993a7b8c17030b"},
 ]

 [package.dependencies]
@ -1127,13 +1085,12 @@ pluggy = ">=0.12,<2.0"
 tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}

 [package.extras]
-testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]
+testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]

 [[package]]
 name = "python-dateutil"
 version = "2.8.2"
 description = "Extensions to the standard Python datetime module"
-category = "dev"
 optional = false
 python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
 files = [
@ -1148,7 +1105,6 @@ six = ">=1.5"
 name = "pywin32-ctypes"
 version = "0.2.0"
 description = ""
-category = "dev"
 optional = false
 python-versions = "*"
 files = [
@ -1160,7 +1116,6 @@ files = [
 name = "pyyaml"
 version = "6.0"
 description = "YAML parser and emitter for Python"
-category = "dev"
 optional = false
 python-versions = ">=3.6"
 files = [
@ -1210,7 +1165,6 @@ files = [
 name = "pyyaml-env-tag"
 version = "0.1"
 description = "A custom YAML tag for referencing environment variables in YAML files. "
-category = "dev"
 optional = false
 python-versions = ">=3.6"
 files = [
@ -1225,7 +1179,6 @@ pyyaml = "*"
 name = "readme-renderer"
 version = "37.3"
 description = "readme_renderer is a library for rendering \"readme\" descriptions for Warehouse"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -1245,7 +1198,6 @@ md = ["cmarkgfm (>=0.8.0)"]
 name = "regex"
 version = "2023.5.5"
 description = "Alternative regular expression module, to replace re."
-category = "dev"
 optional = false
 python-versions = ">=3.6"
 files = [
@ -1343,7 +1295,6 @@ files = [
 name = "requests"
 version = "2.30.0"
 description = "Python HTTP for Humans."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -1365,7 +1316,6 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 name = "requests-toolbelt"
 version = "1.0.0"
 description = "A utility belt for advanced users of python-requests"
-category = "dev"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 files = [
@ -1380,7 +1330,6 @@ requests = ">=2.0.1,<3.0.0"
 name = "rfc3986"
 version = "2.0.0"
 description = "Validating URI References per RFC 3986"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -1395,7 +1344,6 @@ idna2008 = ["idna"]
 name = "rich"
 version = "13.3.5"
 description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
-category = "dev"
 optional = false
 python-versions = ">=3.7.0"
 files = [
@ -1415,7 +1363,6 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"]
 name = "scikit-build"
 version = "0.17.6"
 description = "Improved build system generator for Python C/C++/Fortran/Cython extensions"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -1440,7 +1387,6 @@ test = ["build (>=0.7)", "cython (>=0.25.1)", "importlib-metadata", "pytest (>=6
 name = "secretstorage"
 version = "3.3.3"
 description = "Python bindings to FreeDesktop.org Secret Service API"
-category = "dev"
 optional = false
 python-versions = ">=3.6"
 files = [
@ -1456,7 +1402,6 @@ jeepney = ">=0.6"
 name = "setuptools"
 version = "67.7.2"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -1473,7 +1418,6 @@ testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (
 name = "six"
 version = "1.16.0"
 description = "Python 2 and 3 compatibility utilities"
-category = "dev"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
 files = [
@ -1485,7 +1429,6 @@ files = [
 name = "sniffio"
 version = "1.3.0"
 description = "Sniff out which async library your code is running under"
-category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -1497,7 +1440,6 @@ files = [
 name = "sse-starlette"
 version = "1.6.1"
 description = "\"SSE plugin for Starlette\""
-category = "main"
 optional = true
 python-versions = ">=3.8"
 files = [
@ -1512,7 +1454,6 @@ starlette = "*"
 name = "starlette"
 version = "0.27.0"
 description = "The little ASGI library that shines."
-category = "main"
 optional = true
 python-versions = ">=3.7"
 files = [
@ -1531,7 +1472,6 @@ full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyam
 name = "tomli"
 version = "2.0.1"
 description = "A lil' TOML parser"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -1543,7 +1483,6 @@ files = [
 name = "twine"
 version = "4.0.2"
 description = "Collection of utilities for publishing packages on PyPI"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -1566,7 +1505,6 @@ urllib3 = ">=1.26.0"
 name = "typing-extensions"
 version = "4.6.3"
 description = "Backported and Experimental Type Hints for Python 3.7+"
-category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -1578,7 +1516,6 @@ files = [
 name = "urllib3"
 version = "2.0.2"
 description = "HTTP library with thread-safe connection pooling, file post, and more."
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -1596,7 +1533,6 @@ zstd = ["zstandard (>=0.18.0)"]
 name = "uvicorn"
 version = "0.22.0"
 description = "The lightning-fast ASGI server."
-category = "main"
 optional = true
 python-versions = ">=3.7"
 files = [
@ -1615,7 +1551,6 @@ standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)",
 name = "watchdog"
 version = "3.0.0"
 description = "Filesystem events monitoring"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -1655,7 +1590,6 @@ watchmedo = ["PyYAML (>=3.10)"]
 name = "webencodings"
 version = "0.5.1"
 description = "Character encoding aliases for legacy web content"
-category = "dev"
 optional = false
 python-versions = "*"
 files = [
@ -1667,7 +1601,6 @@ files = [
 name = "wheel"
 version = "0.40.0"
 description = "A built-package format for Python"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -1682,7 +1615,6 @@ test = ["pytest (>=6.0.0)"]
 name = "zipp"
 version = "3.15.0"
 description = "Backport of pathlib-compatible object wrapper for zip files"
-category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
@ -1695,9 +1627,9 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker
 testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"]

 [extras]
-server = ["uvicorn", "fastapi", "sse-starlette"]
+server = ["fastapi", "sse-starlette", "uvicorn"]

 [metadata]
 lock-version = "2.0"
 python-versions = "^3.8.1"
-content-hash = "5c3354c253bc7ab7c7577a9a3733c7a341e91176e1d0c13dc2e3f3dcc0971bbe"
+content-hash = "fabdd2d7dba563fe7b01b4592dfb33e520b5f6e67317ce5f03205ecba396a577"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp_python"
-version = "0.1.62"
+version = "0.1.65"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen <abetlen@gmail.com>"]
 license = "MIT"
@ -18,7 +18,7 @@ typing-extensions = "^4.6.3"
 numpy = "^1.20.0"
 diskcache = "^5.6.1"
 uvicorn = { version = "^0.22.0", optional = true }
-fastapi = { version = "^0.96.0", optional = true }
+fastapi = { version = "^0.97.0", optional = true }
 sse-starlette = { version = "^1.6.1", optional = true }

 [tool.poetry.group.dev.dependencies]
@ -26,8 +26,8 @@ black = "^23.3.0"
 twine = "^4.0.2"
 mkdocs = "^1.4.3"
 mkdocstrings = {extras = ["python"], version = "^0.22.0"}
-mkdocs-material = "^9.1.15"
-pytest = "^7.3.1"
+mkdocs-material = "^9.1.16"
+pytest = "^7.3.2"
 httpx = "^0.24.1"
 scikit-build = "0.17.6"

--- a/setup.py
+++ b/setup.py
@ -10,7 +10,7 @@ setup(
    description="A Python wrapper for llama.cpp",
    long_description=long_description,
    long_description_content_type="text/markdown",
-    version="0.1.62",
+    version="0.1.65",
    author="Andrei Betlen",
    author_email="abetlen@gmail.com",
    license="MIT",
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@ -1 +1 @@
-Subproject commit 4de0334f5cabf4696eced2e5d6e279fdfaa6c0f2
+Subproject commit 2322ec223a21625dfe9bd73ee677444a98a24ac9