From 60e85cbe4676ddfc5849d9b147de72f339516cde Mon Sep 17 00:00:00 2001
From: Huge
Date: Wed, 2 Aug 2023 12:27:08 +0200
Subject: [PATCH 01/16] Fix dev setup in README.md so that everyone can run it

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ea1e07f..614ba03 100644
--- a/README.md
+++ b/README.md
@@ -201,7 +201,7 @@ This package is under active development and I welcome any contributions.
 To get started, clone the repository and install the package in development mode:
 
 ```bash
-git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git
+git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git
 cd llama-cpp-python
 
 # Install with pip

From cdab73536b4371679dbeb53e08a324194713ec49 Mon Sep 17 00:00:00 2001
From: Ihsan Soydemir
Date: Thu, 3 Aug 2023 16:36:50 +0200
Subject: [PATCH 02/16] Docker improvements

---
 docker/README.md | 50 ++++++++++++++++++++++++-------------------------
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/docker/README.md b/docker/README.md
index 053d311..9ffd3b1 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -1,13 +1,13 @@
-# Install Docker Server
-
-**Note #1:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
+### Install Docker Server
+> [!IMPORTANT]
+> This was tested with Docker running on Linux.
If you can get it working on Windows or macOS, please update this `README.md` with a PR!
[Install Docker Engine](https://docs.docker.com/engine/install) -**Note #2:** NVidia GPU CuBLAS support requires a NVidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) -# Simple Dockerfiles for building the llama-cpp-python server with external model bin files -## openblas_simple - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image +## Simple Dockerfiles for building the llama-cpp-python server with external model bin files +### openblas_simple +A simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image: ``` cd ./openblas_simple docker build -t openblas_simple . @@ -15,23 +15,30 @@ docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v : ``` where `/` is the full path to the model file on the Docker host system. -## cuda_simple - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image +### cuda_simple +> [!WARNING] +> Nvidia GPU CuBLAS support requires an Nvidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker Nvidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
+ +A simple Dockerfile for CUDA-accelerated CuBLAS, where the model is located outside the Docker image: + ``` cd ./cuda_simple docker build -t cuda_simple . -docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t cuda_simple +docker run --gpus=all --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t cuda_simple ``` where `/` is the full path to the model file on the Docker host system. -# "Open-Llama-in-a-box" -## Download an Apache V2.0 licensed 3B paramter Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server +-------------------------------------------------------------------------- + +### "Open-Llama-in-a-box" +Download an Apache V2.0 licensed 3B params Open LLaMA model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server: ``` $ cd ./open_llama ./build.sh ./start.sh ``` -# Manually choose your own Llama model from Hugging Face +### Manually choose your own Llama model from Hugging Face `python3 ./hug_model.py -a TheBloke -t llama` You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. ``` @@ -39,8 +46,10 @@ docker $ ls -lh *.bin -rw-rw-r-- 1 user user 4.8G May 23 18:30 q5_1.bin lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> q5_1.bin ``` -**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least -**TWICE** as much disk space as the size of the model: + +> [!NOTE] +> Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least +**TWICE** as much disk space as the size of the model:
| Model | Quantized size | |------:|----------------:| @@ -50,17 +59,6 @@ lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> q5_ | 33B | 25 GB | | 65B | 50 GB | -**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` -## Use OpenBLAS -Use if you don't have a NVidia GPU. Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS: -### Build: -`docker build -t openblas .` -### Run: -`docker run --cap-add SYS_RESOURCE -t openblas` - -## Use CuBLAS -### Build: -`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .` -### Run: -`docker run --cap-add SYS_RESOURCE -t cublas` +> [!NOTE] +> If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` From a5bc57e2792c28410c6d10b41a2fe71bbb6de721 Mon Sep 17 00:00:00 2001 From: Ihsan Soydemir Date: Thu, 3 Aug 2023 16:49:45 +0200 Subject: [PATCH 03/16] Update README.md --- docker/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/README.md b/docker/README.md index 9ffd3b1..474503f 100644 --- a/docker/README.md +++ b/docker/README.md @@ -11,7 +11,7 @@ A simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the ``` cd ./openblas_simple docker build -t openblas_simple . -docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t openblas_simple +docker run --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t openblas_simple ``` where `/` is the full path to the model file on the Docker host system. From d010ea70d20db9064e9c0c73cab36d611d4d35eb Mon Sep 17 00:00:00 2001 From: Pradyumna Singh Rathore <36568190+pradhyumna85@users.noreply.github.com> Date: Thu, 10 Aug 2023 20:41:34 +0530 Subject: [PATCH 04/16] Fixed Cuda Dockerfile Previously models produced garbage output when running on GPU with layers offloaded. Similar to related fix on another repo: https://github.com/noneabove1182/koboldcpp-docker/commit/331326a0e340ac845855d346f730356b000650c7 --- docker/cuda_simple/Dockerfile | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/docker/cuda_simple/Dockerfile b/docker/cuda_simple/Dockerfile index e4a2f07..e5aaf17 100644 --- a/docker/cuda_simple/Dockerfile +++ b/docker/cuda_simple/Dockerfile @@ -4,13 +4,24 @@ FROM nvidia/cuda:${CUDA_IMAGE} # We need to set the host to 0.0.0.0 to allow outside access ENV HOST 0.0.0.0 +RUN apt-get update && apt-get upgrade -y \ + && apt-get install -y git build-essential \ + python3 python3-pip gcc wget \ + ocl-icd-opencl-dev opencl-headers clinfo \ + libclblast-dev libopenblas-dev \ + && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd + COPY . . 
-# Install the package -RUN apt update && apt install -y python3 python3-pip +# setting build related env vars +ENV CUDA_DOCKER_ARCH=all +ENV LLAMA_CUBLAS=1 + +# Install depencencies RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings -RUN LLAMA_CUBLAS=1 pip install llama-cpp-python +# Install llama-cpp-python (build with cuda) +RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python # Run the server CMD python3 -m llama_cpp.server From abca3d81c863f3951e6970ccdf3949aca74bdb09 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Aug 2023 20:53:56 +0000 Subject: [PATCH 05/16] Bump mkdocs-material from 9.1.21 to 9.2.0 Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.1.21 to 9.2.0. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.1.21...9.2.0) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- poetry.lock | 245 ++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 2 +- 2 files changed, 241 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index 273cedc..ec65da1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -34,6 +34,38 @@ doc = ["packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] test = ["contextlib2", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (<0.15)", "uvloop (>=0.15)"] trio = ["trio (>=0.16,<0.22)"] +[[package]] +name = "babel" +version = "2.12.1" +description = "Internationalization utilities" +optional = false +python-versions = ">=3.7" +files = [ + {file = "Babel-2.12.1-py3-none-any.whl", hash = "sha256:b4246fb7677d3b98f501a39d43396d3cafdc8eadb045f4a31be01863f655c610"}, + {file = "Babel-2.12.1.tar.gz", hash = "sha256:cc2d99999cd01d44420ae725a21c9e3711b3aadc7976d6147f622d8581963455"}, +] + +[package.dependencies] +pytz = {version = ">=2015.7", markers = "python_version < \"3.9\""} + +[[package]] +name = "beautifulsoup4" +version = "4.12.2" +description = "Screen-scraping library" +optional = false +python-versions = ">=3.6.0" +files = [ + {file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"}, + {file = "beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"}, +] + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "black" version = "23.7.0" @@ -335,6 +367,17 @@ test = ["iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-co test-randomorder = ["pytest-randomly"] tox = ["tox"] +[[package]] +name = "cssselect" +version = "1.2.0" +description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0" +optional = false +python-versions = ">=3.7" +files = [ + {file = "cssselect-1.2.0-py2.py3-none-any.whl", hash = "sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e"}, + {file = "cssselect-1.2.0.tar.gz", hash = "sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc"}, +] + [[package]] name = "diskcache" version = 
"5.6.1" @@ -620,6 +663,113 @@ completion = ["shtab"] docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)"] testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] +[[package]] +name = "lxml" +version = "4.9.3" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" +files = [ + {file = "lxml-4.9.3-cp27-cp27m-macosx_11_0_x86_64.whl", hash = "sha256:b0a545b46b526d418eb91754565ba5b63b1c0b12f9bd2f808c852d9b4b2f9b5c"}, + {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:075b731ddd9e7f68ad24c635374211376aa05a281673ede86cbe1d1b3455279d"}, + {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1e224d5755dba2f4a9498e150c43792392ac9b5380aa1b845f98a1618c94eeef"}, + {file = "lxml-4.9.3-cp27-cp27m-win32.whl", hash = "sha256:2c74524e179f2ad6d2a4f7caf70e2d96639c0954c943ad601a9e146c76408ed7"}, + {file = "lxml-4.9.3-cp27-cp27m-win_amd64.whl", hash = "sha256:4f1026bc732b6a7f96369f7bfe1a4f2290fb34dce00d8644bc3036fb351a4ca1"}, + {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c0781a98ff5e6586926293e59480b64ddd46282953203c76ae15dbbbf302e8bb"}, + {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cef2502e7e8a96fe5ad686d60b49e1ab03e438bd9123987994528febd569868e"}, + {file = "lxml-4.9.3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:b86164d2cff4d3aaa1f04a14685cbc072efd0b4f99ca5708b2ad1b9b5988a991"}, + {file = "lxml-4.9.3-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:42871176e7896d5d45138f6d28751053c711ed4d48d8e30b498da155af39aebd"}, + {file = "lxml-4.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ae8b9c6deb1e634ba4f1930eb67ef6e6bf6a44b6eb5ad605642b2d6d5ed9ce3c"}, + {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:411007c0d88188d9f621b11d252cce90c4a2d1a49db6c068e3c16422f306eab8"}, + {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:cd47b4a0d41d2afa3e58e5bf1f62069255aa2fd6ff5ee41604418ca925911d76"}, + {file = "lxml-4.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e2cb47860da1f7e9a5256254b74ae331687b9672dfa780eed355c4c9c3dbd23"}, + {file = "lxml-4.9.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1247694b26342a7bf47c02e513d32225ededd18045264d40758abeb3c838a51f"}, + {file = "lxml-4.9.3-cp310-cp310-win32.whl", hash = "sha256:cdb650fc86227eba20de1a29d4b2c1bfe139dc75a0669270033cb2ea3d391b85"}, + {file = "lxml-4.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:97047f0d25cd4bcae81f9ec9dc290ca3e15927c192df17331b53bebe0e3ff96d"}, + {file = "lxml-4.9.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:1f447ea5429b54f9582d4b955f5f1985f278ce5cf169f72eea8afd9502973dd5"}, + {file = "lxml-4.9.3-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:57d6ba0ca2b0c462f339640d22882acc711de224d769edf29962b09f77129cbf"}, + {file = "lxml-4.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:9767e79108424fb6c3edf8f81e6730666a50feb01a328f4a016464a5893f835a"}, + {file = 
"lxml-4.9.3-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:71c52db65e4b56b8ddc5bb89fb2e66c558ed9d1a74a45ceb7dcb20c191c3df2f"}, + {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d73d8ecf8ecf10a3bd007f2192725a34bd62898e8da27eb9d32a58084f93962b"}, + {file = "lxml-4.9.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0a3d3487f07c1d7f150894c238299934a2a074ef590b583103a45002035be120"}, + {file = "lxml-4.9.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9e28c51fa0ce5674be9f560c6761c1b441631901993f76700b1b30ca6c8378d6"}, + {file = "lxml-4.9.3-cp311-cp311-win32.whl", hash = "sha256:0bfd0767c5c1de2551a120673b72e5d4b628737cb05414f03c3277bf9bed3305"}, + {file = "lxml-4.9.3-cp311-cp311-win_amd64.whl", hash = "sha256:25f32acefac14ef7bd53e4218fe93b804ef6f6b92ffdb4322bb6d49d94cad2bc"}, + {file = "lxml-4.9.3-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:d3ff32724f98fbbbfa9f49d82852b159e9784d6094983d9a8b7f2ddaebb063d4"}, + {file = "lxml-4.9.3-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:48d6ed886b343d11493129e019da91d4039826794a3e3027321c56d9e71505be"}, + {file = "lxml-4.9.3-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:9a92d3faef50658dd2c5470af249985782bf754c4e18e15afb67d3ab06233f13"}, + {file = "lxml-4.9.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b4e4bc18382088514ebde9328da057775055940a1f2e18f6ad2d78aa0f3ec5b9"}, + {file = "lxml-4.9.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:fc9b106a1bf918db68619fdcd6d5ad4f972fdd19c01d19bdb6bf63f3589a9ec5"}, + {file = "lxml-4.9.3-cp312-cp312-win_amd64.whl", hash = "sha256:d37017287a7adb6ab77e1c5bee9bcf9660f90ff445042b790402a654d2ad81d8"}, + {file = "lxml-4.9.3-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:56dc1f1ebccc656d1b3ed288f11e27172a01503fc016bcabdcbc0978b19352b7"}, + {file = "lxml-4.9.3-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:578695735c5a3f51569810dfebd05dd6f888147a34f0f98d4bb27e92b76e05c2"}, + {file = "lxml-4.9.3-cp35-cp35m-win32.whl", hash = "sha256:704f61ba8c1283c71b16135caf697557f5ecf3e74d9e453233e4771d68a1f42d"}, + {file = "lxml-4.9.3-cp35-cp35m-win_amd64.whl", hash = "sha256:c41bfca0bd3532d53d16fd34d20806d5c2b1ace22a2f2e4c0008570bf2c58833"}, + {file = "lxml-4.9.3-cp36-cp36m-macosx_11_0_x86_64.whl", hash = "sha256:64f479d719dc9f4c813ad9bb6b28f8390360660b73b2e4beb4cb0ae7104f1c12"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:dd708cf4ee4408cf46a48b108fb9427bfa00b9b85812a9262b5c668af2533ea5"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c31c7462abdf8f2ac0577d9f05279727e698f97ecbb02f17939ea99ae8daa98"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e3cd95e10c2610c360154afdc2f1480aea394f4a4f1ea0a5eacce49640c9b190"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:4930be26af26ac545c3dffb662521d4e6268352866956672231887d18f0eaab2"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4aec80cde9197340bc353d2768e2a75f5f60bacda2bab72ab1dc499589b3878c"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:14e019fd83b831b2e61baed40cab76222139926b1fb5ed0e79225bc0cae14584"}, + {file = "lxml-4.9.3-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:0c0850c8b02c298d3c7006b23e98249515ac57430e16a166873fc47a5d549287"}, + 
{file = "lxml-4.9.3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:aca086dc5f9ef98c512bac8efea4483eb84abbf926eaeedf7b91479feb092458"}, + {file = "lxml-4.9.3-cp36-cp36m-win32.whl", hash = "sha256:50baa9c1c47efcaef189f31e3d00d697c6d4afda5c3cde0302d063492ff9b477"}, + {file = "lxml-4.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bef4e656f7d98aaa3486d2627e7d2df1157d7e88e7efd43a65aa5dd4714916cf"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:46f409a2d60f634fe550f7133ed30ad5321ae2e6630f13657fb9479506b00601"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:4c28a9144688aef80d6ea666c809b4b0e50010a2aca784c97f5e6bf143d9f129"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:141f1d1a9b663c679dc524af3ea1773e618907e96075262726c7612c02b149a4"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:53ace1c1fd5a74ef662f844a0413446c0629d151055340e9893da958a374f70d"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17a753023436a18e27dd7769e798ce302963c236bc4114ceee5b25c18c52c693"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7d298a1bd60c067ea75d9f684f5f3992c9d6766fadbc0bcedd39750bf344c2f4"}, + {file = "lxml-4.9.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:081d32421db5df44c41b7f08a334a090a545c54ba977e47fd7cc2deece78809a"}, + {file = "lxml-4.9.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:23eed6d7b1a3336ad92d8e39d4bfe09073c31bfe502f20ca5116b2a334f8ec02"}, + {file = "lxml-4.9.3-cp37-cp37m-win32.whl", hash = "sha256:1509dd12b773c02acd154582088820893109f6ca27ef7291b003d0e81666109f"}, + {file = "lxml-4.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:120fa9349a24c7043854c53cae8cec227e1f79195a7493e09e0c12e29f918e52"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4d2d1edbca80b510443f51afd8496be95529db04a509bc8faee49c7b0fb6d2cc"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:8d7e43bd40f65f7d97ad8ef5c9b1778943d02f04febef12def25f7583d19baac"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:71d66ee82e7417828af6ecd7db817913cb0cf9d4e61aa0ac1fde0583d84358db"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:6fc3c450eaa0b56f815c7b62f2b7fba7266c4779adcf1cece9e6deb1de7305ce"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:65299ea57d82fb91c7f019300d24050c4ddeb7c5a190e076b5f48a2b43d19c42"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:eadfbbbfb41b44034a4c757fd5d70baccd43296fb894dba0295606a7cf3124aa"}, + {file = "lxml-4.9.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:3e9bdd30efde2b9ccfa9cb5768ba04fe71b018a25ea093379c857c9dad262c40"}, + {file = "lxml-4.9.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fcdd00edfd0a3001e0181eab3e63bd5c74ad3e67152c84f93f13769a40e073a7"}, + {file = "lxml-4.9.3-cp38-cp38-win32.whl", hash = "sha256:57aba1bbdf450b726d58b2aea5fe47c7875f5afb2c4a23784ed78f19a0462574"}, + {file = "lxml-4.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:92af161ecbdb2883c4593d5ed4815ea71b31fafd7fd05789b23100d081ecac96"}, + {file = 
"lxml-4.9.3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:9bb6ad405121241e99a86efff22d3ef469024ce22875a7ae045896ad23ba2340"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8ed74706b26ad100433da4b9d807eae371efaa266ffc3e9191ea436087a9d6a7"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:fbf521479bcac1e25a663df882c46a641a9bff6b56dc8b0fafaebd2f66fb231b"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:303bf1edce6ced16bf67a18a1cf8339d0db79577eec5d9a6d4a80f0fb10aa2da"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:5515edd2a6d1a5a70bfcdee23b42ec33425e405c5b351478ab7dc9347228f96e"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:690dafd0b187ed38583a648076865d8c229661ed20e48f2335d68e2cf7dc829d"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:b6420a005548ad52154c8ceab4a1290ff78d757f9e5cbc68f8c77089acd3c432"}, + {file = "lxml-4.9.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bb3bb49c7a6ad9d981d734ef7c7193bc349ac338776a0360cc671eaee89bcf69"}, + {file = "lxml-4.9.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d27be7405547d1f958b60837dc4c1007da90b8b23f54ba1f8b728c78fdb19d50"}, + {file = "lxml-4.9.3-cp39-cp39-win32.whl", hash = "sha256:8df133a2ea5e74eef5e8fc6f19b9e085f758768a16e9877a60aec455ed2609b2"}, + {file = "lxml-4.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:4dd9a263e845a72eacb60d12401e37c616438ea2e5442885f65082c276dfb2b2"}, + {file = "lxml-4.9.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6689a3d7fd13dc687e9102a27e98ef33730ac4fe37795d5036d18b4d527abd35"}, + {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:f6bdac493b949141b733c5345b6ba8f87a226029cbabc7e9e121a413e49441e0"}, + {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:05186a0f1346ae12553d66df1cfce6f251589fea3ad3da4f3ef4e34b2d58c6a3"}, + {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c2006f5c8d28dee289f7020f721354362fa304acbaaf9745751ac4006650254b"}, + {file = "lxml-4.9.3-pp38-pypy38_pp73-macosx_11_0_x86_64.whl", hash = "sha256:5c245b783db29c4e4fbbbfc9c5a78be496c9fea25517f90606aa1f6b2b3d5f7b"}, + {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4fb960a632a49f2f089d522f70496640fdf1218f1243889da3822e0a9f5f3ba7"}, + {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:50670615eaf97227d5dc60de2dc99fb134a7130d310d783314e7724bf163f75d"}, + {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9719fe17307a9e814580af1f5c6e05ca593b12fb7e44fe62450a5384dbf61b4b"}, + {file = "lxml-4.9.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:3331bece23c9ee066e0fb3f96c61322b9e0f54d775fccefff4c38ca488de283a"}, + {file = "lxml-4.9.3-pp39-pypy39_pp73-macosx_11_0_x86_64.whl", hash = "sha256:ed667f49b11360951e201453fc3967344d0d0263aa415e1619e85ae7fd17b4e0"}, + {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8b77946fd508cbf0fccd8e400a7f71d4ac0e1595812e66025bac475a8e811694"}, + {file = 
"lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e4da8ca0c0c0aea88fd46be8e44bd49716772358d648cce45fe387f7b92374a7"}, + {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fe4bda6bd4340caa6e5cf95e73f8fea5c4bfc55763dd42f1b50a94c1b4a2fbd4"}, + {file = "lxml-4.9.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f3df3db1d336b9356dd3112eae5f5c2b8b377f3bc826848567f10bfddfee77e9"}, + {file = "lxml-4.9.3.tar.gz", hash = "sha256:48628bd53a426c9eb9bc066a923acaa0878d1e86129fd5359aee99285f4eed9c"}, +] + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] +source = ["Cython (>=0.29.35)"] + [[package]] name = "markdown" version = "3.3.7" @@ -661,6 +811,22 @@ profiling = ["gprof2dot"] rtd = ["attrs", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] +[[package]] +name = "markdown2" +version = "2.4.10" +description = "A fast and complete Python implementation of Markdown" +optional = false +python-versions = ">=3.5, <4" +files = [ + {file = "markdown2-2.4.10-py2.py3-none-any.whl", hash = "sha256:e6105800483783831f5dc54f827aa5b44eb137ecef5a70293d8ecfbb4109ecc6"}, + {file = "markdown2-2.4.10.tar.gz", hash = "sha256:cdba126d90dc3aef6f4070ac342f974d63f415678959329cc7909f96cc235d72"}, +] + +[package.extras] +all = ["pygments (>=2.7.3)", "wavedrom"] +code-syntax-highlighting = ["pygments (>=2.7.3)"] +wavedrom = ["wavedrom"] + [[package]] name = "markupsafe" version = "2.1.2" @@ -790,23 +956,27 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.21" +version = "9.2.0" description = "Documentation that simply works" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.21-py3-none-any.whl", hash = "sha256:58bb2f11ef240632e176d6f0f7d1cff06be1d11c696a5a1b553b808b4280ed47"}, - {file = "mkdocs_material-9.1.21.tar.gz", hash = "sha256:71940cdfca84ab296b6362889c25395b1621273fb16c93deda257adb7ff44ec8"}, + {file = "mkdocs_material-9.2.0-py3-none-any.whl", hash = "sha256:051e531c78e9cb7e0a6b8b786f0bae7a168e716bc688b5bb1872d9d34f7c2409"}, + {file = "mkdocs_material-9.2.0.tar.gz", hash = "sha256:dfabc32ca49cbd21e456c60cc5c88e10bad17e7c254c23a24dc47d009a704133"}, ] [package.dependencies] +babel = ">=2.10.3" colorama = ">=0.4" jinja2 = ">=3.0" +lxml = ">=4.6" markdown = ">=3.2" -mkdocs = ">=1.5.0" +mkdocs = ">=1.5.2" mkdocs-material-extensions = ">=1.1" +paginate = ">=0.5.6" pygments = ">=2.14" pymdown-extensions = ">=9.9.1" +readtime = ">=2.0" regex = ">=2022.4.24" requests = ">=2.26" @@ -933,6 +1103,16 @@ files = [ {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] +[[package]] +name = "paginate" +version = "0.5.6" +description = "Divides large result sets into pages for easier browsing" +optional = false +python-versions = "*" +files = [ + {file = "paginate-0.5.6.tar.gz", hash = "sha256:5e6007b6a9398177a7e1648d04fdd9f8c9766a1a945bceac82f1929e8c78af2d"}, +] + [[package]] name = "pathspec" version = "0.11.1" @@ -1175,6 +1355,24 @@ files = [ markdown = ">=3.2" pyyaml = "*" +[[package]] +name = "pyquery" +version = "2.0.0" +description = "A jquery-like library for python" +optional = false +python-versions = "*" +files = [ + {file = "pyquery-2.0.0-py3-none-any.whl", hash = "sha256:8dfc9b4b7c5f877d619bbae74b1898d5743f6ca248cfd5d72b504dd614da312f"}, + {file = 
"pyquery-2.0.0.tar.gz", hash = "sha256:963e8d4e90262ff6d8dec072ea97285dc374a2f69cad7776f4082abcf6a1d8ae"}, +] + +[package.dependencies] +cssselect = ">=1.2.0" +lxml = ">=2.1" + +[package.extras] +test = ["pytest", "pytest-cov", "requests", "webob", "webtest"] + [[package]] name = "pytest" version = "7.4.0" @@ -1225,6 +1423,17 @@ files = [ [package.extras] cli = ["click (>=5.0)"] +[[package]] +name = "pytz" +version = "2023.3" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2023.3-py2.py3-none-any.whl", hash = "sha256:a151b3abb88eda1d4e34a9814df37de2a80e301e68ba0fd856fb9b46bfbbbffb"}, + {file = "pytz-2023.3.tar.gz", hash = "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588"}, +] + [[package]] name = "pywin32-ctypes" version = "0.2.0" @@ -1318,6 +1527,21 @@ Pygments = ">=2.5.1" [package.extras] md = ["cmarkgfm (>=0.8.0)"] +[[package]] +name = "readtime" +version = "3.0.0" +description = "Calculates the time some text takes the average human to read, based on Medium's read time forumula" +optional = false +python-versions = "*" +files = [ + {file = "readtime-3.0.0.tar.gz", hash = "sha256:76c5a0d773ad49858c53b42ba3a942f62fbe20cc8c6f07875797ac7dc30963a9"}, +] + +[package.dependencies] +beautifulsoup4 = ">=4.0.1" +markdown2 = ">=2.4.3" +pyquery = ">=1.2" + [[package]] name = "regex" version = "2023.5.5" @@ -1560,6 +1784,17 @@ files = [ {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, ] +[[package]] +name = "soupsieve" +version = "2.4.1" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = false +python-versions = ">=3.7" +files = [ + {file = "soupsieve-2.4.1-py3-none-any.whl", hash = "sha256:1c1bfee6819544a3447586c889157365a27e10d88cde3ad3da0cf0ddf646feb8"}, + {file = "soupsieve-2.4.1.tar.gz", hash = "sha256:89d12b2d5dfcd2c9e8c22326da9d9aa9cb3dfab0a83a024f05704076ee8d35ea"}, +] + [[package]] name = "sse-starlette" version = "1.6.5" @@ -1757,4 +1992,4 @@ server = ["fastapi", "pydantic-settings", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "4bfb67dfb72b02c845376211f7f958b2ece8c985944fbd03d246c858e846ddf6" +content-hash = "4f3d987c81e8a3a06416219b6078ca386164c8c09499720f5991a7b0f2aa8c71" diff --git a/pyproject.toml b/pyproject.toml index 8735b60..f6e8264 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ black = "^23.7.0" twine = "^4.0.2" mkdocs = "^1.5.2" mkdocstrings = {extras = ["python"], version = "^0.22.0"} -mkdocs-material = "^9.1.21" +mkdocs-material = "^9.2.0" pytest = "^7.4.0" httpx = "^0.24.1" scikit-build = "0.17.6" From 4ed632c4b352f80a8cd19ad5923e2981a2bb3c53 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 24 Aug 2023 01:01:05 -0400 Subject: [PATCH 06/16] Remove deprecated params --- llama_cpp/llama.py | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index bfcae18..d6fd830 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -228,7 +228,7 @@ class Llama: rope_freq_scale: float = 1.0, n_gqa: Optional[int] = None, # (TEMPORARY) must be 8 for llama2 70b rms_norm_eps: Optional[float] = None, # (TEMPORARY) - mul_mat_q: Optional[bool] = None, # (TEMPORARY) + mul_mat_q: Optional[bool] = None, verbose: bool = True, ): """Load a llama.cpp model from `model_path`. 
@@ -290,11 +290,6 @@ class Llama: self.params.rope_freq_base = rope_freq_base self.params.rope_freq_scale = rope_freq_scale - if n_gqa is not None: - self.params.n_gqa = n_gqa - - if rms_norm_eps is not None: - self.params.rms_norm_eps = rms_norm_eps if mul_mat_q is not None: self.params.mul_mat_q = mul_mat_q @@ -453,6 +448,8 @@ class Llama: buffer_size = 32 buffer = (ctypes.c_char * buffer_size)() for token in tokens: + if token == llama_cpp.llama_token_bos(self.ctx): + continue n = llama_cpp.llama_token_to_str( self.ctx, llama_cpp.llama_token(token), buffer, buffer_size ) @@ -1585,13 +1582,7 @@ class Llama: lora_base=self.lora_base, lora_path=self.lora_path, tensor_split=self.tensor_split, - ### TEMPORARY ### - n_gqa=self.params.n_gqa, - rms_norm_eps=self.params.rms_norm_eps, - ### TEMPORARY ### - ### DEPRECATED ### - n_parts=self.n_parts, - ### DEPRECATED ### + mul_mat_q=self.params.mul_mat_q, ) def __setstate__(self, state): @@ -1613,14 +1604,8 @@ class Llama: lora_base=state["lora_base"], lora_path=state["lora_path"], tensor_split=state["tensor_split"], + mul_mat_q=state["mul_mat_q"], verbose=state["verbose"], - ### TEMPORARY ### - n_gqa=state["n_gqa"], - rms_norm_eps=state["rms_norm_eps"], - ### TEMPORARY ### - ### DEPRECATED ### - n_parts=state["n_parts"], - ### DEPRECATED ### ) def save_state(self) -> LlamaState: From db982a861fe74ffa88af901cf8d3df07ae68f0e1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 24 Aug 2023 01:01:12 -0400 Subject: [PATCH 07/16] Fix --- llama_cpp/llama_cpp.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index c9200c6..5442708 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1008,15 +1008,24 @@ _lib.llama_token_to_str_bpe.argtypes = [llama_context_p, llama_token, c_char_p, _lib.llama_token_to_str_bpe.restype = c_int -# LLAMA_API const char * llama_token_to_str_with_model( -# const struct llama_model * model, -# llama_token token); -def llama_token_to_str_with_model(model: llama_model_p, token: llama_token) -> bytes: - return _lib.llama_token_to_str_with_model(model, token) +# LLAMA_API int llama_token_to_str_with_model( +# const struct llama_model * model, +# llama_token token, +# char * buf, +# int length); +def llama_token_to_str_with_model( + model: llama_model_p, token: llama_token, buf: bytes, length: c_int +) -> int: + return _lib.llama_token_to_str_with_model(model, token, buf, length) -_lib.llama_token_to_str_with_model.argtypes = [llama_model_p, llama_token] -_lib.llama_token_to_str_with_model.restype = c_char_p +_lib.llama_token_to_str_with_model.argtypes = [ + llama_model_p, + llama_token, + c_char_p, + c_int, +] +_lib.llama_token_to_str_with_model.restype = c_int # // From 3674e5ed4ef2f0271b30eb92d982d8b7524d2f34 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 24 Aug 2023 01:01:20 -0400 Subject: [PATCH 08/16] Update model path --- tests/test_llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_llama.py b/tests/test_llama.py index 941287d..9701321 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -1,6 +1,6 @@ import llama_cpp -MODEL = "./vendor/llama.cpp/models/ggml-vocab.bin" +MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama.gguf" def test_llama(): From c2d1deaa8a759518eb9a8486e44ae7a6059d131d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 24 Aug 2023 18:01:42 -0400 Subject: [PATCH 09/16] Update llama.cpp --- llama_cpp/llama_cpp.py | 52 
++++++++---------------------------------- vendor/llama.cpp | 2 +- 2 files changed, 11 insertions(+), 43 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 5442708..62ddbf4 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -531,6 +531,15 @@ _lib.llama_n_embd.argtypes = [llama_context_p] _lib.llama_n_embd.restype = c_int +# LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx); +def llama_vocab_type(ctx: llama_context_p) -> int: + return _lib.llama_vocab_type(ctx) + + +_lib.llama_vocab_type.argtypes = [llama_context_p] +_lib.llama_vocab_type.restype = c_int + + # LLAMA_API int llama_model_n_vocab(const struct llama_model * model); def llama_model_n_vocab(model: llama_model_p) -> int: return _lib.llama_model_n_vocab(model) @@ -849,7 +858,7 @@ _lib.llama_token_get_score.argtypes = [llama_context_p, llama_token] _lib.llama_token_get_score.restype = c_float -# LLAMA_API llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token); +# LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token); def llama_token_get_type(ctx: llama_context_p, token: llama_token) -> int: return _lib.llama_token_get_type(ctx, token) @@ -918,32 +927,6 @@ _lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, _lib.llama_tokenize.restype = c_int -# LLAMA_API int llama_tokenize_bpe( -# struct llama_context * ctx, -# const char * text, -# llama_token * tokens, -# int n_max_tokens, -# bool add_bos); -def llama_tokenize_bpe( - ctx: llama_context_p, - text: bytes, - tokens, # type: Array[llama_token] - n_max_tokens: c_int, - add_bos: c_bool, -) -> int: - return _lib.llama_tokenize_bpe(ctx, text, tokens, n_max_tokens, add_bos) - - -_lib.llama_tokenize_bpe.argtypes = [ - llama_context_p, - c_char_p, - llama_token_p, - c_int, - c_bool, -] -_lib.llama_tokenize_bpe.restype = c_int - - # LLAMA_API int llama_tokenize_with_model( # const struct llama_model * model, # const char * text, @@ -993,21 +976,6 @@ _lib.llama_tokenize_with_model.argtypes = [ _lib.llama_tokenize_with_model.restype = c_int -# LLAMA_API int llama_token_to_str_bpe( -# const struct llama_context * ctx, -# llama_token token, -# char * buf, -# int length); -def llama_token_to_str_bpe( - ctx: llama_context_p, token: llama_token, buf: bytes, length: c_int -) -> int: - return _lib.llama_token_to_str_bpe(ctx, token, buf, length) - - -_lib.llama_token_to_str_bpe.argtypes = [llama_context_p, llama_token, c_char_p, c_int] -_lib.llama_token_to_str_bpe.restype = c_int - - # LLAMA_API int llama_token_to_str_with_model( # const struct llama_model * model, # llama_token token, diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f5fe98d..2e5f70a 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f5fe98d11bdf9e7797bcfb05c0c3601ffc4b9d26 +Subproject commit 2e5f70a25fc4576e9ed78603fe493eb7702c37a3 From 8ac59465b9b5c39fb6cc833dc5c3319664e60ec0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 25 Aug 2023 04:56:48 -0400 Subject: [PATCH 10/16] Strip leading space when de-tokenizing. 
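
A short illustration of the round-trip behaviour this change targets, mirroring the updated test in this patch (the vocab-only model path comes from the test suite, and the exact tokens depend on the model's vocabulary, so treat this as a sketch rather than guaranteed output):

```python
import llama_cpp

# Vocab-only load is enough for tokenization; no weights are needed.
llama = llama_cpp.Llama(
    model_path="./vendor/llama.cpp/models/ggml-vocab-llama.gguf",
    vocab_only=True,
    verbose=False,
)

text = b"Hello World"

# BOS is prepended by default; the leading space the tokenizer inserts
# after BOS is stripped again on detokenize, so the round trip is exact.
tokens = llama.tokenize(text)
assert tokens[0] == llama.token_bos()
assert llama.detokenize(tokens) == text

# Without BOS the leading space is kept, so the bytes differ from the input.
assert llama.detokenize(llama.tokenize(text, add_bos=False)) != text
```
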
--- llama_cpp/llama.py | 10 +++++----- tests/test_llama.py | 20 ++++++++++++++++---- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index d6fd830..b8f76e9 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -445,17 +445,17 @@ class Llama: """ assert self.ctx is not None output = b"" - buffer_size = 32 + buffer_size = 8 buffer = (ctypes.c_char * buffer_size)() for token in tokens: - if token == llama_cpp.llama_token_bos(self.ctx): - continue n = llama_cpp.llama_token_to_str( self.ctx, llama_cpp.llama_token(token), buffer, buffer_size ) assert n <= buffer_size output += bytes(buffer[:n]) - return output + # NOTE: Llama1 models automatically added a space at the start of the prompt + # this line removes a leading space if the first token is a beginning of sentence token + return output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output def set_cache(self, cache: Optional[BaseLlamaCache]): """Set the cache. @@ -886,7 +886,7 @@ class Llama: created: int = int(time.time()) completion_tokens: List[int] = [] # Add blank space to start of prompt to match OG llama tokenizer - prompt_tokens: List[int] = self.tokenize(b" " + prompt.encode("utf-8")) + prompt_tokens: List[int] = self.tokenize(prompt.encode("utf-8")) if prompt != "" else [self.token_bos()] text: bytes = b"" returned_tokens: int = 0 stop = ( diff --git a/tests/test_llama.py b/tests/test_llama.py index 9701321..c240122 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -1,20 +1,32 @@ +import pytest import llama_cpp MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama.gguf" -def test_llama(): - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) +def test_llama_cpp_tokenization(): + llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, verbose=False) assert llama assert llama.ctx is not None text = b"Hello World" - assert llama.detokenize(llama.tokenize(text)) == text + tokens = llama.tokenize(text) + assert tokens[0] == llama.token_bos() + assert tokens == [1, 15043, 2787] + detokenized = llama.detokenize(tokens) + assert detokenized == text + + tokens = llama.tokenize(text, add_bos=False) + assert tokens[0] != llama.token_bos() + assert tokens == [15043, 2787] + + detokenized = llama.detokenize(tokens) + assert detokenized != text -# @pytest.mark.skip(reason="need to update sample mocking") +@pytest.mark.skip(reason="bug in tokenization where leading space is always inserted even if not after eos") def test_llama_patch(monkeypatch): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) n_vocab = llama_cpp.llama_n_vocab(llama.ctx) From 80389f71da6dc8688af0859c6b93812821abb181 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 25 Aug 2023 05:02:48 -0400 Subject: [PATCH 11/16] Update README --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 7c515d0..0901b63 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,9 @@ This package provides: Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest). 
+> [!WARNING] +> Starting with version 0.1.79 the model format has changed from `ggmlv3` to `gguf` + ## Installation from PyPI (recommended) From 48cf43b4270a95ac875fc2ffc24bb28196ac3014 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 25 Aug 2023 13:43:16 -0400 Subject: [PATCH 12/16] Use _with_model variants for tokenization --- llama_cpp/llama.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index b8f76e9..fd3e2c4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -408,11 +408,11 @@ class Llama: Returns: A list of tokens. """ - assert self.ctx is not None + assert self.model is not None n_ctx = self._n_ctx tokens = (llama_cpp.llama_token * n_ctx)() - n_tokens = llama_cpp.llama_tokenize( - self.ctx, + n_tokens = llama_cpp.llama_tokenize_with_model( + self.model, text, tokens, llama_cpp.c_int(n_ctx), @@ -421,8 +421,8 @@ class Llama: if n_tokens < 0: n_tokens = abs(n_tokens) tokens = (llama_cpp.llama_token * n_tokens)() - n_tokens = llama_cpp.llama_tokenize( - self.ctx, + n_tokens = llama_cpp.llama_tokenize_with_model( + self.model, text, tokens, llama_cpp.c_int(n_tokens), @@ -443,15 +443,15 @@ class Llama: Returns: The detokenized string. """ - assert self.ctx is not None + assert self.model is not None output = b"" - buffer_size = 8 - buffer = (ctypes.c_char * buffer_size)() + size = 8 + buffer = (ctypes.c_char * size)() for token in tokens: - n = llama_cpp.llama_token_to_str( - self.ctx, llama_cpp.llama_token(token), buffer, buffer_size + n = llama_cpp.llama_token_to_str_with_model( + self.model, llama_cpp.llama_token(token), buffer, size ) - assert n <= buffer_size + assert n <= size output += bytes(buffer[:n]) # NOTE: Llama1 models automatically added a space at the start of the prompt # this line removes a leading space if the first token is a beginning of sentence token From c8a7637978436538d26a5d7d79b8c8e3c3fab4da Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 25 Aug 2023 14:35:27 -0400 Subject: [PATCH 13/16] Ignore vendor directory for tests --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 8735b60..11d38b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,9 @@ scikit-build = "0.17.6" [tool.poetry.extras] server = ["uvicorn", "fastapi", "pydantic-settings", "sse-starlette"] +[tool.pytest.ini_options] +addopts = "--ignore=vendor" + [build-system] requires = [ "setuptools>=42", From ef23d1e545a1db51a6fe110d1f6b1317374a7de3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 25 Aug 2023 14:35:53 -0400 Subject: [PATCH 14/16] Update llama.cpp --- llama_cpp/llama_cpp.py | 98 +++++++++++++++++++++++++++++++++++++++--- vendor/llama.cpp | 2 +- 2 files changed, 94 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 62ddbf4..1731878 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -568,13 +568,33 @@ _lib.llama_model_n_embd.restype = c_int # // Get a string describing the model type -# LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size); -def llama_model_type(model: llama_model_p, buf: bytes, buf_size: c_size_t) -> int: - return _lib.llama_model_type(model, buf, buf_size) +# LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size); +def llama_model_desc(model: llama_model_p, buf: bytes, buf_size: c_size_t) -> int: + return _lib.llama_model_desc(model, buf, buf_size) 
-_lib.llama_model_type.argtypes = [llama_model_p, c_char_p, c_size_t] -_lib.llama_model_type.restype = c_int +_lib.llama_model_desc.argtypes = [llama_model_p, c_char_p, c_size_t] +_lib.llama_model_desc.restype = c_int + + +# // Returns the total size of all the tensors in the model in bytes +# LLAMA_API uint64_t llama_model_size(const struct llama_model * model); +def llama_model_size(model: llama_model_p) -> int: + return _lib.llama_model_size(model) + + +_lib.llama_model_size.argtypes = [llama_model_p] +_lib.llama_model_size.restype = ctypes.c_uint64 + + +# // Returns the total number of parameters in the model +# LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model); +def llama_model_n_params(model: llama_model_p) -> int: + return _lib.llama_model_n_params(model) + + +_lib.llama_model_n_params.argtypes = [llama_model_p] +_lib.llama_model_n_params.restype = ctypes.c_uint64 # // Returns 0 on success @@ -1029,6 +1049,74 @@ def llama_grammar_free(grammar: llama_grammar_p): _lib.llama_grammar_free.argtypes = [llama_grammar_p] _lib.llama_grammar_free.restype = None +# // +# // Beam search +# // + + +# struct llama_beam_view { +# const llama_token * tokens; +# size_t n_tokens; +# float p; // Cumulative beam probability (renormalized relative to all beams) +# bool eob; // Callback should set this to true when a beam is at end-of-beam. +# }; +class llama_beam_view(ctypes.Structure): + _fields_ = [ + ("tokens", llama_token_p), + ("n_tokens", c_size_t), + ("p", c_float), + ("eob", c_bool), + ] + + +# // Passed to beam_search_callback function. +# // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams +# // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks. +# // These pointers are valid only during the synchronous callback, so should not be saved. +# struct llama_beams_state { +# struct llama_beam_view * beam_views; +# size_t n_beams; // Number of elements in beam_views[]. +# size_t common_prefix_length; // Current max length of prefix tokens shared by all beams. +# bool last_call; // True iff this is the last callback invocation. +# }; +class llama_beams_state(ctypes.Structure): + _fields_ = [ + ("beam_views", POINTER(llama_beam_view)), + ("n_beams", c_size_t), + ("common_prefix_length", c_size_t), + ("last_call", c_bool), + ] + + +# // Type of pointer to the beam_search_callback function. +# // void* callback_data is any custom data passed to llama_beam_search, that is subsequently +# // passed back to beam_search_callback. This avoids having to use global variables in the callback. +# typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, llama_beams_state); +llama_beam_search_callback_fn_t = ctypes.CFUNCTYPE(None, c_void_p, llama_beams_state) + + +# /// @details Deterministically returns entire sentence constructed by a beam search. +# /// @param ctx Pointer to the llama_context. +# /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state. +# /// @param callback_data A pointer that is simply passed back to callback. +# /// @param n_beams Number of beams to use. +# /// @param n_past Number of tokens already evaluated. +# /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier. +# /// @param n_threads Number of threads as passed to llama_eval(). 
+# LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads); +def llama_beam_search( + ctx: llama_context_p, + callback: "ctypes._CFuncPtr[None, c_void_p, llama_beams_state]", # type: ignore + callback_data: c_void_p, + n_beams: c_size_t, + n_past: c_int, + n_predict: c_int, + n_threads: c_int, +): + return _lib.llama_beam_search( + ctx, callback, callback_data, n_beams, n_past, n_predict, n_threads + ) + # // # // Sampling functions diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2e5f70a..232caf3 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 2e5f70a25fc4576e9ed78603fe493eb7702c37a3 +Subproject commit 232caf3c1581a6cb023571780ff41dc2d66d1ca0 From ac37ea562bb9286bd222e5bd83e11d34f91256b1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 25 Aug 2023 15:11:08 -0400 Subject: [PATCH 15/16] Add temporary docs for GGUF model conversion --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0901b63..b9d72f9 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ This package provides: Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest). > [!WARNING] -> Starting with version 0.1.79 the model format has changed from `ggmlv3` to `gguf` +> Starting with version 0.1.79 the model format has changed from `ggmlv3` to `gguf`. Old model files can be converted using the `convert-llama-ggmlv3-to-gguf.py` script in [`llama.cpp`](https://github.com/ggerganov/llama.cpp) ## Installation from PyPI (recommended) From 3f8bc417d7186eba1c09846abf46cd0aa40dbd4d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 25 Aug 2023 15:18:15 -0400 Subject: [PATCH 16/16] Bump version --- CHANGELOG.md | 6 ++++++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index df635fa..e838d05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.79] + +### Added + +- GGUF Support (breaking change requiring new model format) + ## [0.1.78] ### Added diff --git a/pyproject.toml b/pyproject.toml index 6f6f59e..bbc3e92 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.78" +version = "0.1.79" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index bdc5a2e..f88bc29 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.78", + version="0.1.79", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT",