From eb7645b3ba84e182a903663d68c0b4864b670f9b Mon Sep 17 00:00:00 2001 From: Tanner Hobson Date: Fri, 9 Jun 2023 13:13:08 -0400 Subject: [PATCH 01/27] Add support for logit_bias and logit_bias_type parameters --- llama_cpp/llama.py | 2 ++ llama_cpp/server/app.py | 53 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 02fe774..197511c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1380,6 +1380,7 @@ class Llama: mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, model: Optional[str] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. @@ -1421,6 +1422,7 @@ class Llama: mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, model=model, + logits_processor=logits_processor, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index f70d8f0..a6194f5 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -249,13 +249,14 @@ class CreateCompletionRequest(BaseModel): ) presence_penalty: Optional[float] = presence_penalty_field frequency_penalty: Optional[float] = frequency_penalty_field + logit_bias: Optional[Dict[str, float]] = Field(None) + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) # ignored or currently unsupported model: Optional[str] = model_field n: Optional[int] = 1 logprobs: Optional[int] = Field(None) best_of: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = Field(None) user: Optional[str] = Field(None) # llama.cpp specific parameters @@ -274,6 +275,39 @@ class CreateCompletionRequest(BaseModel): CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) +def make_logit_bias_processor( + llama: llama_cpp.Llama, + logit_bias: Dict[str, float], + logit_bias_type: Optional[Literal["input_ids", "tokens"]], +): + if logit_bias_type is None: + logit_bias_type = "input_ids" + + to_bias: Dict[int, float] = {} + if logit_bias_type == "input_ids": + for input_id, score in logit_bias.items(): + input_id = int(input_id) + to_bias[input_id] = score + + elif logit_bias_type == "tokens": + for token, score in logit_bias.items(): + token = token.encode('utf-8') + for input_id in llama.tokenize(token, add_bos=False): + to_bias[input_id] = score + + def logit_bias_processor( + input_ids: List[int], + scores: List[float], + ) -> List[float]: + new_scores = [None] * len(scores) + for input_id, score in enumerate(scores): + new_scores[input_id] = score + to_bias.get(input_id, 0.0) + + return new_scores + + return logit_bias_processor + + @router.post( "/v1/completions", response_model=CreateCompletionResponse, @@ -291,9 +325,16 @@ async def create_completion( "n", "best_of", "logit_bias", + "logit_bias_type", "user", } kwargs = body.dict(exclude=exclude) + + if body.logit_bias is not None: + kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ + make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type), + ]) + if body.stream: send_chan, recv_chan = anyio.create_memory_object_stream(10) @@ -372,11 +413,12 @@ class CreateChatCompletionRequest(BaseModel): stream: bool = stream_field presence_penalty: Optional[float] = presence_penalty_field frequency_penalty: Optional[float] = frequency_penalty_field + logit_bias: Optional[Dict[str, float]] = Field(None) + logit_bias_type: 
Optional[Literal["input_ids", "tokens"]] = Field(None) # ignored or currently unsupported model: Optional[str] = model_field n: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = Field(None) user: Optional[str] = Field(None) # llama.cpp specific parameters @@ -413,9 +455,16 @@ async def create_chat_completion( exclude = { "n", "logit_bias", + "logit_bias_type", "user", } kwargs = body.dict(exclude=exclude) + + if body.logit_bias is not None: + kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ + make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type), + ]) + if body.stream: send_chan, recv_chan = anyio.create_memory_object_stream(10) From 3ea31930e57a45a0806488950e841efbb575369a Mon Sep 17 00:00:00 2001 From: Gabor Date: Sun, 11 Jun 2023 00:58:08 +0100 Subject: [PATCH 02/27] fixes abetlen/llama-cpp-python #358 --- llama_cpp/server/__main__.py | 2 +- llama_cpp/server/app.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 4fe1d94..1de4548 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -46,5 +46,5 @@ if __name__ == "__main__": app = create_app(settings=settings) uvicorn.run( - app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) + app, host=settings.host, port=settings.port ) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index f70d8f0..2191005 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -72,6 +72,12 @@ class Settings(BaseSettings): verbose: bool = Field( default=True, description="Whether to print debug information." ) + host: str = Field( + default="localhost", description="Listen address" + ) + port: int = Field( + default=8000, description="Listen port" + ) router = APIRouter() From 3129a0e7e581f6edd29a497a13ab014687867134 Mon Sep 17 00:00:00 2001 From: Gabor Date: Sun, 11 Jun 2023 01:11:24 +0100 Subject: [PATCH 03/27] correction to add back environment variable support <3 docker --- llama_cpp/server/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 1de4548..748a2af 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -46,5 +46,5 @@ if __name__ == "__main__": app = create_app(settings=settings) uvicorn.run( - app, host=settings.host, port=settings.port + app, host=os.getenv("HOST", settings.host), port=int(os.getenv("PORT", settings.port)) ) From efcf380490af7007389df698ddfe1b0f755e7069 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Jun 2023 21:03:40 +0000 Subject: [PATCH 04/27] Bump fastapi from 0.96.0 to 0.97.0 Bumps [fastapi](https://github.com/tiangolo/fastapi) from 0.96.0 to 0.97.0. - [Release notes](https://github.com/tiangolo/fastapi/releases) - [Commits](https://github.com/tiangolo/fastapi/compare/0.96.0...0.97.0) --- updated-dependencies: - dependency-name: fastapi dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- poetry.lock | 112 ++++++++++--------------------------------------- pyproject.toml | 2 +- 2 files changed, 23 insertions(+), 91 deletions(-) diff --git a/poetry.lock b/poetry.lock index 4a9c572..1d95d76 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,10 +1,9 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. 
+# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "anyio" version = "3.6.2" description = "High level compatibility layer for multiple asynchronous event loop implementations" -category = "main" optional = false python-versions = ">=3.6.2" files = [ @@ -25,7 +24,6 @@ trio = ["trio (>=0.16,<0.22)"] name = "black" version = "23.3.0" description = "The uncompromising code formatter." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -75,7 +73,6 @@ uvloop = ["uvloop (>=0.15.2)"] name = "bleach" version = "6.0.0" description = "An easy safelist-based HTML-sanitizing tool." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -94,7 +91,6 @@ css = ["tinycss2 (>=1.1.0,<1.2)"] name = "certifi" version = "2023.5.7" description = "Python package for providing Mozilla's CA Bundle." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -106,7 +102,6 @@ files = [ name = "cffi" version = "1.15.1" description = "Foreign Function Interface for Python calling C code." -category = "dev" optional = false python-versions = "*" files = [ @@ -183,7 +178,6 @@ pycparser = "*" name = "charset-normalizer" version = "3.1.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -category = "dev" optional = false python-versions = ">=3.7.0" files = [ @@ -268,7 +262,6 @@ files = [ name = "click" version = "8.1.3" description = "Composable command line interface toolkit" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -283,7 +276,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -295,7 +287,6 @@ files = [ name = "cryptography" version = "40.0.2" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -333,11 +324,21 @@ test = ["iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-co test-randomorder = ["pytest-randomly"] tox = ["tox"] +[[package]] +name = "diskcache" +version = "5.6.1" +description = "Disk Cache -- Disk and file backed persistent cache." 
+optional = false +python-versions = ">=3" +files = [ + {file = "diskcache-5.6.1-py3-none-any.whl", hash = "sha256:558c6a2d5d7c721bb00e40711803d6804850c9f76c426ed81ecc627fe9d2ce2d"}, + {file = "diskcache-5.6.1.tar.gz", hash = "sha256:e4c978532feff5814c4cc00fe1e11e40501985946643d73220d41ee7737c72c3"}, +] + [[package]] name = "distro" version = "1.8.0" description = "Distro - an OS platform information API" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -349,7 +350,6 @@ files = [ name = "docutils" version = "0.20" description = "Docutils -- Python Documentation Utilities" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -361,7 +361,6 @@ files = [ name = "exceptiongroup" version = "1.1.1" description = "Backport of PEP 654 (exception groups)" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -374,31 +373,26 @@ test = ["pytest (>=6)"] [[package]] name = "fastapi" -version = "0.96.0" +version = "0.97.0" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" -category = "main" optional = true python-versions = ">=3.7" files = [ - {file = "fastapi-0.96.0-py3-none-any.whl", hash = "sha256:b8e11fe81e81eab4e1504209917338e0b80f783878a42c2b99467e5e1019a1e9"}, - {file = "fastapi-0.96.0.tar.gz", hash = "sha256:71232d47c2787446991c81c41c249f8a16238d52d779c0e6b43927d3773dbe3c"}, + {file = "fastapi-0.97.0-py3-none-any.whl", hash = "sha256:95d757511c596409930bd20673358d4a4d709004edb85c5d24d6ffc48fabcbf2"}, + {file = "fastapi-0.97.0.tar.gz", hash = "sha256:b53248ee45f64f19bb7600953696e3edf94b0f7de94df1e5433fc5c6136fa986"}, ] [package.dependencies] -pydantic = ">=1.6.2,<1.7 || >1.7,<1.7.1 || >1.7.1,<1.7.2 || >1.7.2,<1.7.3 || >1.7.3,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0" starlette = ">=0.27.0,<0.28.0" [package.extras] all = ["email-validator (>=1.1.1)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "python-multipart (>=0.0.5)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] -dev = ["pre-commit (>=2.17.0,<3.0.0)", "ruff (==0.0.138)", "uvicorn[standard] (>=0.12.0,<0.21.0)"] -doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-markdownextradata-plugin (>=0.1.7,<0.3.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pyyaml (>=5.3.1,<7.0.0)", "typer-cli (>=0.0.13,<0.0.14)", "typer[all] (>=0.6.1,<0.8.0)"] -test = ["anyio[trio] (>=3.2.1,<4.0.0)", "black (==23.1.0)", "coverage[toml] (>=6.5.0,<8.0)", "databases[sqlite] (>=0.3.2,<0.7.0)", "email-validator (>=1.1.1,<2.0.0)", "flask (>=1.1.2,<3.0.0)", "httpx (>=0.23.0,<0.24.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.982)", "orjson (>=3.2.1,<4.0.0)", "passlib[bcrypt] (>=1.7.2,<2.0.0)", "peewee (>=3.13.3,<4.0.0)", "pytest (>=7.1.3,<8.0.0)", "python-jose[cryptography] (>=3.3.0,<4.0.0)", "python-multipart (>=0.0.5,<0.0.7)", "pyyaml (>=5.3.1,<7.0.0)", "ruff (==0.0.138)", "sqlalchemy (>=1.3.18,<1.4.43)", "types-orjson (==3.6.2)", "types-ujson (==5.7.0.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0,<6.0.0)"] [[package]] name = "ghp-import" version = "2.1.0" description = "Copy your docs directly to the gh-pages branch." -category = "dev" optional = false python-versions = "*" files = [ @@ -416,7 +410,6 @@ dev = ["flake8", "markdown", "twine", "wheel"] name = "griffe" version = "0.27.3" description = "Signatures for entire Python programs. 
Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -431,7 +424,6 @@ colorama = ">=0.4" name = "h11" version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -443,7 +435,6 @@ files = [ name = "httpcore" version = "0.17.0" description = "A minimal low-level HTTP client." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -455,17 +446,16 @@ files = [ anyio = ">=3.0,<5.0" certifi = "*" h11 = ">=0.13,<0.15" -sniffio = ">=1.0.0,<2.0.0" +sniffio = "==1.*" [package.extras] http2 = ["h2 (>=3,<5)"] -socks = ["socksio (>=1.0.0,<2.0.0)"] +socks = ["socksio (==1.*)"] [[package]] name = "httpx" version = "0.24.1" description = "The next generation HTTP client." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -481,15 +471,14 @@ sniffio = "*" [package.extras] brotli = ["brotli", "brotlicffi"] -cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<14)"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] -socks = ["socksio (>=1.0.0,<2.0.0)"] +socks = ["socksio (==1.*)"] [[package]] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" -category = "main" optional = false python-versions = ">=3.5" files = [ @@ -501,7 +490,6 @@ files = [ name = "importlib-metadata" version = "6.6.0" description = "Read metadata from Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -521,7 +509,6 @@ testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packag name = "importlib-resources" version = "5.12.0" description = "Read resources from Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -540,7 +527,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -552,7 +538,6 @@ files = [ name = "jaraco-classes" version = "3.2.3" description = "Utility functions for Python class constructs" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -571,7 +556,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "jeepney" version = "0.8.0" description = "Low-level, pure Python DBus protocol wrapper." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -587,7 +571,6 @@ trio = ["async_generator", "trio"] name = "jinja2" version = "3.1.2" description = "A very fast and expressive template engine." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -605,7 +588,6 @@ i18n = ["Babel (>=2.7)"] name = "keyring" version = "23.13.1" description = "Store and access your passwords safely." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -630,7 +612,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "markdown" version = "3.3.7" description = "Python implementation of Markdown." 
-category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -648,7 +629,6 @@ testing = ["coverage", "pyyaml"] name = "markdown-it-py" version = "2.2.0" description = "Python port of markdown-it. Markdown parsing, done right!" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -673,7 +653,6 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] name = "markupsafe" version = "2.1.2" description = "Safely add untrusted strings to HTML/XML markup." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -733,7 +712,6 @@ files = [ name = "mdurl" version = "0.1.2" description = "Markdown URL utilities" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -745,7 +723,6 @@ files = [ name = "mergedeep" version = "1.3.4" description = "A deep merge function for 🐍." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -757,7 +734,6 @@ files = [ name = "mkdocs" version = "1.4.3" description = "Project documentation with Markdown." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -786,7 +762,6 @@ min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-imp name = "mkdocs-autorefs" version = "0.4.1" description = "Automatically link across pages in MkDocs." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -802,7 +777,6 @@ mkdocs = ">=1.1" name = "mkdocs-material" version = "9.1.15" description = "Documentation that simply works" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -825,7 +799,6 @@ requests = ">=2.26" name = "mkdocs-material-extensions" version = "1.1.1" description = "Extension pack for Python Markdown and MkDocs Material." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -837,7 +810,6 @@ files = [ name = "mkdocstrings" version = "0.22.0" description = "Automatic documentation from sources, for MkDocs." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -865,7 +837,6 @@ python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] name = "mkdocstrings-python" version = "0.10.1" description = "A Python handler for mkdocstrings." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -881,7 +852,6 @@ mkdocstrings = ">=0.20" name = "more-itertools" version = "9.1.0" description = "More routines for operating on iterables, beyond itertools" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -893,7 +863,6 @@ files = [ name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -905,7 +874,6 @@ files = [ name = "numpy" version = "1.24.3" description = "Fundamental package for array computing in Python" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -943,7 +911,6 @@ files = [ name = "packaging" version = "23.1" description = "Core utilities for Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -955,7 +922,6 @@ files = [ name = "pathspec" version = "0.11.1" description = "Utility library for gitignore style pattern matching of file paths." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -967,7 +933,6 @@ files = [ name = "pkginfo" version = "1.9.6" description = "Query metadata from sdists / bdists / installed packages." 
-category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -982,7 +947,6 @@ testing = ["pytest", "pytest-cov"] name = "platformdirs" version = "3.5.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -998,7 +962,6 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest- name = "pluggy" version = "1.0.0" description = "plugin and hook calling mechanisms for python" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1014,7 +977,6 @@ testing = ["pytest", "pytest-benchmark"] name = "pycparser" version = "2.21" description = "C parser in Python" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1026,7 +988,6 @@ files = [ name = "pydantic" version = "1.10.7" description = "Data validation and settings management using python type hints" -category = "main" optional = true python-versions = ">=3.7" files = [ @@ -1079,7 +1040,6 @@ email = ["email-validator (>=1.0.3)"] name = "pygments" version = "2.15.1" description = "Pygments is a syntax highlighting package written in Python." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1094,7 +1054,6 @@ plugins = ["importlib-metadata"] name = "pymdown-extensions" version = "9.11" description = "Extension pack for Python Markdown." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1110,7 +1069,6 @@ pyyaml = "*" name = "pytest" version = "7.3.1" description = "pytest: simple powerful testing with Python" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1133,7 +1091,6 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no name = "python-dateutil" version = "2.8.2" description = "Extensions to the standard Python datetime module" -category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ @@ -1148,7 +1105,6 @@ six = ">=1.5" name = "pywin32-ctypes" version = "0.2.0" description = "" -category = "dev" optional = false python-versions = "*" files = [ @@ -1160,7 +1116,6 @@ files = [ name = "pyyaml" version = "6.0" description = "YAML parser and emitter for Python" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1210,7 +1165,6 @@ files = [ name = "pyyaml-env-tag" version = "0.1" description = "A custom YAML tag for referencing environment variables in YAML files. " -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1225,7 +1179,6 @@ pyyaml = "*" name = "readme-renderer" version = "37.3" description = "readme_renderer is a library for rendering \"readme\" descriptions for Warehouse" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1245,7 +1198,6 @@ md = ["cmarkgfm (>=0.8.0)"] name = "regex" version = "2023.5.5" description = "Alternative regular expression module, to replace re." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1343,7 +1295,6 @@ files = [ name = "requests" version = "2.30.0" description = "Python HTTP for Humans." 
-category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1365,7 +1316,6 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "requests-toolbelt" version = "1.0.0" description = "A utility belt for advanced users of python-requests" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1380,7 +1330,6 @@ requests = ">=2.0.1,<3.0.0" name = "rfc3986" version = "2.0.0" description = "Validating URI References per RFC 3986" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1395,7 +1344,6 @@ idna2008 = ["idna"] name = "rich" version = "13.3.5" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" -category = "dev" optional = false python-versions = ">=3.7.0" files = [ @@ -1415,7 +1363,6 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] name = "scikit-build" version = "0.17.6" description = "Improved build system generator for Python C/C++/Fortran/Cython extensions" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1440,7 +1387,6 @@ test = ["build (>=0.7)", "cython (>=0.25.1)", "importlib-metadata", "pytest (>=6 name = "secretstorage" version = "3.3.3" description = "Python bindings to FreeDesktop.org Secret Service API" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1456,7 +1402,6 @@ jeepney = ">=0.6" name = "setuptools" version = "67.7.2" description = "Easily download, build, install, upgrade, and uninstall Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1473,7 +1418,6 @@ testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs ( name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -1485,7 +1429,6 @@ files = [ name = "sniffio" version = "1.3.0" description = "Sniff out which async library your code is running under" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1497,7 +1440,6 @@ files = [ name = "sse-starlette" version = "1.6.1" description = "\"SSE plugin for Starlette\"" -category = "main" optional = true python-versions = ">=3.8" files = [ @@ -1512,7 +1454,6 @@ starlette = "*" name = "starlette" version = "0.27.0" description = "The little ASGI library that shines." -category = "main" optional = true python-versions = ">=3.7" files = [ @@ -1531,7 +1472,6 @@ full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyam name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1543,7 +1483,6 @@ files = [ name = "twine" version = "4.0.2" description = "Collection of utilities for publishing packages on PyPI" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1566,7 +1505,6 @@ urllib3 = ">=1.26.0" name = "typing-extensions" version = "4.6.3" description = "Backported and Experimental Type Hints for Python 3.7+" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1578,7 +1516,6 @@ files = [ name = "urllib3" version = "2.0.2" description = "HTTP library with thread-safe connection pooling, file post, and more." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1596,7 +1533,6 @@ zstd = ["zstandard (>=0.18.0)"] name = "uvicorn" version = "0.22.0" description = "The lightning-fast ASGI server." 
-category = "main" optional = true python-versions = ">=3.7" files = [ @@ -1615,7 +1551,6 @@ standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", name = "watchdog" version = "3.0.0" description = "Filesystem events monitoring" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1655,7 +1590,6 @@ watchmedo = ["PyYAML (>=3.10)"] name = "webencodings" version = "0.5.1" description = "Character encoding aliases for legacy web content" -category = "dev" optional = false python-versions = "*" files = [ @@ -1667,7 +1601,6 @@ files = [ name = "wheel" version = "0.40.0" description = "A built-package format for Python" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1682,7 +1615,6 @@ test = ["pytest (>=6.0.0)"] name = "zipp" version = "3.15.0" description = "Backport of pathlib-compatible object wrapper for zip files" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1695,9 +1627,9 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] [extras] -server = ["uvicorn", "fastapi", "sse-starlette"] +server = ["fastapi", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "5c3354c253bc7ab7c7577a9a3733c7a341e91176e1d0c13dc2e3f3dcc0971bbe" +content-hash = "8effb1d2fa3fa9026f291dd19e8fa20f84967e63c5e3c1e0cdfcdbaa547fb586" diff --git a/pyproject.toml b/pyproject.toml index 564059c..f2dd4b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ typing-extensions = "^4.6.3" numpy = "^1.20.0" diskcache = "^5.6.1" uvicorn = { version = "^0.22.0", optional = true } -fastapi = { version = "^0.96.0", optional = true } +fastapi = { version = "^0.97.0", optional = true } sse-starlette = { version = "^1.6.1", optional = true } [tool.poetry.group.dev.dependencies] From 94f63a66b9f6f24f3e0079efa0f98c5872ef3a82 Mon Sep 17 00:00:00 2001 From: Ian Scrivener Date: Tue, 13 Jun 2023 09:49:19 +1000 Subject: [PATCH 05/27] Create macos_install.md add MacOS Metal markdown install instructions --- docs/macos_install.md | 62 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 docs/macos_install.md diff --git a/docs/macos_install.md b/docs/macos_install.md new file mode 100644 index 0000000..7d46bc4 --- /dev/null +++ b/docs/macos_install.md @@ -0,0 +1,62 @@ + +# llama-cpp-python - MacOS Install with Metal GPU + + +**(1) Make sure you have xcode installed... at least the command line parts** +``` +# check the path of your xcode install +xcode-select -p + +# xcode installed returns +# /Applications/Xcode-beta.app/Contents/Developer + +# if xcode is missing then install it... it takes ages; +xcode-select --install +``` + +**(2) Install the conda version for MacOS that supports Metal GPU** +``` +wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh +bash Miniforge3-MacOSX-arm64.sh +``` + +**(3) Make a conda environment** +``` +conda create -n llama python=3.9.16 +conda activate llama +``` + +**(4) Install the LATEST llama-cpp-python.. 
which, as of just today, happily supports MacOS Metal GPU** + *(you needed xcode installed in order pip to build/compile the C++ code)* +``` +pip uninstall llama-cpp-python -y +CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir +pip install 'llama-cpp-python[server]' + +# you should now have llama-cpp-python v0.1.62 installed +llama-cpp-python         0.1.62      + +``` + +**(4) Download a v3 ggml llama/vicuna/alpaca model** + - **ggmlv3** + - file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0 + +https://huggingface.co/vicuna/ggml-vicuna-13b-1.1/blob/main/ggml-vic13b-q4_0.bin +https://huggingface.co/vicuna/ggml-vicuna-13b-1.1/blob/main/ggml-vic13b-uncensored-q4_0.bin +https://huggingface.co/TheBloke/LLaMa-7B-GGML/blob/main/llama-7b.ggmlv3.q4_0.bin +https://huggingface.co/TheBloke/LLaMa-13B-GGML/blob/main/llama-13b.ggmlv3.q4_0.bin + + +**(6) run the llama-cpp-python API server with MacOS Metal GPU support** +``` +# config your ggml model path +# make sure it is ggml v3 +# make sure it is q4_0 +export MODEL=[path to your llama.cpp ggml models]]/[ggml-model-name]]q4_0.bin +python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1 +``` + +***Note:** If you omit the `--n_gpu_layers 1` then CPU will be used* + + From 7ca50a3e45a89fda886a3f8179b7a70fc2bda197 Mon Sep 17 00:00:00 2001 From: Ian Scrivener Date: Tue, 13 Jun 2023 09:52:22 +1000 Subject: [PATCH 06/27] Update README.md add link to main README>md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index ee6e540..a4ca04d 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,9 @@ This package provides: Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). +Detailed MacOS Metal GPU install documentation is available at [docs/macos_install.md](docs/macos_install.md) + + ## Installation from PyPI (recommended) Install from PyPI (requires a c compiler): From 613dd70c8a9e54c373428055102283fdd468f09b Mon Sep 17 00:00:00 2001 From: Matt Dennewitz Date: Tue, 13 Jun 2023 00:56:05 -0500 Subject: [PATCH 07/27] Update README.md Fixes typo in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ee6e540..c099cbf 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Install from PyPI (requires a c compiler): pip install llama-cpp-python ``` -The above command will attempt to install the package and build build `llama.cpp` from source. +The above command will attempt to install the package and build `llama.cpp` from source. This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly: From fe41cb9043e4ca54e7a0989baae68eb5b730a0b4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 13 Jun 2023 15:07:50 +0000 Subject: [PATCH 08/27] Bump pytest from 7.3.1 to 7.3.2 Bumps [pytest](https://github.com/pytest-dev/pytest) from 7.3.1 to 7.3.2. 
- [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/7.3.1...7.3.2) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- poetry.lock | 10 +++++----- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index 1d95d76..e720acc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1067,13 +1067,13 @@ pyyaml = "*" [[package]] name = "pytest" -version = "7.3.1" +version = "7.3.2" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, - {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, + {file = "pytest-7.3.2-py3-none-any.whl", hash = "sha256:cdcbd012c9312258922f8cd3f1b62a6580fdced17db6014896053d47cddf9295"}, + {file = "pytest-7.3.2.tar.gz", hash = "sha256:ee990a3cc55ba808b80795a79944756f315c67c12b56abd3ac993a7b8c17030b"}, ] [package.dependencies] @@ -1085,7 +1085,7 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "python-dateutil" @@ -1632,4 +1632,4 @@ server = ["fastapi", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "8effb1d2fa3fa9026f291dd19e8fa20f84967e63c5e3c1e0cdfcdbaa547fb586" +content-hash = "1d809f04ae0543b3476915b5b767e070811908cc75032f8dc8867294cbf0055d" diff --git a/pyproject.toml b/pyproject.toml index f2dd4b7..9d1be84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.22.0"} mkdocs-material = "^9.1.15" -pytest = "^7.3.1" +pytest = "^7.3.2" httpx = "^0.24.1" scikit-build = "0.17.6" From 715f98c591e9249acc051e73b9757666e656ab57 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 21:40:13 -0400 Subject: [PATCH 09/27] Update llama.cpp --- llama_cpp/llama_cpp.py | 22 ++++++++++++++++++++++ vendor/llama.cpp | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 29136c7..be5e9c3 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -155,6 +155,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) # int n_gpu_layers; // number of layers to store in VRAM # int main_gpu; // the GPU that is used for scratch and small tensors # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs +# bool low_vram; // if true, reduce VRAM usage at the cost of performance # int seed; // RNG seed, -1 for random # bool f16_kv; // use fp16 for KV cache @@ -177,6 +178,7 @@ class llama_context_params(Structure): ("n_gpu_layers", c_int), ("main_gpu", c_int), ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), + ("low_vram", c_bool), ("seed", c_int), ("f16_kv", c_bool), ( @@ -555,6 +557,26 @@ 
_lib.llama_n_embd.argtypes = [llama_context_p] _lib.llama_n_embd.restype = c_int +# // Get the vocabulary as output parameters. +# // Returns number of results. +# LLAMA_API int llama_get_vocab( +# const struct llama_context * ctx, +# const char * * strings, +# float * scores, +# int capacity); +def llama_get_vocab( + ctx: llama_context_p, + strings, # type: Array[c_char_p] # type: ignore + scores, # type: Array[c_float] # type: ignore + capacity: c_int, +) -> int: + return _lib.llama_get_vocab(ctx, strings, scores, capacity) + + +_lib.llama_get_vocab.argtypes = [llama_context_p, c_char_p, c_float, c_int] +_lib.llama_get_vocab.restype = c_int + + # Token logits obtained from the last call to llama_eval() # The logits for the last token are stored in the last row # Can be mutated in order to change the probabilities of the next token diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4de0334..254a7a7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4de0334f5cabf4696eced2e5d6e279fdfaa6c0f2 +Subproject commit 254a7a7a5ff4c874ff8488f1f5cbdd7e9c89d682 From f27393ab7ed06c769aba414dcaf2d544ab0c4c35 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 21:46:48 -0400 Subject: [PATCH 10/27] Add additional verbose logs for cache --- llama_cpp/server/app.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 2191005..e248472 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -119,8 +119,12 @@ def create_app(settings: Optional[Settings] = None): ) if settings.cache: if settings.cache_type == "disk": + if settings.verbose: + print(f"Using disk cache with size {settings.cache_size}") cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) else: + if settings.verbose: + print(f"Using ram cache with size {settings.cache_size}") cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) From f7c5cfaf503eb251202f609dbbc8b5b337771de5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 22:08:28 -0400 Subject: [PATCH 11/27] Format server options --- llama_cpp/server/app.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 999d1e6..0d011f0 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -260,18 +260,18 @@ class CreateCompletionRequest(BaseModel): presence_penalty: Optional[float] = presence_penalty_field frequency_penalty: Optional[float] = frequency_penalty_field logit_bias: Optional[Dict[str, float]] = Field(None) - logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + logprobs: Optional[int] = Field(None) # ignored or currently unsupported model: Optional[str] = model_field n: Optional[int] = 1 - logprobs: Optional[int] = Field(None) best_of: Optional[int] = 1 user: Optional[str] = Field(None) # llama.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) class Config: schema_extra = { @@ -424,7 +424,6 @@ class CreateChatCompletionRequest(BaseModel): presence_penalty: Optional[float] = presence_penalty_field frequency_penalty: Optional[float] = frequency_penalty_field logit_bias: Optional[Dict[str, float]] = Field(None) - logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) # ignored or currently unsupported model: Optional[str] = 
model_field @@ -434,6 +433,7 @@ class CreateChatCompletionRequest(BaseModel): # llama.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) class Config: schema_extra = { From 44b83cada5a9183d42a42670252b97b2ea7b37f3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 22:12:33 -0400 Subject: [PATCH 12/27] Add low_vram parameter --- llama_cpp/llama.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 46a9aeb..a6f1e76 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -219,6 +219,7 @@ class Llama: last_n_tokens_size: int = 64, lora_base: Optional[str] = None, lora_path: Optional[str] = None, + low_vram: bool = False, verbose: bool = True, ): """Load a llama.cpp model from `model_path`. @@ -260,6 +261,7 @@ class Llama: self.params.use_mmap = use_mmap if lora_path is None else False self.params.use_mlock = use_mlock self.params.embedding = embedding + self.params.low_vram = low_vram self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) @@ -1447,6 +1449,7 @@ class Llama: use_mmap=self.params.use_mmap, use_mlock=self.params.use_mlock, embedding=self.params.embedding, + low_vram=self.params.low_vram, last_n_tokens_size=self.last_n_tokens_size, n_batch=self.n_batch, n_threads=self.n_threads, @@ -1470,6 +1473,7 @@ class Llama: use_mmap=state["use_mmap"], use_mlock=state["use_mlock"], embedding=state["embedding"], + low_vram=state["low_vram"], n_threads=state["n_threads"], n_batch=state["n_batch"], last_n_tokens_size=state["last_n_tokens_size"], From 1e20be6d0c0ada75bbd30ae855d17569dd346b8f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 22:13:42 -0400 Subject: [PATCH 13/27] Add low_vram to server settings --- llama_cpp/server/app.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 0d011f0..313e27d 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -48,6 +48,10 @@ class Settings(BaseSettings): description="Use mmap.", ) embedding: bool = Field(default=True, description="Whether to use embeddings.") + low_vram: bool = Field( + default=False, + description="Whether to use less VRAM. This will reduce performance.", + ) last_n_tokens_size: int = Field( default=64, ge=0, From 54e2e4ffde8eac57ca3f0ad117b878837d7c3d1f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 22:15:22 -0400 Subject: [PATCH 14/27] Move metal docs to metal section of README. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c29202c..0e62f3d 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,6 @@ This package provides: Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). -Detailed MacOS Metal GPU install documentation is available at [docs/macos_install.md](docs/macos_install.md) - ## Installation from PyPI (recommended) @@ -73,6 +71,8 @@ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable befor CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python ``` +Detailed MacOS Metal GPU install documentation is available at [docs/macos_install.md](docs/macos_install.md) + ## High-level API The high-level API provides a simple managed interface through the `Llama` class. 
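The README hunk above points readers at the high-level `Llama` class, and patches 12 and 13 thread the new `low_vram` flag from the server settings down into `llama_context_params`. A minimal sketch of how a caller might exercise both; the model path, prompt, and flag values are illustrative placeholders rather than project defaults:

```python
from llama_cpp import Llama

# Placeholder path; any local ggml model file works here.
llm = Llama(
    model_path="./models/ggml-model-q4_0.bin",
    n_gpu_layers=1,   # offload layers to the GPU on Metal/CUDA builds
    low_vram=True,    # flag added in patches 12/13: lower VRAM use at some speed cost
    seed=-1,          # -1 selects a random seed
)

# Calling the model runs a completion and returns an OpenAI-style response dict.
out = llm("Q: Name the planets in the solar system. A:", max_tokens=48, stop=["Q:"])
print(out["choices"][0]["text"])
```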
From d938e5900369d4af2dfe86e1f51cd402cb58c87c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Jun 2023 22:15:44 -0400 Subject: [PATCH 15/27] Bump version --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf6ed5d..7a01f6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- (llama.cpp) Add full gpu utilisation in CUDA +- (llama.cpp) Add get_vocab +- (llama.cpp) Add low_vram parameter +- (server) Add logit_bias parameter + ## [0.1.62] ### Fixed diff --git a/pyproject.toml b/pyproject.toml index 9d1be84..281e1bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.62" +version = "0.1.63" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index bb423d8..0449149 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.62", + version="0.1.63", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From fd9f294b3a5194844f50d3b77cc71c51b8ffbb52 Mon Sep 17 00:00:00 2001 From: imaprogrammer <46126206+nb-programmer@users.noreply.github.com> Date: Fri, 16 Jun 2023 14:11:57 +0530 Subject: [PATCH 16/27] Update llama.py: Added how many input tokens in ValueError exception --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index a6f1e76..366f050 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -814,7 +814,7 @@ class Llama: llama_cpp.llama_reset_timings(self.ctx) if len(prompt_tokens) > self._n_ctx: - raise ValueError(f"Requested tokens exceed context window of {self._n_ctx}") + raise ValueError(f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}") # Truncate max_tokens if requested tokens would exceed the context window max_tokens = ( From 37d5192a92a9e3a861027af03dab7a792436fad7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 16 Jun 2023 10:41:51 -0400 Subject: [PATCH 17/27] Update docs --- docs/macos_install.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/docs/macos_install.md b/docs/macos_install.md index 7d46bc4..33dcb5d 100644 --- a/docs/macos_install.md +++ b/docs/macos_install.md @@ -38,14 +38,11 @@ llama-cpp-python         0.1.62      ``` -**(4) Download a v3 ggml llama/vicuna/alpaca model** +**(4) Download a v3 ggml model** - **ggmlv3** - file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0 -https://huggingface.co/vicuna/ggml-vicuna-13b-1.1/blob/main/ggml-vic13b-q4_0.bin -https://huggingface.co/vicuna/ggml-vicuna-13b-1.1/blob/main/ggml-vic13b-uncensored-q4_0.bin -https://huggingface.co/TheBloke/LLaMa-7B-GGML/blob/main/llama-7b.ggmlv3.q4_0.bin -https://huggingface.co/TheBloke/LLaMa-13B-GGML/blob/main/llama-13b.ggmlv3.q4_0.bin +https://huggingface.co/TheBloke/open-llama-7b-open-instruct-GGML **(6) run the llama-cpp-python API server with MacOS Metal GPU support** From d7153abcf820b6ad39192857a1be8b806595990d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 16 Jun 2023 23:11:14 -0400 Subject: [PATCH 18/27] Update llama.cpp --- 
llama_cpp/llama_cpp.py | 6 +++--- vendor/llama.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index be5e9c3..d6be0ea 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -618,7 +618,7 @@ _lib.llama_token_to_str.restype = c_char_p # Special tokens -# LLAMA_API llama_token llama_token_bos(); +# LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence def llama_token_bos() -> int: return _lib.llama_token_bos() @@ -627,7 +627,7 @@ _lib.llama_token_bos.argtypes = [] _lib.llama_token_bos.restype = llama_token -# LLAMA_API llama_token llama_token_eos(); +# LLAMA_API llama_token llama_token_eos(); // end-of-sentence def llama_token_eos() -> int: return _lib.llama_token_eos() @@ -636,7 +636,7 @@ _lib.llama_token_eos.argtypes = [] _lib.llama_token_eos.restype = llama_token -# LLAMA_API llama_token llama_token_nl(); +# LLAMA_API llama_token llama_token_nl(); // next-line def llama_token_nl() -> int: return _lib.llama_token_nl() diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 254a7a7..d411968 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 254a7a7a5ff4c874ff8488f1f5cbdd7e9c89d682 +Subproject commit d411968e990c37f51328849c96a743dd78f3c3dd From 60426b23cc6b9f715214ec09a144e477bfcb2b06 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 17 Jun 2023 13:37:14 -0400 Subject: [PATCH 19/27] Update llama.cpp --- CHANGELOG.md | 6 ++++++ vendor/llama.cpp | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a01f6d..9fba95d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- (llama.cpp) Update llama.cpp + +## [0.1.63] + +### Added + - (llama.cpp) Add full gpu utilisation in CUDA - (llama.cpp) Add get_vocab - (llama.cpp) Add low_vram parameter diff --git a/vendor/llama.cpp b/vendor/llama.cpp index d411968..4f9c43e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d411968e990c37f51328849c96a743dd78f3c3dd +Subproject commit 4f9c43e3bd488b7561119785485e1155dba338d7 From d410f12fae32bf77a8eedc05e7bef263dc6b7cfd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 17 Jun 2023 13:38:48 -0400 Subject: [PATCH 20/27] Update docs. Closes #386 --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 366f050..a0b2030 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -228,7 +228,7 @@ class Llama: model_path: Path to the model. n_ctx: Maximum context size. n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. - seed: Random seed. 0 for random. + seed: Random seed. -1 for random. f16_kv: Use half-precision for key/value cache. logits_all: Return logits for all tokens, not just the last token. vocab_only: Only load the vocabulary no weights. From c7d7d5b656cb63ab54c17483dec2ba36b45142f5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 17 Jun 2023 13:39:48 -0400 Subject: [PATCH 21/27] Update Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fba95d..c4cd88c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - (llama.cpp) Update llama.cpp +- Fix docs for seed. Set -1 for random. 
## [0.1.63] From 44dcb5cf715cd384af85b99d13190c8d96f1f85e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 18 Jun 2023 09:37:20 -0400 Subject: [PATCH 22/27] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4f9c43e..8596af4 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4f9c43e3bd488b7561119785485e1155dba338d7 +Subproject commit 8596af427722775f0df4a7c90b9af067ba90d4ef From 92b0013427be9a1fcea29a3090aa51d0fd8fb35f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 18 Jun 2023 09:48:43 -0400 Subject: [PATCH 23/27] Bump version --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c4cd88c..0060af5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.64] + ### Added - (llama.cpp) Update llama.cpp diff --git a/pyproject.toml b/pyproject.toml index 281e1bb..eb7d23b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.63" +version = "0.1.64" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 0449149..cc17564 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.63", + version="0.1.64", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From d5974a1096860e13a7dd6c123bd4557497c6b70c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 Jun 2023 21:07:49 +0000 Subject: [PATCH 24/27] Bump mkdocs-material from 9.1.15 to 9.1.16 Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.1.15 to 9.1.16. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.1.15...9.1.16) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index e720acc..e006449 100644 --- a/poetry.lock +++ b/poetry.lock @@ -775,13 +775,13 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.15" +version = "9.1.16" description = "Documentation that simply works" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.15-py3-none-any.whl", hash = "sha256:b49e12869ab464558e2dd3c5792da5b748a7e0c48ee83b4d05715f98125a7a39"}, - {file = "mkdocs_material-9.1.15.tar.gz", hash = "sha256:8513ab847c9a541ed3d11a3a7eed556caf72991ee786c31c5aac6691a121088a"}, + {file = "mkdocs_material-9.1.16-py3-none-any.whl", hash = "sha256:f9e62558a6b01ffac314423cbc223d970c25fbc78999860226245b64e64d6751"}, + {file = "mkdocs_material-9.1.16.tar.gz", hash = "sha256:1021bfea20f00a9423530c8c2ae9be3c78b80f5a527b3f822e6de3d872e5ab79"}, ] [package.dependencies] @@ -1632,4 +1632,4 @@ server = ["fastapi", "sse-starlette", "uvicorn"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "1d809f04ae0543b3476915b5b767e070811908cc75032f8dc8867294cbf0055d" +content-hash = "fabdd2d7dba563fe7b01b4592dfb33e520b5f6e67317ce5f03205ecba396a577" diff --git a/pyproject.toml b/pyproject.toml index eb7d23b..19015b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.22.0"} -mkdocs-material = "^9.1.15" +mkdocs-material = "^9.1.16" pytest = "^7.3.2" httpx = "^0.24.1" scikit-build = "0.17.6" From e37798777e8aed908787f209396190438d724c72 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 20 Jun 2023 11:25:10 -0400 Subject: [PATCH 25/27] Update llama.cpp --- CHANGELOG.md | 4 ++++ llama_cpp/llama_cpp.py | 26 +++++++++++--------------- vendor/llama.cpp | 2 +- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0060af5..a6cb99b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- (llama.cpp) Fix struct misalignment bug + ## [0.1.64] ### Added diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index d6be0ea..a516829 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -150,47 +150,43 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) # struct llama_context_params { +# int seed; // RNG seed, -1 for random # int n_ctx; // text context # int n_batch; // prompt processing batch size # int n_gpu_layers; // number of layers to store in VRAM # int main_gpu; // the GPU that is used for scratch and small tensors # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs -# bool low_vram; // if true, reduce VRAM usage at the cost of performance -# int seed; // RNG seed, -1 for random +# // called with a progress value between 0 and 1, pass NULL to disable +# llama_progress_callback progress_callback; +# // context pointer passed to the progress callback +# void * progress_callback_user_data; +# // Keep the booleans together to avoid misalignment during copy-by-value. 
+# bool low_vram; // if true, reduce VRAM usage at the cost of performance # bool f16_kv; // use fp16 for KV cache # bool logits_all; // the llama_eval() call computes all logits, not just the last one # bool vocab_only; // only load the vocabulary, no weights # bool use_mmap; // use mmap if possible # bool use_mlock; // force system to keep model in RAM # bool embedding; // embedding mode only - - -# // called with a progress value between 0 and 1, pass NULL to disable -# llama_progress_callback progress_callback; -# // context pointer passed to the progress callback -# void * progress_callback_user_data; # }; class llama_context_params(Structure): _fields_ = [ + ("seed", c_int), ("n_ctx", c_int), ("n_batch", c_int), ("n_gpu_layers", c_int), ("main_gpu", c_int), ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), + ("progress_callback", llama_progress_callback), + ("progress_callback_user_data", c_void_p), ("low_vram", c_bool), - ("seed", c_int), ("f16_kv", c_bool), - ( - "logits_all", - c_bool, - ), + ("logits_all", c_bool), ("vocab_only", c_bool), ("use_mmap", c_bool), ("use_mlock", c_bool), ("embedding", c_bool), - ("progress_callback", llama_progress_callback), - ("progress_callback_user_data", c_void_p), ] diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8596af4..2322ec2 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8596af427722775f0df4a7c90b9af067ba90d4ef +Subproject commit 2322ec223a21625dfe9bd73ee677444a98a24ac9 From 3e7eae479631890196823324e0573416408f52a0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 20 Jun 2023 11:25:44 -0400 Subject: [PATCH 26/27] Bump Version --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a6cb99b..d5925bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.65] + ### Added - (llama.cpp) Fix struct misalignment bug diff --git a/pyproject.toml b/pyproject.toml index eb7d23b..dac026c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.64" +version = "0.1.65" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index cc17564..9f27648 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.64", + version="0.1.65", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 282698b6d383e216e129856f25b0ca41348ad525 Mon Sep 17 00:00:00 2001 From: Alexey Date: Fri, 23 Jun 2023 00:19:24 +0400 Subject: [PATCH 27/27] server: pass seed param from command line to llama --- llama_cpp/server/app.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 313e27d..ef319c7 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -30,6 +30,9 @@ class Settings(BaseSettings): ge=0, description="The number of layers to put on the GPU. The rest will be on the CPU.", ) + seed: int = Field( + default=1337, description="Random seed. -1 for random." + ) n_batch: int = Field( default=512, ge=1, description="The batch size to use per eval." 
) @@ -109,6 +112,7 @@ def create_app(settings: Optional[Settings] = None): llama = llama_cpp.Llama( model_path=settings.model, n_gpu_layers=settings.n_gpu_layers, + seed=settings.seed, f16_kv=settings.f16_kv, use_mlock=settings.use_mlock, use_mmap=settings.use_mmap,
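The final patch above adds a `seed` field to the server `Settings` and forwards it into the `Llama` constructor. A short sketch of how the settings-driven entry point from the `__main__.py` patches might be driven programmatically; the model path and field values are examples only:

```python
import uvicorn
from llama_cpp.server.app import Settings, create_app

settings = Settings(
    model="./models/ggml-model-q4_0.bin",  # placeholder model path
    seed=42,          # new field from this patch, forwarded as Llama(seed=...)
    n_gpu_layers=1,
)

app = create_app(settings=settings)
# Mirrors __main__.py: host and port come from the Settings fields added in patch 02.
uvicorn.run(app, host=settings.host, port=settings.port)
```

Because `Settings` is a pydantic `BaseSettings`, the same fields can also be supplied through environment variables, which is consistent with the HOST/PORT environment fallback that patch 03 restores for the Docker use case.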