Merge https://github.com/abetlen/llama-cpp-python

2024-04-05 10:38:53 +05:30 · 2024-04-05 10:38:53 +05:30 · 8b9cd38c0d
commit 8b9cd38c0d
parent d3afd4507f 909ef66951
18 changed files with 569 additions and 85 deletions
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@ -11,7 +11,7 @@ jobs:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
-        os: [ubuntu-latest, windows-latest, macOS-latest]
+        os: [ubuntu-20.04, windows-2019, macos-11]

    steps:
      - uses: actions/checkout@v3
@ -23,19 +23,19 @@ jobs:
        with:
          python-version: "3.8"

-      - name: Install cibuildwheel
-        run: python -m pip install cibuildwheel==2.12.1
-
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install -e .[all]

      - name: Build wheels
-        run: python -m cibuildwheel --output-dir wheelhouse
+        uses: pypa/cibuildwheel@v2.16.5
        env:
          # disable repair
          CIBW_REPAIR_WHEEL_COMMAND: ""
+        with:
+          package-dir: .
+          output-dir: wheelhouse

      - uses: actions/upload-artifact@v3
        with:
--- a/.github/workflows/build-wheels-cuda.yaml
+++ b/.github/workflows/build-wheels-cuda.yaml
@ -0,0 +1,131 @@
+name: Build Wheels (CUDA)
+
+on: workflow_dispatch
+
+permissions:
+  contents: write
+
+jobs:
+  define_matrix:
+    name: Define Build Matrix
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    defaults:
+      run:
+        shell: pwsh
+
+    steps:
+      - name: Define Job Output
+        id: set-matrix
+        run: |
+          $matrix = @{
+              'os' = @('ubuntu-20.04', 'windows-latest')
+              'pyver' = @("3.10", "3.11", "3.12")
+              'cuda' = @("12.1.1", "12.2.2", "12.3.2")
+              'releasetag' = @("basic")
+          }
+
+          $matrixOut = ConvertTo-Json $matrix -Compress
+          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
+
+  build_wheels:
+    name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }}
+    needs: define_matrix
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
+    defaults:
+      run:
+        shell: pwsh
+    env:
+      CUDAVER: ${{ matrix.cuda }}
+      AVXVER: ${{ matrix.releasetag }}
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: "recursive"
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.pyver }}
+
+      - name: Setup Mamba
+        uses: conda-incubator/setup-miniconda@v2.2.0
+        with:
+          activate-environment: "build"
+          python-version: ${{ matrix.pyver }}
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          use-mamba: true
+          add-pip-as-python-dependency: true
+          auto-activate-base: false
+
+      - name: VS Integration Cache
+        id: vs-integration-cache
+        if: runner.os == 'Windows'
+        uses: actions/cache@v3.3.2
+        with:
+          path: ./MSBuildExtensions
+          key: cuda-${{ matrix.cuda }}-vs-integration
+
+      - name: Get Visual Studio Integration
+        if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true'
+        run: |
+          if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER}
+          $links = (Invoke-RestMethod 'https://github.com/Jimver/cuda-toolkit/raw/dc0ca7bb29c5a92f7a963d3d5c93f8d59765136a/src/links/windows-links.ts').Trim().split().where({$_ -ne ''})
+          for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}}
+          Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip'
+          & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null
+          Remove-Item 'cudainstaller.zip'
+
+      - name: Install Visual Studio Integration
+        if: runner.os == 'Windows'
+        run: |
+          $y = (gi '.\MSBuildExtensions').fullname + '\*'
+          (gi 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_})
+          $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_')
+          echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV
+
+      - name: Install Dependencies
+        env:
+          MAMBA_DOWNLOAD_FAILFAST: "0"
+          MAMBA_NO_LOW_SPEED_LIMIT: "1"
+        run: |
+          $cudaVersion = $env:CUDAVER
+          mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion
+          python -m pip install build wheel
+
+      # Points the build at the conda-installed CUDA toolkit, assembles CMAKE_ARGS for the selected AVX variant, then builds the wheel.
+      - name: Build Wheel
+        run: |
+          $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','')
+          $env:CUDA_PATH = $env:CONDA_PREFIX
+          $env:CUDA_HOME = $env:CONDA_PREFIX
+          $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX
+          if ($IsLinux) { $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH }
+          $env:VERBOSE = '1'
+          # LLAMA_CUDA is the flag name the README documents; LLAMA_CUBLAS is the old spelling of the same option
+          $env:CMAKE_ARGS = '-DLLAMA_CUDA_FORCE_MMQ=ON -DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all'
+          # AVXVER comes from matrix.releasetag; a value matching none of the branches leaves CMAKE_ARGS untouched
+          if ($env:AVXVER -eq 'AVX') {
+            $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
+          }
+          if ($env:AVXVER -eq 'AVX512') {
+            $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX512=on'
+          }
+          if ($env:AVXVER -eq 'basic') {
+            $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
+          }
+          python -m build --wheel
+          # expose the compact CUDA version (e.g. 12.1.1 -> 121) to the release step via GITHUB_ENV
+          Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV
+
+      - uses: softprops/action-gh-release@v1
+        with:
+          files: dist/*
+          # Set tag_name to <tag>-cu<cuda_version>
+          tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/build-wheels-metal.yaml
+++ b/.github/workflows/build-wheels-metal.yaml
@ -0,0 +1,96 @@
+name: Build Wheels (Metal)
+
+on: workflow_dispatch
+
+permissions:
+  contents: write
+
+jobs:
+  # Publishes the os x pyver build matrix as JSON for the build_wheels job.
+  define_matrix:
+    name: Define Build Matrix
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    defaults:
+      run:
+        shell: pwsh
+
+    steps:
+      - name: Define Job Output
+        id: set-matrix
+        run: |
+          $matrix = @{
+              'os' = @('macos-11', 'macos-12', 'macos-13')
+              'pyver' = @('3.10', '3.11', '3.12')
+          }
+
+          $matrixOut = ConvertTo-Json $matrix -Compress
+          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
+
+  # Builds arm64 and x86_64 wheels with Metal enabled for each matrix entry, then uploads dist/* to the <tag>-metal release.
+  build_wheels:
+    name: ${{ matrix.os }} Python ${{ matrix.pyver }}
+    needs: define_matrix
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
+    env:
+      OSVER: ${{ matrix.os }}
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: "recursive"
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.pyver }}
+
+      - name: Install Dependencies
+        run: |
+          python -m pip install build wheel cmake
+
+      - name: Build Wheel
+        run: |
+          XCODE15PATH="/Applications/Xcode_15.0.app/Contents/Developer"
+          XCODE15BINPATH="${XCODE15PATH}/Toolchains/XcodeDefault.xctoolchain/usr/bin"
+          export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_METAL=on"
+          [[ "$OSVER" == "macos-13" ]] && export CC="${XCODE15BINPATH}/cc" && export CXX="${XCODE15BINPATH}/c++" && export MACOSX_DEPLOYMENT_TARGET="13.0"
+          [[ "$OSVER" == "macos-12" ]] && export MACOSX_DEPLOYMENT_TARGET="12.0"
+          [[ "$OSVER" == "macos-11" ]] && export MACOSX_DEPLOYMENT_TARGET="11.0"
+
+          export CMAKE_OSX_ARCHITECTURES="arm64" && export ARCHFLAGS="-arch arm64"
+          VERBOSE=1 python -m build --wheel
+
+          if [[ "$OSVER" == "macos-13" ]]; then
+            # Extra arm64 build against the macOS 14 SDK. Run it in a subshell so SDKROOT and the
+            # 14.0 deployment target do not leak into the x86_64 build below (which must stay at 13.0).
+            (
+              export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
+              export MACOSX_DEPLOYMENT_TARGET="14.0"
+              VERBOSE=1 python -m build --wheel
+            )
+          fi
+
+          for file in ./dist/*.whl; do cp "$file" "${file/arm64.whl/aarch64.whl}"; done
+
+          export CMAKE_OSX_ARCHITECTURES="x86_64" && export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_METAL=on" && export ARCHFLAGS="-arch x86_64"
+          VERBOSE=1 python -m build --wheel
+
+          if [[ "$OSVER" == "macos-13" ]]; then
+            # Extra x86_64 build against the macOS 14 SDK, isolated in a subshell like the arm64 one.
+            (
+              export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
+              export MACOSX_DEPLOYMENT_TARGET="14.0"
+              VERBOSE=1 python -m build --wheel
+            )
+          fi
+
+      - uses: softprops/action-gh-release@v1
+        with:
+          files: dist/*
+          # set release name to <tag>-metal
+          tag_name: ${{ github.ref_name }}-metal
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generate-index-from-release.yaml
+++ b/.github/workflows/generate-index-from-release.yaml
@ -0,0 +1,50 @@
+name: Wheels Index
+
+on:
+  # Trigger on any new release
+  release:
+    types: [published]
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
+permissions:
+  contents: read
+  pages: write
+  id-token: write
+
+# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
+# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
+concurrency:
+  group: "pages"
+  cancel-in-progress: false
+
+jobs:
+  # Single deploy job since we're just deploying
+  deploy:
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Setup Pages
+        uses: actions/configure-pages@v4
+      # One index per wheel flavour: <output dir> <release tag pattern> (see scripts/releases-to-pep-503.sh); cu123 covers the CUDA 12.3 wheels.
+      - name: Build
+        run: |
+          ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$'
+          ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$'
+          ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$'
+          ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$'
+          ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$'
+      - name: Upload artifact
+        uses: actions/upload-pages-artifact@v3
+        with:
+          # Upload only the generated index directory
+          path: 'index'
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.2.59]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c
+- feat: Binary wheels for CPU, CUDA (12.1 - 12.3), Metal by @abetlen, @jllllll, and @oobabooga in #1247
+- fix: segfault when logits_all=False by @abetlen in 8649d7671bd1a7c0d9cc6a5ad91c6ca286512ab3
+- fix: last tokens passing to sample_repetition_penalties function by @ymikhailov in #1295
+
+## [0.2.58]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c
+- feat: add support for KV cache quantization options by @Limour-dev in #1307
+- feat: Add logprobs support to chat completions by @windspirit95 in #1311
+- fix: set LLAMA_METAL_EMBED_LIBRARY=on on MacOS arm64 by @bretello in #1289
+- feat: Add tools/functions variables to Jinja2ChatFormatter, add function response formatting for all simple chat formats by @CISC in #1273
+- fix: Changed local API doc references to hosted docs by @lawfordp2017 in #1317
+
 ## [0.2.57]

 - feat: Update llama.cpp to ggerganov/llama.cpp@ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1
@ -24,7 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [0.2.55]

- feat: Update llama.cpp to ggerganov/9731134296af3a6839cd682e51d9c2109a871de5
+- feat: Update llama.cpp to ggerganov/llama.cpp@9731134296af3a6839cd682e51d9c2109a871de5
 - docs: fix small typo in README: 'model know how' -> 'model knows how' by @boegel in #1244

 ## [0.2.54]
--- a/README.md
+++ b/README.md
@ -6,6 +6,7 @@
 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
+[![Github All Releases](https://img.shields.io/github/downloads/abetlen/llama-cpp-python/total.svg?label=Github%20Downloads)]()

 Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library.
 This package provides:
@ -43,6 +44,15 @@ This will also build `llama.cpp` from source and install it alongside this pytho

 If this fails, add `--verbose` to the `pip install` see the full cmake build log.

+**Pre-built Wheel (New)**
+
+It is also possible to install a pre-built wheel with basic CPU support.
+
+```bash
+pip install llama-cpp-python \
+  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+```
+
 ### Installation Configuration

 `llama.cpp` supports a number of hardware acceleration backends to speed up inference as well as backend specific options. See the [llama.cpp README](https://github.com/ggerganov/llama.cpp#build) for a full list.
@ -99,12 +109,36 @@ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-
 </details>

 <details>
-<summary>cuBLAS (CUDA)</summary>
+<summary>CUDA</summary>

-To install with cuBLAS, set the `LLAMA_CUBLAS=on` environment variable before installing:
+To install with CUDA support, set the `LLAMA_CUDA=on` environment variable before installing:

 ```bash
-CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python
+```
+
+**Pre-built Wheel (New)**
+
+It is also possible to install a pre-built wheel with CUDA support, as long as your system meets the following requirements:
+
+- CUDA Version is 12.1, 12.2 or 12.3
+- Python Version is 3.10, 3.11 or 3.12
+
+```bash
+pip install llama-cpp-python \
+  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/<cuda-version>
+```
+
+Where `<cuda-version>` is one of the following:
+- `cu121`: CUDA 12.1
+- `cu122`: CUDA 12.2
+- `cu123`: CUDA 12.3
+
+For example, to install the CUDA 12.1 wheel:
+
+```bash
+pip install llama-cpp-python \
+  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
 ```

 </details>
@ -118,6 +152,18 @@ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable befor
 CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python
 ```

+**Pre-built Wheel (New)**
+
+It is also possible to install a pre-built wheel with Metal support, as long as your system meets the following requirements:
+
+- MacOS Version is 11.0 or later
+- Python Version is 3.10, 3.11 or 3.12
+
+```bash
+pip install llama-cpp-python \
+  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
+```
+
 </details>
 <details>

@ -321,7 +367,7 @@ For OpenAI API v1 compatibility, you use the [`create_chat_completion_openai_v1`

 ### JSON and JSON Schema Mode

-To constrain chat responses to only valid JSON or a specific JSON Schema use the `response_format` argument in [`create_chat_completion`](http://localhost:8000/api-reference/#llama_cpp.Llama.create_chat_completion).
+To constrain chat responses to only valid JSON or a specific JSON Schema use the `response_format` argument in [`create_chat_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).

 #### JSON Mode

@ -529,7 +575,7 @@ llama = Llama(

 ### Embeddings

-To generate text embeddings use [`create_embedding`](http://localhost:8000/api-reference/#llama_cpp.Llama.create_embedding).
+To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding).

 ```python
 import llama_cpp
@ -568,7 +614,7 @@ python3 -m llama_cpp.server --model models/7B/llama-model.gguf
 Similar to Hardware Acceleration section above, you can also install with GPU (cuBLAS) support like this:

 ```bash
-CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]'
+CMAKE_ARGS="-DLLAMA_CUDA=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]'
 python3 -m llama_cpp.server --model models/7B/llama-model.gguf --n_gpu_layers 35
 ```

--- a/llama_cpp/init.py
+++ b/llama_cpp/init.py
@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.57"
+__version__ = "0.2.59"
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@ -730,12 +730,14 @@ class _LlamaSamplingContext:
        if len(self.prev) > 0:
            nl_token = ctx_main.model.token_nl()
            nl_logit = logits_array[nl_token]
-            if self.params.penalty_last_n > 0:
+            last_tokens = self.prev[-self.params.penalty_last_n:]
+            last_tokens_size = min(len(last_tokens), self.params.penalty_last_n)
+            if last_tokens_size > 0:
+                last_tokens_p = (llama_cpp.llama_token * len(last_tokens))(*last_tokens)
                ctx_main.sample_repetition_penalties(
                    token_data_array,
-                    # TODO: Only create this once
-                    (llama_cpp.llama_token * len(self.prev))(*self.prev),
-                    self.params.penalty_last_n,
+                    last_tokens_p,
+                    last_tokens_size,
                    self.params.penalty_repeat,
                    self.params.penalty_freq,
                    self.params.penalty_present,
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@ -105,6 +105,9 @@ class Llama:
        draft_model: Optional[LlamaDraftModel] = None,
        # Tokenizer Override
        tokenizer: Optional[BaseLlamaTokenizer] = None,
+        # KV cache quantization
+        type_k: Optional[int] = None,
+        type_v: Optional[int] = None,
        # Misc
        verbose: bool = True,
        # Extra Params
@ -172,6 +175,8 @@ class Llama:
            draft_model: Optional draft model to use for speculative decoding.
            tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp.
            verbose: Print verbose output to stderr.
+            type_k: KV cache data type for K (default: f16)
+            type_v: KV cache data type for V (default: f16)

        Raises:
            ValueError: If the model path does not exist.
@ -298,7 +303,11 @@ class Llama:
        )  # Must be set to True for speculative decoding
        self.context_params.embeddings = embedding # TODO: Rename to embeddings
        self.context_params.offload_kqv = offload_kqv
-
+        #  KV cache quantization
+        if type_k is not None:
+            self.context_params.type_k = type_k
+        if type_v is not None:
+            self.context_params.type_v = type_v
        # Sampling Params
        self.last_n_tokens_size = last_n_tokens_size

@ -526,14 +535,16 @@ class Llama:
            # Save tokens
            self.input_ids[n_past : n_past + n_tokens] = batch
            # Save logits
-            rows = n_tokens
-            cols = self._n_vocab
-            offset = (
-                0 if self.context_params.logits_all else n_tokens - 1
-            )  # NOTE: Only save the last token logits if logits_all is False
-            self.scores[n_past + offset : n_past + n_tokens, :].reshape(-1)[
-                :
-            ] = self._ctx.get_logits()[offset * cols : rows * cols]
+            if self.context_params.logits_all:
+                rows = n_tokens
+                cols = self._n_vocab
+                logits = self._ctx.get_logits()[: rows * cols]
+                self.scores[n_past : n_past + n_tokens, :].reshape(-1)[: :] = logits
+            else:
+                rows = 1
+                cols = self._n_vocab
+                logits = self._ctx.get_logits()[: rows * cols]
+                self.scores[n_past + n_tokens - 1, :].reshape(-1)[: :] = logits
            # Update n_tokens
            self.n_tokens += n_tokens

@ -1653,6 +1664,7 @@ class Llama:
            top_k=top_k,
            min_p=min_p,
            typical_p=typical_p,
+            logprobs=top_logprobs if logprobs else None,
            stream=stream,
            stop=stop,
            seed=seed,
@ -1723,6 +1735,7 @@ class Llama:
            n_threads=self.context_params.n_threads,
            n_threads_batch=self.context_params.n_threads_batch,
            rope_scaling_type=self.context_params.rope_scaling_type,
+            pooling_type=self.context_params.pooling_type,
            rope_freq_base=self.context_params.rope_freq_base,
            rope_freq_scale=self.context_params.rope_freq_scale,
            yarn_ext_factor=self.context_params.yarn_ext_factor,
@ -1732,6 +1745,7 @@ class Llama:
            yarn_orig_ctx=self.context_params.yarn_orig_ctx,
            logits_all=self.context_params.logits_all,
            embedding=self.context_params.embeddings,
+            offload_kqv=self.context_params.offload_kqv,
            # Sampling Params
            last_n_tokens_size=self.last_n_tokens_size,
            # LoRA Params
@ -1743,51 +1757,17 @@ class Llama:
            # Chat Format Params
            chat_format=self.chat_format,
            chat_handler=self.chat_handler,
+            # Speculative Decoding
+            draft_model=self.draft_model,
+            # KV cache quantization
+            type_k=self.context_params.type_k,
+            type_v=self.context_params.type_v,
            # Misc
            verbose=self.verbose,
        )

    def __setstate__(self, state):
-        self.__init__(
-            model_path=state["model_path"],
-            # Model Params
-            n_gpu_layers=state["n_gpu_layers"],
-            split_mode=state["split_mode"],
-            main_gpu=state["main_gpu"],
-            tensor_split=state["tensor_split"],
-            vocab_only=state["vocab_only"],
-            use_mmap=state["use_mmap"],
-            use_mlock=state["use_mlock"],
-            kv_overrides=state["kv_overrides"],
-            # Context Params
-            seed=state["seed"],
-            n_ctx=state["n_ctx"],
-            n_batch=state["n_batch"],
-            n_threads=state["n_threads"],
-            n_threads_batch=state["n_threads_batch"],
-            rope_freq_base=state["rope_freq_base"],
-            rope_freq_scale=state["rope_freq_scale"],
-            rope_scaling_type=state["rope_scaling_type"],
-            yarn_ext_factor=state["yarn_ext_factor"],
-            yarn_attn_factor=state["yarn_attn_factor"],
-            yarn_beta_fast=state["yarn_beta_fast"],
-            yarn_beta_slow=state["yarn_beta_slow"],
-            yarn_orig_ctx=state["yarn_orig_ctx"],
-            logits_all=state["logits_all"],
-            embedding=state["embedding"],
-            # Sampling Params
-            last_n_tokens_size=state["last_n_tokens_size"],
-            # LoRA Params
-            lora_base=state["lora_base"],
-            lora_path=state["lora_path"],
-            # Backend Params
-            numa=state["numa"],
-            # Chat Format Params
-            chat_format=state["chat_format"],
-            chat_handler=state["chat_handler"],
-            # Misc
-            verbose=state["verbose"],
-        )
+        self.__init__(**state)

    def save_state(self) -> LlamaState:
        assert self._ctx.ctx is not None
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@ -231,6 +231,7 @@ def _convert_text_completion_to_chat(
                    "role": "assistant",
                    "content": completion["choices"][0]["text"],
                },
+                "logprobs": completion["choices"][0]["logprobs"],
                "finish_reason": completion["choices"][0]["finish_reason"],
            }
        ],
@ -254,6 +255,7 @@ def _convert_text_completion_chunks_to_chat(
                        "delta": {
                            "role": "assistant",
                        },
+                        "logprobs": None,
                        "finish_reason": None,
                    }
                ],
@ -273,6 +275,7 @@ def _convert_text_completion_chunks_to_chat(
                        if chunk["choices"][0]["finish_reason"] is None
                        else {}
                    ),
+                    "logprobs": chunk["choices"][0]["logprobs"],
                    "finish_reason": chunk["choices"][0]["finish_reason"],
                }
            ],
@ -487,6 +490,7 @@ def chat_formatter_to_chat_completion_handler(
        temperature: float = 0.2,
        top_p: float = 0.95,
        top_k: int = 40,
+        logprobs: int = 0,
        min_p: float = 0.05,
        typical_p: float = 1.0,
        stream: bool = False,
@ -576,6 +580,7 @@ def chat_formatter_to_chat_completion_handler(
            top_k=top_k,
            min_p=min_p,
            typical_p=typical_p,
+            logprobs=logprobs,
            stream=stream,
            stop=stop,
            seed=seed,
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@ -141,6 +141,70 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa

 byref = ctypes.byref  # type: ignore

+# from ggml.h
+# // NOTE: always add types at the end of the enum to keep backward compatibility
+# enum ggml_type {
+#     GGML_TYPE_F32     = 0,
+#     GGML_TYPE_F16     = 1,
+#     GGML_TYPE_Q4_0    = 2,
+#     GGML_TYPE_Q4_1    = 3,
+#     // GGML_TYPE_Q4_2 = 4, support has been removed
+#     // GGML_TYPE_Q4_3 = 5, support has been removed
+#     GGML_TYPE_Q5_0    = 6,
+#     GGML_TYPE_Q5_1    = 7,
+#     GGML_TYPE_Q8_0    = 8,
+#     GGML_TYPE_Q8_1    = 9,
+#     GGML_TYPE_Q2_K    = 10,
+#     GGML_TYPE_Q3_K    = 11,
+#     GGML_TYPE_Q4_K    = 12,
+#     GGML_TYPE_Q5_K    = 13,
+#     GGML_TYPE_Q6_K    = 14,
+#     GGML_TYPE_Q8_K    = 15,
+#     GGML_TYPE_IQ2_XXS = 16,
+#     GGML_TYPE_IQ2_XS  = 17,
+#     GGML_TYPE_IQ3_XXS = 18,
+#     GGML_TYPE_IQ1_S   = 19,
+#     GGML_TYPE_IQ4_NL  = 20,
+#     GGML_TYPE_IQ3_S   = 21,
+#     GGML_TYPE_IQ2_S   = 22,
+#     GGML_TYPE_IQ4_XS  = 23,
+#     GGML_TYPE_I8      = 24,
+#     GGML_TYPE_I16     = 25,
+#     GGML_TYPE_I32     = 26,
+#     GGML_TYPE_I64     = 27,
+#     GGML_TYPE_F64     = 28,
+#     GGML_TYPE_IQ1_M   = 29,
+#     GGML_TYPE_COUNT,
+# };
+GGML_TYPE_F32 = 0
+GGML_TYPE_F16 = 1
+GGML_TYPE_Q4_0 = 2
+GGML_TYPE_Q4_1 = 3
+GGML_TYPE_Q5_0 = 6
+GGML_TYPE_Q5_1 = 7
+GGML_TYPE_Q8_0 = 8
+GGML_TYPE_Q8_1 = 9
+GGML_TYPE_Q2_K = 10
+GGML_TYPE_Q3_K = 11
+GGML_TYPE_Q4_K = 12
+GGML_TYPE_Q5_K = 13
+GGML_TYPE_Q6_K = 14
+GGML_TYPE_Q8_K = 15
+GGML_TYPE_IQ2_XXS = 16
+GGML_TYPE_IQ2_XS = 17
+GGML_TYPE_IQ3_XXS = 18
+GGML_TYPE_IQ1_S = 19
+GGML_TYPE_IQ4_NL = 20
+GGML_TYPE_IQ3_S = 21
+GGML_TYPE_IQ2_S = 22
+GGML_TYPE_IQ4_XS = 23
+GGML_TYPE_I8 = 24
+GGML_TYPE_I16 = 25
+GGML_TYPE_I32 = 26
+GGML_TYPE_I64 = 27
+GGML_TYPE_F64 = 28
+GGML_TYPE_IQ1_M = 29
+GGML_TYPE_COUNT = 30

 # from ggml-backend.h
 # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
@ -175,8 +239,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E

 # define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 4
-LLAMA_SESSION_VERSION = 4
+# define LLAMA_SESSION_VERSION 5
+LLAMA_SESSION_VERSION = 5


 # struct llama_model;
@ -199,14 +263,18 @@ llama_seq_id = ctypes.c_int32

 # enum llama_vocab_type {
 #     LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-#     LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
-#     LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
-#     LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
+#     LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+#     LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
+#     LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
 # };
 LLAMA_VOCAB_TYPE_NONE = 0
+"""For models without vocab"""
 LLAMA_VOCAB_TYPE_SPM = 1
+"""LLaMA tokenizer based on byte-level BPE with byte fallback"""
 LLAMA_VOCAB_TYPE_BPE = 2
+"""GPT-2 tokenizer based on byte-level BPE"""
 LLAMA_VOCAB_TYPE_WPM = 3
+"""BERT tokenizer based on WordPiece"""


 # // note: these values should be synchronized with ggml_rope
@ -274,6 +342,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
 #     LLAMA_FTYPE_MOSTLY_IQ2_S         = 28, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_IQ2_M         = 29, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_IQ1_M         = 31, // except 1d tensors

 #     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@ -677,6 +746,7 @@ It might not exist for progress report where '.' is output repeatedly."""
 #     bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
 #     bool pure;                           // quantize all tensors to the default type
 #     void * imatrix;                      // pointer to importance matrix data
+#     void * kv_overrides;                 // pointer to vector containing overrides
 # } llama_model_quantize_params;
 class llama_model_quantize_params(ctypes.Structure):
    """Parameters for llama_model_quantize
@ -691,6 +761,7 @@ class llama_model_quantize_params(ctypes.Structure):
        only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
        pure (bool): quantize all tensors to the default type
        imatrix (ctypes.c_void_p): pointer to importance matrix data
+        kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
    """

    _fields_ = [
@ -703,6 +774,7 @@ class llama_model_quantize_params(ctypes.Structure):
        ("only_copy", ctypes.c_bool),
        ("pure", ctypes.c_bool),
        ("imatrix", ctypes.c_void_p),
+        ("kv_overrides", ctypes.c_void_p),
    ]


@ -1838,9 +1910,9 @@ def llama_synchronize(ctx: llama_context_p, /):


 # // Token logits obtained from the last call to llama_decode()
-# // The logits for the last token are stored in the last row
-# // Logits for which llama_batch.logits[i] == 0 are undefined
-# // Rows: n_tokens provided with llama_batch
+# // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+# // in the order they have appeared in the batch.
+# // Rows: number of tokens for which llama_batch.logits[i] != 0
 # // Cols: n_vocab
 # LLAMA_API float * llama_get_logits(struct llama_context * ctx);
@ctypes_function(
@ -1859,7 +1931,8 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]:


 # // Logits for the ith token. Equivalent to:
-# // llama_get_logits(ctx) + i*n_vocab
+# // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+# // returns NULL for invalid ids.
 # LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
@ctypes_function(
    "llama_get_logits_ith",
@ -1874,8 +1947,12 @@ def llama_get_logits_ith(
    ...


-# // Get all output token embeddings
-# // shape: [n_tokens*n_embd] (1-dimensional)
+# // Get all output token embeddings.
+# // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+# // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+# // in the order they have appeared in the batch.
+# // shape: [n_outputs*n_embd]
+# // Otherwise, returns NULL.
 # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
@ctypes_function(
    "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
@ -1886,9 +1963,10 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]
    ...


-# // Get the embeddings for the ith token
-# // llama_get_embeddings(ctx) + i*n_embd
+# // Get the embeddings for the ith token. Equivalent to:
+# // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
 # // shape: [n_embd] (1-dimensional)
+# // returns NULL for invalid ids.
 # LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
@ctypes_function(
    "llama_get_embeddings_ith",
--- a/llama_cpp/llama_types.py
+++ b/llama_cpp/llama_types.py
@ -84,6 +84,7 @@ class ChatCompletionFunction(TypedDict):
 class ChatCompletionResponseChoice(TypedDict):
    index: int
    message: "ChatCompletionResponseMessage"
+    logprobs: Optional[CompletionLogprobs]
    finish_reason: Optional[str]


--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@ -405,6 +405,18 @@ async def create_chat_completion(
                    }
                },
            },
+            "logprobs": {
+                "summary": "Logprobs",
+                "value": {
+                    "model": "gpt-3.5-turbo",
+                    "messages": [
+                        {"role": "system", "content": "You are a helpful assistant."},
+                        {"role": "user", "content": "What is the capital of France?"},
+                    ],
+                    "logprobs": True,
+                    "top_logprobs": 10
+                },
+            },
        }
    ),
    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
@ -493,7 +505,7 @@ async def tokenize(
 ) -> TokenizeInputResponse:
    tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)

-    return {"tokens": tokens}
+    return TokenizeInputResponse(tokens=tokens)


@router.post(
@ -508,7 +520,7 @@ async def count_query_tokens(
 ) -> TokenizeInputCountResponse:
    tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)

-    return {"count": len(tokens)}
+    return TokenizeInputCountResponse(count=len(tokens))


@router.post(
@ -523,4 +535,4 @@ async def detokenize(
 ) -> DetokenizeInputResponse:
    text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8")

-    return {"text": text}
+    return DetokenizeInputResponse(text=text)
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@ -175,6 +175,9 @@ class LlamaProxy:
            chat_handler=chat_handler,
            # Speculative Decoding
            draft_model=draft_model,
+            # KV Cache Quantization
+            type_k=settings.type_k,
+            type_v=settings.type_v,
            # Tokenizer
            tokenizer=tokenizer,
            # Misc
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@ -159,6 +159,15 @@ class ModelSettings(BaseSettings):
        default=10,
        description="Number of tokens to predict using the draft model.",
    )
+    # KV Cache Quantization
+    type_k: Optional[int] = Field(
+        default=None,
+        description="Type of the key cache quantization.",
+    )
+    type_v: Optional[int] = Field(
+        default=None,
+        description="Type of the value cache quantization.",
+    )
    # Misc
    verbose: bool = Field(
        default=True, description="Whether to print debug information."
--- a/llama_cpp/server/types.py
+++ b/llama_cpp/server/types.py
@ -130,7 +130,6 @@ class CreateCompletionRequest(BaseModel):
    presence_penalty: Optional[float] = presence_penalty_field
    frequency_penalty: Optional[float] = frequency_penalty_field
    logit_bias: Optional[Dict[str, float]] = Field(None)
-    logprobs: Optional[int] = Field(None)
    seed: Optional[int] = Field(None)

    # ignored or currently unsupported
@ -209,6 +208,15 @@ class CreateChatCompletionRequest(BaseModel):
        default=None,
        description="The maximum number of tokens to generate. Defaults to inf",
    )
+    logprobs: Optional[bool] = Field(
+        default=False,  # logprobs are opt-in; the description must state this same default
+        description="Whether to output the logprobs or not. Default is False"
+    )
+    top_logprobs: Optional[int] = Field(
+        default=None,
+        ge=0,
+        description="The number of logprobs to generate. If None, no logprobs are generated. logprobs need to set to True.",
+    )
    temperature: float = temperature_field
    top_p: float = top_p_field
    min_p: float = min_p_field
@ -268,7 +276,7 @@ class ModelList(TypedDict):

 class TokenizeInputRequest(BaseModel):
    model: Optional[str] = model_field
-    input: Optional[str] = Field(description="The input to tokenize.")
+    input: str = Field(description="The input to tokenize.")

    model_config = {
        "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]}
--- a/scripts/releases-to-pep-503.sh
+++ b/scripts/releases-to-pep-503.sh
@ -0,0 +1,58 @@
+#!/bin/bash
+# Build a PEP 503 style HTML index that links to the wheel (.whl) assets
+# attached to the GitHub releases of abetlen/llama-cpp-python.
+#
+# Usage: releases-to-pep-503.sh [output_dir] [release_tag_regex]
+#   output_dir         where the index is written (default: index/whl/cpu)
+#   release_tag_regex  extended regex a release tag must match to be listed
+
+output_dir=${1:-"index/whl/cpu"}
+
+# Enter the output directory. Abort on failure so the index files are never
+# written into whatever directory the caller happened to start in.
+mkdir -p "$output_dir" || exit 1
+pushd "$output_dir" || exit 1
+
+# Top-level index: a single link to the project directory.
+echo "<!DOCTYPE html>" > index.html
+echo "<html>" >> index.html
+echo "  <head></head>" >> index.html
+echo "  <body>" >> index.html
+echo "    <a href=\"llama-cpp-python/\">llama-cpp-python</a>" >> index.html
+echo "    <br>" >> index.html
+echo "  </body>" >> index.html
+echo "</html>" >> index.html
+echo "" >> index.html
+
+# Per-project directory that holds the list of wheels.
+mkdir -p llama-cpp-python || exit 1
+pushd llama-cpp-python || exit 1
+
+# Start the project index; it is closed after the release loop below.
+echo "<!DOCTYPE html>" > index.html
+echo "<html>" >> index.html
+echo "  <body>" >> index.html
+echo "    <h1>Links for llama-cpp-python</h1>" >> index.html
+
+# Fetch release tags; the jq filter is quoted so the shell cannot glob it.
+# NOTE(review): this API is presumably paginated, so older releases may be missed - confirm.
+releases=$(curl -s https://api.github.com/repos/abetlen/llama-cpp-python/releases | jq -r '.[].tag_name')
+
+# Keep only matching tags. "$pattern" is quoted because it contains glob characters.
+pattern=${2:-"^[v]?[0-9]+\.[0-9]+\.[0-9]+$"}
+releases=$(printf '%s\n' "$releases" | grep -E -e "$pattern")
+
+# List the wheel assets of each release ($releases is deliberately unquoted: one word per tag).
+for release in $releases; do
+    echo "    <h2>$release</h2>" >> index.html
+    while IFS= read -r asset; do
+        if [[ $asset == *".whl" ]]; then
+            echo "    <a href=\"$asset\">$asset</a>" >> index.html
+            echo "    <br>" >> index.html
+        fi
+    done < <(curl -s "https://api.github.com/repos/abetlen/llama-cpp-python/releases/tags/$release" | jq -r '.assets[].browser_download_url')
+done
+
+echo "  </body>" >> index.html
+echo "</html>" >> index.html
+echo "" >> index.html
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@ -1 +1 @@
-Subproject commit 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652
+Subproject commit 60cdf40cc32f0ad4cb11e0ca8fd38f3b93d8d640