Merge https://github.com/abetlen/llama-cpp-python

2024-04-05 10:38:53 +05:30 · 2024-04-05 10:38:53 +05:30 · 8b9cd38c0d
commit 8b9cd38c0d
parent d3afd4507f 909ef66951
18 changed files with 569 additions and 85 deletions
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@ -11,7 +11,7 @@ jobs:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
-        os: [ubuntu-latest, windows-latest, macOS-latest]
+        os: [ubuntu-20.04, windows-2019, macos-11]
    steps:
      - uses: actions/checkout@v3
@ -23,19 +23,19 @@ jobs:
        with:
          python-version: "3.8"
      - name: Install cibuildwheel
        run: python -m pip install cibuildwheel==2.12.1
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install -e .[all]
      - name: Build wheels
-        run: python -m cibuildwheel --output-dir wheelhouse
+        uses: pypa/cibuildwheel@v2.16.5
        env:
          # disable repair
          CIBW_REPAIR_WHEEL_COMMAND: ""
        with:
          package-dir: .
          output-dir: wheelhouse
      - uses: actions/upload-artifact@v3
        with:
--- a/.github/workflows/build-wheels-cuda.yaml
+++ b/.github/workflows/build-wheels-cuda.yaml
@ -0,0 +1,131 @@
 name: Build Wheels (CUDA)
 on: workflow_dispatch
 permissions:
  contents: write
 jobs:
  # Emits the build matrix as a compact JSON string on the `matrix` job output.
  # The build_wheels job expands it with fromJSON(), so every combination of
  # os x pyver x cuda x releasetag listed below becomes one build.
  define_matrix:
    name: Define Build Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    defaults:
      run:
        shell: pwsh
    steps:
      # Serialises the hashtable with ConvertTo-Json -Compress and appends
      # `matrix=<json>` to GITHUB_OUTPUT.
      - name: Define Job Output
        id: set-matrix
        run: |
          $matrix = @{
              'os' = @('ubuntu-20.04', 'windows-latest')
              'pyver' = @("3.10", "3.11", "3.12")
              'cuda' = @("12.1.1", "12.2.2", "12.3.2")
              'releasetag' = @("basic")
          }
          $matrixOut = ConvertTo-Json $matrix -Compress
          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
  # Builds one CUDA wheel per matrix combination and attaches it to a GitHub
  # release tagged <ref_name>-cu<major><minor>.
  build_wheels:
    # The display name shows 'AVX2' when releasetag is 'wheels'; otherwise it
    # shows the releasetag value itself.
    name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }}
    needs: define_matrix
    runs-on: ${{ matrix.os }}
    strategy:
      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
    defaults:
      run:
        shell: pwsh
    # Read by the pwsh steps below as $env:CUDAVER and $env:AVXVER.
    env:
      CUDAVER: ${{ matrix.cuda }}
      AVXVER: ${{ matrix.releasetag }}
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: "recursive"
      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.pyver }}
      # Creates and activates a conda environment named "build" using Mambaforge.
      - name: Setup Mamba
        uses: conda-incubator/setup-miniconda@v2.2.0
        with:
          activate-environment: "build"
          python-version: ${{ matrix.pyver }}
          miniforge-variant: Mambaforge
          miniforge-version: latest
          use-mamba: true
          add-pip-as-python-dependency: true
          auto-activate-base: false
      # Windows only: cache the extracted MSBuildExtensions directory, keyed by
      # CUDA version.
      - name: VS Integration Cache
        id: vs-integration-cache
        if: runner.os == 'Windows'
        uses: actions/cache@v3.3.2
        with:
          path: ./MSBuildExtensions
          key: cuda-${{ matrix.cuda }}-vs-integration
      # Windows only, on cache miss: find the CUDA installer URL in the pinned
      # Jimver/cuda-toolkit windows-links.ts, download it, and extract only the
      # MSBuildExtensions files with 7-Zip. Version 12.1.1 is looked up as
      # 12.1.0 (presumably the link table has no 12.1.1 entry; confirm).
      - name: Get Visual Studio Integration
        if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true'
        run: |
          if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER}
          $links = (Invoke-RestMethod 'https://github.com/Jimver/cuda-toolkit/raw/dc0ca7bb29c5a92f7a963d3d5c93f8d59765136a/src/links/windows-links.ts').Trim().split().where({$_ -ne ''})
          for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}}
          Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip'
          & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null
          Remove-Item 'cudainstaller.zip'
      # Windows only: copy the extensions into each Visual Studio 2022
      # Enterprise BuildCustomizations directory, then export
      # CUDA_PATH_V<major>_<minor> pointing at the conda prefix via GITHUB_ENV.
      - name: Install Visual Studio Integration
        if: runner.os == 'Windows'
        run: |
          $y = (gi '.\MSBuildExtensions').fullname + '\*'
          (gi 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_})
          $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_')
          echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV
      # Installs the `cuda` package from the nvidia conda label matching
      # CUDAVER, plus the Python `build` and `wheel` packages.
      - name: Install Dependencies
        env:
          MAMBA_DOWNLOAD_FAILFAST: "0"
          MAMBA_NO_LOW_SPEED_LIMIT: "1"
        run: |
          $cudaVersion = $env:CUDAVER
          mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion
          python -m pip install build wheel
      # Points the CUDA path variables at the conda environment, assembles
      # CMAKE_ARGS according to AVXVER, builds the wheel, and exports
      # CUDA_VERSION (CUDAVER without the patch component and dots, e.g.
      # 12.1.1 -> 121) through GITHUB_ENV for the release step below.
      - name: Build Wheel
        run: |
          $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','')
          $env:CUDA_PATH = $env:CONDA_PREFIX
          $env:CUDA_HOME = $env:CONDA_PREFIX
          $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX
          if ($IsLinux) {
            $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH
          }
          $env:VERBOSE = '1'
          $env:CMAKE_ARGS = '-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all'
          $env:CMAKE_ARGS = "-DLLAMA_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS"
          if ($env:AVXVER -eq 'AVX') {
            $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
          }
          if ($env:AVXVER -eq 'AVX512') {
            $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX512=on'
          }
          if ($env:AVXVER -eq 'basic') {
            $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
          }
          python -m build --wheel
          # write the build tag to the output
          Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV
      # Uploads everything in dist/ to the release for the computed tag.
      - uses: softprops/action-gh-release@v1
        with:
          files: dist/*
          # Set tag_name to <tag>-cu<cuda_version>
          tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/build-wheels-metal.yaml
+++ b/.github/workflows/build-wheels-metal.yaml
@ -0,0 +1,87 @@
 name: Build Wheels (Metal)
 on: workflow_dispatch
 permissions:
  contents: write
 jobs:
  # Emits the macOS build matrix (os x pyver) as a compact JSON string on the
  # `matrix` job output; the build_wheels job expands it with fromJSON().
  define_matrix:
    name: Define Build Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    defaults:
      run:
        shell: pwsh
    steps:
      # Serialises the hashtable with ConvertTo-Json -Compress and appends
      # `matrix=<json>` to GITHUB_OUTPUT.
      - name: Define Job Output
        id: set-matrix
        run: |
          $matrix = @{
              'os' = @('macos-11', 'macos-12', 'macos-13')
              'pyver' = @('3.10', '3.11', '3.12')
          }
          $matrixOut = ConvertTo-Json $matrix -Compress
          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
  # Builds Metal-enabled wheels for each macOS runner / Python version pair and
  # attaches them to a GitHub release tagged <ref_name>-metal.
  build_wheels:
    name: ${{ matrix.os }} Python ${{ matrix.pyver }}
    needs: define_matrix
    runs-on: ${{ matrix.os }}
    strategy:
      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
    # The build script reads $OSVER to choose the deployment target and toolchain.
    env:
      OSVER: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: "recursive"
      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.pyver }}
      - name: Install Dependencies
        run: |
          python -m pip install build wheel cmake
      # Build order:
      #   1. arm64 wheel with MACOSX_DEPLOYMENT_TARGET matching the runner
      #      (macos-13 also switches CC/CXX to the Xcode 15.0 toolchain).
      #   2. macos-13 only: a second arm64 build against the MacOSX14.0 SDK
      #      with deployment target 14.0.
      #   3. Every wheel in dist/ is copied with "arm64.whl" in its file name
      #      replaced by "aarch64.whl".
      #   4. x86_64 wheel with AVX/AVX2/FMA/F16C disabled, then on macos-13 a
      #      further build with SDKROOT and deployment target set to 14.0
      #      (both are still exported from step 2, so the first x86_64 build
      #      on macos-13 presumably already targets 14.0; confirm).
      - name: Build Wheel
        run: |
          XCODE15PATH="/Applications/Xcode_15.0.app/Contents/Developer"
          XCODE15BINPATH="${XCODE15PATH}/Toolchains/XcodeDefault.xctoolchain/usr/bin"
          export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_METAL=on"
          [[ "$OSVER" == "macos-13" ]] && export CC="${XCODE15BINPATH}/cc" && export CXX="${XCODE15BINPATH}/c++" && export MACOSX_DEPLOYMENT_TARGET="13.0"
          [[ "$OSVER" == "macos-12" ]] && export MACOSX_DEPLOYMENT_TARGET="12.0"
          [[ "$OSVER" == "macos-11" ]] && export MACOSX_DEPLOYMENT_TARGET="11.0"
          export CMAKE_OSX_ARCHITECTURES="arm64" && export ARCHFLAGS="-arch arm64"
          VERBOSE=1 python -m build --wheel
          if [[ "$OSVER" == "macos-13" ]]; then
            export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
            export MACOSX_DEPLOYMENT_TARGET="14.0"
            VERBOSE=1 python -m build --wheel
          fi
          for file in ./dist/*.whl; do cp "$file" "${file/arm64.whl/aarch64.whl}"; done
          export CMAKE_OSX_ARCHITECTURES="x86_64" && export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_METAL=on" && export ARCHFLAGS="-arch x86_64"
          VERBOSE=1 python -m build --wheel
          if [[ "$OSVER" == "macos-13" ]]; then
            export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
            export MACOSX_DEPLOYMENT_TARGET="14.0"
            VERBOSE=1 python -m build --wheel
          fi
      # Uploads everything in dist/ to the release for the computed tag.
      - uses: softprops/action-gh-release@v1
        with:
          files: dist/*
          # Set tag_name to <tag>-metal
          tag_name: ${{ github.ref_name }}-metal
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generate-index-from-release.yaml
+++ b/.github/workflows/generate-index-from-release.yaml
@ -0,0 +1,48 @@
 name: Wheels Index
 on:
  # Trigger on any new release
  release:
    types: [published]
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
 # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
 permissions:
  contents: read
  pages: write
  id-token: write
 # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
 # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
 concurrency:
  group: "pages"
  cancel-in-progress: false
 jobs:
  # Single deploy job since we're just deploying
  deploy:
    # Generates one package index per wheel flavour under `index/whl/<flavour>`
    # and publishes the `index` directory to GitHub Pages.
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup Pages
        uses: actions/configure-pages@v4
      # Each call passes an output directory and a regular expression; the
      # expression presumably selects the release tags whose assets go into
      # that index (script not shown here; confirm).
      # cu123 is listed because the CUDA wheel workflow builds 12.3.x wheels
      # (tag suffix -cu123) and the README advertises a cu123 index URL.
      - name: Build
        run: |
          ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$'
          ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$'
          ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$'
          ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$'
          ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$'
      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          # Upload only the generated index directory
          path: 'index'
      # The step id is referenced by `environment.url` above.
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 ## [0.2.59]
 - feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c
 - feat: Binary wheels for CPU, CUDA (12.1 - 12.3), Metal by @abetlen, @jllllll, and @oobabooga in #1247
 - fix: segfault when logits_all=False by @abetlen in 8649d7671bd1a7c0d9cc6a5ad91c6ca286512ab3
 - fix: last tokens passing to sample_repetition_penalties function by @ymikhailov in #1295
 ## [0.2.58]
 - feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c
 - feat: add support for KV cache quantization options by @Limour-dev in #1307
 - feat: Add logprobs support to chat completions by @windspirit95 in #1311
 - fix: set LLAMA_METAL_EMBED_LIBRARY=on on MacOS arm64 by @bretello in #1289
 - feat: Add tools/functions variables to Jinja2ChatFormatter, add function response formatting for all simple chat formats by @CISC in #1273
 - fix: Changed local API doc references to hosted by @lawfordp2017 in #1317
 ## [0.2.57]
 - feat: Update llama.cpp to ggerganov/llama.cpp@ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1
@ -24,7 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [0.2.55]
- feat: Update llama.cpp to ggerganov/9731134296af3a6839cd682e51d9c2109a871de5
+- feat: Update llama.cpp to ggerganov/llama.cpp@9731134296af3a6839cd682e51d9c2109a871de5
 - docs: fix small typo in README: 'model know how' -> 'model knows how' by @boegel in #1244
 ## [0.2.54]
--- a/README.md
+++ b/README.md
@ -6,6 +6,7 @@
 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![Github All Releases](https://img.shields.io/github/downloads/abetlen/llama-cpp-python/total.svg?label=Github%20Downloads)]()
 Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library.
 This package provides:
@ -43,6 +44,15 @@ This will also build `llama.cpp` from source and install it alongside this pytho
 If this fails, add `--verbose` to the `pip install` command to see the full cmake build log.
 **Pre-built Wheel (New)**
 It is also possible to install a pre-built wheel with basic CPU support.
 ```bash
 pip install llama-cpp-python \
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
 ```
 ### Installation Configuration
 `llama.cpp` supports a number of hardware acceleration backends to speed up inference as well as backend specific options. See the [llama.cpp README](https://github.com/ggerganov/llama.cpp#build) for a full list.
@ -99,12 +109,36 @@ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-
 </details>
 <details>
-<summary>cuBLAS (CUDA)</summary>
+<summary>CUDA</summary>
-To install with cuBLAS, set the `LLAMA_CUBLAS=on` environment variable before installing:
+To install with CUDA support, set the `LLAMA_CUDA=on` environment variable before installing:
 ```bash
-CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python
 ```
 **Pre-built Wheel (New)**
 It is also possible to install a pre-built wheel with CUDA support, as long as your system meets the following requirements:
 - CUDA Version is 12.1, 12.2 or 12.3
 - Python Version is 3.10, 3.11 or 3.12
 ```bash
 pip install llama-cpp-python \
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/<cuda-version>
 ```
 Where `<cuda-version>` is one of the following:
 - `cu121`: CUDA 12.1
 - `cu122`: CUDA 12.2
 - `cu123`: CUDA 12.3
 For example, to install the CUDA 12.1 wheel:
 ```bash
 pip install llama-cpp-python \
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
 ```
 </details>
@ -118,6 +152,18 @@ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable befor
 CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python
 ```
 **Pre-built Wheel (New)**
 It is also possible to install a pre-built wheel with Metal support, as long as your system meets the following requirements:
 - MacOS Version is 11.0 or later
 - Python Version is 3.10, 3.11 or 3.12
 ```bash
 pip install llama-cpp-python \
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
 ```
 </details>
 <details>
@ -321,7 +367,7 @@ For OpenAI API v1 compatibility, you use the [`create_chat_completion_openai_v1`
 ### JSON and JSON Schema Mode
-To constrain chat responses to only valid JSON or a specific JSON Schema use the `response_format` argument in [`create_chat_completion`](http://localhost:8000/api-reference/#llama_cpp.Llama.create_chat_completion).
+To constrain chat responses to only valid JSON or a specific JSON Schema use the `response_format` argument in [`create_chat_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
 #### JSON Mode
@ -529,7 +575,7 @@ llama = Llama(
 ### Embeddings
-To generate text embeddings use [`create_embedding`](http://localhost:8000/api-reference/#llama_cpp.Llama.create_embedding).
+To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding).
 ```python
 import llama_cpp
@ -568,7 +614,7 @@ python3 -m llama_cpp.server --model models/7B/llama-model.gguf
 Similar to the Hardware Acceleration section above, you can also install with GPU (CUDA) support like this:
 ```bash
-CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]'
+CMAKE_ARGS="-DLLAMA_CUDA=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]'
 python3 -m llama_cpp.server --model models/7B/llama-model.gguf --n_gpu_layers 35
 ```
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
-__version__ = "0.2.57"
+__version__ = "0.2.59"
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@ -730,12 +730,14 @@ class _LlamaSamplingContext:
        if len(self.prev) > 0:
            nl_token = ctx_main.model.token_nl()
            nl_logit = logits_array[nl_token]
-            if self.params.penalty_last_n > 0:
+            last_tokens = self.prev[-self.params.penalty_last_n:]
            last_tokens_size = min(len(last_tokens), self.params.penalty_last_n)
            if last_tokens_size > 0:
                last_tokens_p = (llama_cpp.llama_token * len(last_tokens))(*last_tokens)
                ctx_main.sample_repetition_penalties(
                    token_data_array,
-                    # TODO: Only create this once
+                    last_tokens_p,
-                    (llama_cpp.llama_token * len(self.prev))(*self.prev),
+                    last_tokens_size,
                    self.params.penalty_last_n,
                    self.params.penalty_repeat,
                    self.params.penalty_freq,
                    self.params.penalty_present,
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@ -105,6 +105,9 @@ class Llama:
        draft_model: Optional[LlamaDraftModel] = None,
        # Tokenizer Override
        tokenizer: Optional[BaseLlamaTokenizer] = None,
        # KV cache quantization
        type_k: Optional[int] = None,
        type_v: Optional[int] = None,
        # Misc
        verbose: bool = True,
        # Extra Params
@ -172,6 +175,8 @@ class Llama:
            draft_model: Optional draft model to use for speculative decoding.
            tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp.
            verbose: Print verbose output to stderr.
            type_k: KV cache data type for K (default: f16)
            type_v: KV cache data type for V (default: f16)
        Raises:
            ValueError: If the model path does not exist.
@ -298,7 +303,11 @@ class Llama:
        )  # Must be set to True for speculative decoding
        self.context_params.embeddings = embedding # TODO: Rename to embeddings
        self.context_params.offload_kqv = offload_kqv
-
+        #  KV cache quantization
        if type_k is not None:
            self.context_params.type_k = type_k
        if type_v is not None:
            self.context_params.type_v = type_v
        # Sampling Params
        self.last_n_tokens_size = last_n_tokens_size
@ -526,14 +535,16 @@ class Llama:
            # Save tokens
            self.input_ids[n_past : n_past + n_tokens] = batch
            # Save logits
-            rows = n_tokens
+            if self.context_params.logits_all:
-            cols = self._n_vocab
+                rows = n_tokens
-            offset = (
+                cols = self._n_vocab
-                0 if self.context_params.logits_all else n_tokens - 1
+                logits = self._ctx.get_logits()[: rows * cols]
-            )  # NOTE: Only save the last token logits if logits_all is False
+                self.scores[n_past : n_past + n_tokens, :].reshape(-1)[: :] = logits
-            self.scores[n_past + offset : n_past + n_tokens, :].reshape(-1)[
+            else:
-                :
+                rows = 1
-            ] = self._ctx.get_logits()[offset * cols : rows * cols]
+                cols = self._n_vocab
                logits = self._ctx.get_logits()[: rows * cols]
                self.scores[n_past + n_tokens - 1, :].reshape(-1)[: :] = logits
            # Update n_tokens
            self.n_tokens += n_tokens
@ -1653,6 +1664,7 @@ class Llama:
            top_k=top_k,
            min_p=min_p,
            typical_p=typical_p,
            logprobs=top_logprobs if logprobs else None,
            stream=stream,
            stop=stop,
            seed=seed,
@ -1723,6 +1735,7 @@ class Llama:
            n_threads=self.context_params.n_threads,
            n_threads_batch=self.context_params.n_threads_batch,
            rope_scaling_type=self.context_params.rope_scaling_type,
            pooling_type=self.context_params.pooling_type,
            rope_freq_base=self.context_params.rope_freq_base,
            rope_freq_scale=self.context_params.rope_freq_scale,
            yarn_ext_factor=self.context_params.yarn_ext_factor,
@ -1732,6 +1745,7 @@ class Llama:
            yarn_orig_ctx=self.context_params.yarn_orig_ctx,
            logits_all=self.context_params.logits_all,
            embedding=self.context_params.embeddings,
            offload_kqv=self.context_params.offload_kqv,
            # Sampling Params
            last_n_tokens_size=self.last_n_tokens_size,
            # LoRA Params
@ -1743,51 +1757,17 @@ class Llama:
            # Chat Format Params
            chat_format=self.chat_format,
            chat_handler=self.chat_handler,
            # Speculative Decoding
            draft_model=self.draft_model,
            # KV cache quantization
            type_k=self.context_params.type_k,
            type_v=self.context_params.type_v,
            # Misc
            verbose=self.verbose,
        )
    def __setstate__(self, state):
-        self.__init__(
+        self.__init__(**state)
            model_path=state["model_path"],
            # Model Params
            n_gpu_layers=state["n_gpu_layers"],
            split_mode=state["split_mode"],
            main_gpu=state["main_gpu"],
            tensor_split=state["tensor_split"],
            vocab_only=state["vocab_only"],
            use_mmap=state["use_mmap"],
            use_mlock=state["use_mlock"],
            kv_overrides=state["kv_overrides"],
            # Context Params
            seed=state["seed"],
            n_ctx=state["n_ctx"],
            n_batch=state["n_batch"],
            n_threads=state["n_threads"],
            n_threads_batch=state["n_threads_batch"],
            rope_freq_base=state["rope_freq_base"],
            rope_freq_scale=state["rope_freq_scale"],
            rope_scaling_type=state["rope_scaling_type"],
            yarn_ext_factor=state["yarn_ext_factor"],
            yarn_attn_factor=state["yarn_attn_factor"],
            yarn_beta_fast=state["yarn_beta_fast"],
            yarn_beta_slow=state["yarn_beta_slow"],
            yarn_orig_ctx=state["yarn_orig_ctx"],
            logits_all=state["logits_all"],
            embedding=state["embedding"],
            # Sampling Params
            last_n_tokens_size=state["last_n_tokens_size"],
            # LoRA Params
            lora_base=state["lora_base"],
            lora_path=state["lora_path"],
            # Backend Params
            numa=state["numa"],
            # Chat Format Params
            chat_format=state["chat_format"],
            chat_handler=state["chat_handler"],
            # Misc
            verbose=state["verbose"],
        )
    def save_state(self) -> LlamaState:
        assert self._ctx.ctx is not None
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@ -231,6 +231,7 @@ def _convert_text_completion_to_chat(
                    "role": "assistant",
                    "content": completion["choices"][0]["text"],
                },
                "logprobs": completion["choices"][0]["logprobs"],
                "finish_reason": completion["choices"][0]["finish_reason"],
            }
        ],
@ -254,6 +255,7 @@ def _convert_text_completion_chunks_to_chat(
                        "delta": {
                            "role": "assistant",
                        },
                        "logprobs": None,
                        "finish_reason": None,
                    }
                ],
@ -273,6 +275,7 @@ def _convert_text_completion_chunks_to_chat(
                        if chunk["choices"][0]["finish_reason"] is None
                        else {}
                    ),
                    "logprobs": chunk["choices"][0]["logprobs"],
                    "finish_reason": chunk["choices"][0]["finish_reason"],
                }
            ],
@ -487,6 +490,7 @@ def chat_formatter_to_chat_completion_handler(
        temperature: float = 0.2,
        top_p: float = 0.95,
        top_k: int = 40,
        logprobs: int = 0,
        min_p: float = 0.05,
        typical_p: float = 1.0,
        stream: bool = False,
@ -576,6 +580,7 @@ def chat_formatter_to_chat_completion_handler(
            top_k=top_k,
            min_p=min_p,
            typical_p=typical_p,
            logprobs=logprobs,
            stream=stream,
            stop=stop,
            seed=seed,
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@ -141,6 +141,70 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 byref = ctypes.byref  # type: ignore
 # from ggml.h
 # // NOTE: always add types at the end of the enum to keep backward compatibility
 # enum ggml_type {
 #     GGML_TYPE_F32     = 0,
 #     GGML_TYPE_F16     = 1,
 #     GGML_TYPE_Q4_0    = 2,
 #     GGML_TYPE_Q4_1    = 3,
 #     // GGML_TYPE_Q4_2 = 4, support has been removed
 #     // GGML_TYPE_Q4_3 = 5, support has been removed
 #     GGML_TYPE_Q5_0    = 6,
 #     GGML_TYPE_Q5_1    = 7,
 #     GGML_TYPE_Q8_0    = 8,
 #     GGML_TYPE_Q8_1    = 9,
 #     GGML_TYPE_Q2_K    = 10,
 #     GGML_TYPE_Q3_K    = 11,
 #     GGML_TYPE_Q4_K    = 12,
 #     GGML_TYPE_Q5_K    = 13,
 #     GGML_TYPE_Q6_K    = 14,
 #     GGML_TYPE_Q8_K    = 15,
 #     GGML_TYPE_IQ2_XXS = 16,
 #     GGML_TYPE_IQ2_XS  = 17,
 #     GGML_TYPE_IQ3_XXS = 18,
 #     GGML_TYPE_IQ1_S   = 19,
 #     GGML_TYPE_IQ4_NL  = 20,
 #     GGML_TYPE_IQ3_S   = 21,
 #     GGML_TYPE_IQ2_S   = 22,
 #     GGML_TYPE_IQ4_XS  = 23,
 #     GGML_TYPE_I8      = 24,
 #     GGML_TYPE_I16     = 25,
 #     GGML_TYPE_I32     = 26,
 #     GGML_TYPE_I64     = 27,
 #     GGML_TYPE_F64     = 28,
 #     GGML_TYPE_IQ1_M   = 29,
 #     GGML_TYPE_COUNT,
 # };
 GGML_TYPE_F32 = 0
 GGML_TYPE_F16 = 1
 GGML_TYPE_Q4_0 = 2
 GGML_TYPE_Q4_1 = 3
 GGML_TYPE_Q5_0 = 6
 GGML_TYPE_Q5_1 = 7
 GGML_TYPE_Q8_0 = 8
 GGML_TYPE_Q8_1 = 9
 GGML_TYPE_Q2_K = 10
 GGML_TYPE_Q3_K = 11
 GGML_TYPE_Q4_K = 12
 GGML_TYPE_Q5_K = 13
 GGML_TYPE_Q6_K = 14
 GGML_TYPE_Q8_K = 15
 GGML_TYPE_IQ2_XXS = 16
 GGML_TYPE_IQ2_XS = 17
 GGML_TYPE_IQ3_XXS = 18
 GGML_TYPE_IQ1_S = 19
 GGML_TYPE_IQ4_NL = 20
 GGML_TYPE_IQ3_S = 21
 GGML_TYPE_IQ2_S = 22
 GGML_TYPE_IQ4_XS = 23
 GGML_TYPE_I8 = 24
 GGML_TYPE_I16 = 25
 GGML_TYPE_I32 = 26
 GGML_TYPE_I64 = 27
 GGML_TYPE_F64 = 28
 GGML_TYPE_IQ1_M = 29
 GGML_TYPE_COUNT = 30
 # from ggml-backend.h
 # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
@ -175,8 +239,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E
 # define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 4
+# define LLAMA_SESSION_VERSION 5
-LLAMA_SESSION_VERSION = 4
+LLAMA_SESSION_VERSION = 5
 # struct llama_model;
@ -199,14 +263,18 @@ llama_seq_id = ctypes.c_int32
 # enum llama_vocab_type {
 #     LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-#     LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
+#     LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
-#     LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
+#     LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
-#     LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
+#     LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
 # };
 LLAMA_VOCAB_TYPE_NONE = 0
 """For models without vocab"""
 LLAMA_VOCAB_TYPE_SPM = 1
 """LLaMA tokenizer based on byte-level BPE with byte fallback"""
 LLAMA_VOCAB_TYPE_BPE = 2
 """GPT-2 tokenizer based on byte-level BPE"""
 LLAMA_VOCAB_TYPE_WPM = 3
 """BERT tokenizer based on WordPiece"""
 # // note: these values should be synchronized with ggml_rope
@ -274,6 +342,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
 #     LLAMA_FTYPE_MOSTLY_IQ2_S         = 28, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_IQ2_M         = 29, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_IQ1_M         = 31, // except 1d tensors
 #     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@ -677,6 +746,7 @@ It might not exist for progress report where '.' is output repeatedly."""
 #     bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
 #     bool pure;                           // quantize all tensors to the default type
 #     void * imatrix;                      // pointer to importance matrix data
 #     void * kv_overrides;                 // pointer to vector containing overrides
 # } llama_model_quantize_params;
 class llama_model_quantize_params(ctypes.Structure):
    """Parameters for llama_model_quantize
@ -691,6 +761,7 @@ class llama_model_quantize_params(ctypes.Structure):
        only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
        pure (bool): quantize all tensors to the default type
        imatrix (ctypes.c_void_p): pointer to importance matrix data
        kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
    """
    _fields_ = [
@ -703,6 +774,7 @@ class llama_model_quantize_params(ctypes.Structure):
        ("only_copy", ctypes.c_bool),
        ("pure", ctypes.c_bool),
        ("imatrix", ctypes.c_void_p),
        ("kv_overrides", ctypes.c_void_p),
    ]
@ -1838,9 +1910,9 @@ def llama_synchronize(ctx: llama_context_p, /):
 # // Token logits obtained from the last call to llama_decode()
-# // The logits for the last token are stored in the last row
+# // The logits for which llama_batch.logits[i] != 0 are stored contiguously
-# // Logits for which llama_batch.logits[i] == 0 are undefined
+# // in the order they have appeared in the batch.
-# // Rows: n_tokens provided with llama_batch
+# // Rows: number of tokens for which llama_batch.logits[i] != 0
 # // Cols: n_vocab
 # LLAMA_API float * llama_get_logits(struct llama_context * ctx);
@ctypes_function(
@ -1859,7 +1931,8 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]:
 # // Logits for the ith token. Equivalent to:
-# // llama_get_logits(ctx) + i*n_vocab
+# // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
 # // returns NULL for invalid ids.
 # LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
@ctypes_function(
    "llama_get_logits_ith",
@ -1874,8 +1947,12 @@ def llama_get_logits_ith(
    ...
-# // Get all output token embeddings
+# // Get all output token embeddings.
-# // shape: [n_tokens*n_embd] (1-dimensional)
+# // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
 # // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
 # // in the order they have appeared in the batch.
 # // shape: [n_outputs*n_embd]
 # // Otherwise, returns NULL.
 # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
@ctypes_function(
    "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
@ -1886,9 +1963,10 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]
    ...
-# // Get the embeddings for the ith token
+# // Get the embeddings for the ith token. Equivalent to:
-# // llama_get_embeddings(ctx) + i*n_embd
+# // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
 # // shape: [n_embd] (1-dimensional)
 # // returns NULL for invalid ids.
 # LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
@ctypes_function(
    "llama_get_embeddings_ith",
--- a/llama_cpp/llama_types.py
+++ b/llama_cpp/llama_types.py
@ -84,6 +84,7 @@ class ChatCompletionFunction(TypedDict):
 # Typed dict for a single choice entry of a chat completion response
 # (going by the class name; confirm against the response type that uses it).
 class ChatCompletionResponseChoice(TypedDict):
    index: int
    # String annotation: forward reference to ChatCompletionResponseMessage.
    message: "ChatCompletionResponseMessage"
    # Annotated Optional, so None is an accepted value.
    logprobs: Optional[CompletionLogprobs]
    # Annotated Optional, so None is an accepted value.
    finish_reason: Optional[str]
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@ -405,6 +405,18 @@ async def create_chat_completion(
                    }
                },
            },
            "logprobs": {
                "summary": "Logprobs",
                "value": {
                    "model": "gpt-3.5-turbo",
                    "messages": [
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": "What is the capital of France?"},
                    ],
                    "logprobs": True,
                    "top_logprobs": 10
                },
            },
        }
    ),
    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
@ -493,7 +505,7 @@ async def tokenize(
 ) -> TokenizeInputResponse:
    tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
-    return {"tokens": tokens}
+    return TokenizeInputResponse(tokens=tokens)
@router.post(
@ -508,7 +520,7 @@ async def count_query_tokens(
 ) -> TokenizeInputCountResponse:
    tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
-    return {"count": len(tokens)}
+    return TokenizeInputCountResponse(count=len(tokens))
@router.post(
@ -523,4 +535,4 @@ async def detokenize(
 ) -> DetokenizeInputResponse:
    text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8")
-    return {"text": text}
+    return DetokenizeInputResponse(text=text)
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@ -175,6 +175,9 @@ class LlamaProxy:
            chat_handler=chat_handler,
            # Speculative Decoding
            draft_model=draft_model,
            # KV Cache Quantization
            type_k=settings.type_k,
            type_v=settings.type_v,
            # Tokenizer
            tokenizer=tokenizer,
            # Misc
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@ -159,6 +159,15 @@ class ModelSettings(BaseSettings):
        default=10,
        description="Number of tokens to predict using the draft model.",
    )
    # KV Cache Quantization
    type_k: Optional[int] = Field(
        default=None,
        description="Type of the key cache quantization.",
    )
    type_v: Optional[int] = Field(
        default=None,
        description="Type of the value cache quantization.",
    )
    # Misc
    verbose: bool = Field(
        default=True, description="Whether to print debug information."
--- a/llama_cpp/server/types.py
+++ b/llama_cpp/server/types.py
@ -130,7 +130,6 @@ class CreateCompletionRequest(BaseModel):
    presence_penalty: Optional[float] = presence_penalty_field
    frequency_penalty: Optional[float] = frequency_penalty_field
    logit_bias: Optional[Dict[str, float]] = Field(None)
    logprobs: Optional[int] = Field(None)
    seed: Optional[int] = Field(None)
    # ignored or currently unsupported
@ -209,6 +208,15 @@ class CreateChatCompletionRequest(BaseModel):
        default=None,
        description="The maximum number of tokens to generate. Defaults to inf",
    )
    # Switch for returning token log-probabilities with the chat completion.
    # The declared default is False, so the description must say the same:
    # this text is what API consumers see in the generated schema.
    logprobs: Optional[bool] = Field(
        default=False,
        description="Whether to output the logprobs or not. Default is False",
    )
    top_logprobs: Optional[int] = Field(
        default=None,
        ge=0,
        description="The number of logprobs to generate. If None, no logprobs are generated. logprobs need to set to True.",
    )
    temperature: float = temperature_field
    top_p: float = top_p_field
    min_p: float = min_p_field
@ -268,7 +276,7 @@ class ModelList(TypedDict):
 class TokenizeInputRequest(BaseModel):
    model: Optional[str] = model_field
-    input: Optional[str] = Field(description="The input to tokenize.")
+    input: str = Field(description="The input to tokenize.")
    model_config = {
        "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]}
--- a/scripts/releases-to-pep-503.sh
+++ b/scripts/releases-to-pep-503.sh
@ -0,0 +1,58 @@
 #!/bin/bash
 # Build a PEP 503 style "simple" index of llama-cpp-python wheels from the
 # assets attached to the project's GitHub releases.
 #
 # Usage: releases-to-pep-503.sh [output_dir] [tag_pattern]
 #   output_dir   where the index is written (default: index/whl/cpu)
 #   tag_pattern  extended regex (grep -E) a release tag must match to be
 #                listed (default: X.Y.Z with an optional leading "v")
 # Needs curl and jq on PATH.
 # Base URL of the GitHub REST API for the repository
 repo_api="https://api.github.com/repos/abetlen/llama-cpp-python"
 # Get output directory or default to index/whl/cpu
 output_dir=${1:-"index/whl/cpu"}
 # Create output directory (quoted so paths containing spaces work)
 mkdir -p "$output_dir"
 # Change to output directory; stop here if that fails so the index files
 # are never written into whatever directory the script was started from
 pushd "$output_dir" || exit 1
 # Create an index html file
 {
     echo "<!DOCTYPE html>"
     echo "<html>"
     echo "  <head></head>"
     echo "  <body>"
     echo "    <a href=\"llama-cpp-python/\">llama-cpp-python</a>"
     echo "    <br>"
     echo "  </body>"
     echo "</html>"
     echo ""
 } > index.html
 # Create llama-cpp-python directory
 mkdir -p llama-cpp-python
 # Change to llama-cpp-python directory
 pushd llama-cpp-python || exit 1
 # Create an index html file
 {
     echo "<!DOCTYPE html>"
     echo "<html>"
     echo "  <body>"
     echo "    <h1>Links for llama-cpp-python</h1>"
 } > index.html
 # Get all releases. The jq filter is quoted so the shell never treats its
 # brackets as a glob pattern.
 # NOTE(review): only a single response from the list endpoint is read; if the
 # API paginates this listing, older releases will be missing -- confirm.
 releases=$(curl -s "$repo_api/releases" | jq -r '.[].tag_name')
 # Get pattern from second arg or default to valid python package version pattern
 pattern=${2:-"^[v]?[0-9]+\.[0-9]+\.[0-9]+$"}
 # Filter releases by pattern. jq already prints one tag per line. The pattern
 # is quoted so it is not word-split or glob-expanded against the current
 # directory, and passed via -e so a leading "-" is not read as an option.
 releases=$(echo "$releases" | grep -E -e "$pattern")
 # For each release, get all assets
 for release in $releases; do
     # Download URLs of every asset attached to this release
     asset_urls=$(curl -s "$repo_api/releases/tags/$release" | jq -r '.assets[].browser_download_url')
     echo "    <h2>$release</h2>" >> index.html
     for asset in $asset_urls; do
         # Only wheels belong in the index
         if [[ $asset == *".whl" ]]; then
             echo "    <a href=\"$asset\">$asset</a>" >> index.html
             echo "    <br>" >> index.html
         fi
     done
 done
 # Close the document
 {
     echo "  </body>"
     echo "</html>"
     echo ""
 } >> index.html
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@ -1 +1 @@
-Subproject commit 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652
+Subproject commit 60cdf40cc32f0ad4cb11e0ca8fd38f3b93d8d640
		`@ -1 +1 @@`
			`Subproject commit 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652`				`Subproject commit 60cdf40cc32f0ad4cb11e0ca8fd38f3b93d8d640`