This commit is contained in:
baalajimaestro 2024-05-02 18:13:32 +05:30
commit 1d177aaaef
Signed by: baalajimaestro
GPG key ID: F93C394FE9BBAFD5
27 changed files with 1157 additions and 427 deletions

View file

@ -9,3 +9,7 @@ updates:
directory: "/" # Location of package manifests
schedule:
interval: "weekly"
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"

View file

@ -14,12 +14,12 @@ jobs:
os: [ubuntu-20.04, windows-2019, macos-11]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
submodules: "recursive"
# Used to host cibuildwheel
- uses: actions/setup-python@v3
- uses: actions/setup-python@v5
with:
python-version: "3.8"
@ -29,7 +29,7 @@ jobs:
python -m pip install -e .[all]
- name: Build wheels
uses: pypa/cibuildwheel@v2.16.5
uses: pypa/cibuildwheel@v2.17.0
env:
# disable repair
CIBW_REPAIR_WHEEL_COMMAND: ""
@ -37,11 +37,12 @@ jobs:
package-dir: .
output-dir: wheelhouse
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: wheels-${{ matrix.os }}
path: ./wheelhouse/*.whl
build_arm64_wheels:
build_wheels_arm64:
name: Build arm64 wheels
runs-on: ubuntu-latest
steps:
@ -55,30 +56,30 @@ jobs:
platforms: linux/arm64
- name: Build wheels
uses: pypa/cibuildwheel@v2.16.5
uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_SKIP: "*musllinux* pp*"
CIBW_REPAIR_WHEEL_COMMAND: ""
CIBW_ARCHS: "aarch64"
CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*"
with:
output-dir: wheelhouse/
output-dir: wheelhouse
- name: Upload wheels as artifacts
uses: actions/upload-artifact@v4
with:
name: wheels-${{ matrix.version }}
path: wheelhouse/*.whl
name: wheels_arm64
path: ./wheelhouse/*.whl
build_sdist:
name: Build source distribution
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
submodules: "recursive"
- uses: actions/setup-python@v3
- uses: actions/setup-python@v5
with:
python-version: "3.8"
- name: Install dependencies
@ -88,21 +89,23 @@ jobs:
- name: Build source distribution
run: |
python -m build --sdist
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: sdist
path: ./dist/*.tar.gz
release:
name: Release
needs: [build_wheels, build_arm64_wheels, build_sdist]
needs: [build_wheels, build_wheels_arm64, build_sdist]
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v3
- uses: actions/download-artifact@v4
with:
name: artifact
merge-multiple: true
path: dist
- uses: softprops/action-gh-release@v1
- uses: softprops/action-gh-release@v2
with:
files: dist/*
env:

View file

@ -12,18 +12,18 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
uses: docker/setup-buildx-action@v3
- name: Login to GitHub Container Registry
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
@ -31,7 +31,7 @@ jobs:
- name: Build and push
id: docker_build
uses: docker/build-push-action@v4
uses: docker/build-push-action@v5
with:
context: .
file: "docker/simple/Dockerfile"

View file

@ -22,7 +22,7 @@ jobs:
$matrix = @{
'os' = @('ubuntu-20.04', 'windows-latest')
'pyver' = @("3.10", "3.11", "3.12")
'cuda' = @("12.1.1", "12.2.2", "12.3.2")
'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1")
'releasetag' = @("basic")
}
@ -47,12 +47,12 @@ jobs:
with:
submodules: "recursive"
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.pyver }}
- name: Setup Mamba
uses: conda-incubator/setup-miniconda@v2.2.0
uses: conda-incubator/setup-miniconda@v3.0.4
with:
activate-environment: "build"
python-version: ${{ matrix.pyver }}
@ -65,7 +65,7 @@ jobs:
- name: VS Integration Cache
id: vs-integration-cache
if: runner.os == 'Windows'
uses: actions/cache@v3.3.2
uses: actions/cache@v4.0.2
with:
path: ./MSBuildExtensions
key: cuda-${{ matrix.cuda }}-vs-integration
@ -74,7 +74,7 @@ jobs:
if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true'
run: |
if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER}
$links = (Invoke-RestMethod 'https://github.com/Jimver/cuda-toolkit/raw/dc0ca7bb29c5a92f7a963d3d5c93f8d59765136a/src/links/windows-links.ts').Trim().split().where({$_ -ne ''})
$links = (Invoke-RestMethod 'https://raw.githubusercontent.com/Jimver/cuda-toolkit/master/src/links/windows-links.ts').Trim().split().where({$_ -ne ''})
for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}}
Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip'
& 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null
@ -122,7 +122,7 @@ jobs:
# write the build tag to the output
Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV
- uses: softprops/action-gh-release@v1
- uses: softprops/action-gh-release@v2
with:
files: dist/*
# Set tag_name to <tag>-cu<cuda_version>

View file

@ -41,7 +41,7 @@ jobs:
with:
submodules: "recursive"
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.pyver }}
@ -78,7 +78,7 @@ jobs:
VERBOSE=1 python -m build --wheel
fi
- uses: softprops/action-gh-release@v1
- uses: softprops/action-gh-release@v2
with:
files: dist/*
# set release name to <tag>-metal

View file

@ -31,12 +31,14 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Pages
uses: actions/configure-pages@v4
uses: actions/configure-pages@v5
- name: Build
run: |
./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$'
./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$'
./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$'
./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$'
./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$'
- name: Upload artifact
uses: actions/upload-pages-artifact@v3

View file

@ -16,11 +16,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.8"
- name: Append Dev Version to __version__

View file

@ -10,11 +10,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.8"
- name: Install dependencies

View file

@ -8,11 +8,11 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@ -28,11 +28,11 @@ jobs:
runs-on: windows-latest
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@ -48,11 +48,11 @@ jobs:
runs-on: macos-latest
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies

View file

@ -21,7 +21,7 @@ jobs:
with:
submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@ -40,11 +40,11 @@ jobs:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@ -57,17 +57,17 @@ jobs:
build-macos:
runs-on: macos-latest
runs-on: macos-13
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@ -83,11 +83,11 @@ jobs:
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v3
# - uses: actions/checkout@v4
# with:
# submodules: "recursive"
# - name: Set up Python 3.8
# uses: actions/setup-python@v4
# uses: actions/setup-python@v5
# with:
# python-version: "3.8"
# - name: Set up OpenCL & CLBlast
@ -107,14 +107,14 @@ jobs:
build-macos-metal:
runs-on: macos-latest
runs-on: macos-13
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up Python 3.8
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.8"
- name: Install dependencies

View file

@ -7,6 +7,33 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
## [0.2.68]
- feat: Update llama.cpp to ggerganov/llama.cpp@
- feat: Add option to enable flash_attn to Llama params and ModelSettings by @abetlen in 22d77eefd2edaf0148f53374d0cac74d0e25d06e
- fix(ci): Fix build-and-release.yaml by @Smartappli in #1413
## [0.2.67]
- fix: Ensure image renders before text in chat formats regardless of message content order by @abetlen in 3489ef09d3775f4a87fb7114f619e8ba9cb6b656
- fix(ci): Fix bug in use of upload-artifact failing to merge multiple artifacts into a single release by @abetlen in d03f15bb73a1d520970357b702a9e7d4cc2a7a62
## [0.2.66]
- feat: Update llama.cpp to ggerganov/llama.cpp@8843a98c2ba97a25e93319a104f9ddfaf83ce4c4
- feat: Generic Chat Formats, Tool Calling, and Huggingface Pull Support for Multimodal Models (Obsidian, LLaVA1.6, Moondream) by @abetlen in #1147
- ci(fix): Workflow actions updates and fix arm64 wheels not included in release by @Smartappli in #1392
- ci: Add support for pre-built cuda 12.4.1 wheels by @Smartappli in #1388
- feat: Add support for str type kv_overrides by @abetlen in a411612b385cef100d76145da1fbd02a7b7cc894
- fix: Functionary bug fixes by @jeffrey-fong in #1385
- examples: fix quantize example by @iyubondyrev in #1387
- ci: Update dependabot.yml by @Smartappli in #1391
## [0.2.65]
- feat: Update llama.cpp to ggerganov/llama.cpp@46e12c4692a37bdd31a0432fc5153d7d22bc7f72
- feat: Allow for possibly non-pooled embeddings by @iamlemec in #1380
## [0.2.64]
- feat: Update llama.cpp to ggerganov/llama.cpp@4e96a812b3ce7322a29a3008db2ed73d9087b176

README.md (132 changed lines)
View file

@ -121,7 +121,7 @@ CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python
It is also possible to install a pre-built wheel with CUDA support, as long as your system meets the following requirements:
- CUDA Version is 12.1, 12.2 or 12.3
- CUDA Version is 12.1, 12.2, 12.3, or 12.4
- Python Version is 3.10, 3.11 or 3.12
```bash
@ -133,6 +133,7 @@ Where `<cuda-version>` is one of the following:
- `cu121`: CUDA 12.1
- `cu122`: CUDA 12.2
- `cu123`: CUDA 12.3
- `cu124`: CUDA 12.4
For example, to install the CUDA 12.1 wheel:
@ -276,20 +277,26 @@ The high-level API provides a simple managed interface through the [`Llama`](htt
Below is a short example demonstrating how to use the high-level API for basic text completion:
```python
>>> from llama_cpp import Llama
>>> llm = Llama(
from llama_cpp import Llama
llm = Llama(
model_path="./models/7B/llama-model.gguf",
# n_gpu_layers=-1, # Uncomment to use GPU acceleration
# seed=1337, # Uncomment to set a specific seed
# n_ctx=2048, # Uncomment to increase the context window
)
>>> output = llm(
output = llm(
"Q: Name the planets in the solar system? A: ", # Prompt
max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
echo=True # Echo the prompt back in the output
) # Generate a completion, can also call create_completion
>>> print(output)
print(output)
```
By default `llama-cpp-python` generates completions in an OpenAI compatible format:
```python
{
"id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
"object": "text_completion",
@ -344,12 +351,12 @@ The model will format the messages into a single prompt using the following
Set `verbose=True` to see the selected chat format.
```python
>>> from llama_cpp import Llama
>>> llm = Llama(
from llama_cpp import Llama
llm = Llama(
model_path="path/to/llama-2/llama-model.gguf",
chat_format="llama-2"
)
>>> llm.create_chat_completion(
llm.create_chat_completion(
messages = [
{"role": "system", "content": "You are an assistant who perfectly describes images."},
{
@ -374,9 +381,9 @@ To constrain chat responses to only valid JSON or a specific JSON Schema use the
The following example will constrain the response to valid JSON strings only.
```python
>>> from llama_cpp import Llama
>>> llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
>>> llm.create_chat_completion(
from llama_cpp import Llama
llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
llm.create_chat_completion(
messages=[
{
"role": "system",
@ -396,9 +403,9 @@ The following example will constrain the response to valid JSON strings only.
To constrain the response further to a specific JSON Schema add the schema to the `schema` property of the `response_format` argument.
```python
>>> from llama_cpp import Llama
>>> llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
>>> llm.create_chat_completion(
from llama_cpp import Llama
llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
llm.create_chat_completion(
messages=[
{
"role": "system",
@ -423,9 +430,9 @@ To constrain the response further to a specific JSON Schema add the schema to th
The high-level API supports OpenAI compatible function and tool calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format.
```python
>>> from llama_cpp import Llama
>>> llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling")
>>> llm.create_chat_completion(
from llama_cpp import Llama
llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling")
llm.create_chat_completion(
messages = [
{
"role": "system",
@ -475,54 +482,91 @@ The various gguf-converted files for this set of models can be found [here](http
Due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide an HF tokenizer for functionary. The `LlamaHFTokenizer` class can be initialized and passed into the Llama class. This will override the default llama.cpp tokenizer used in the Llama class. The tokenizer files are already included in the respective HF repositories hosting the gguf files.
```python
>>> from llama_cpp import Llama
>>> from llama_cpp.llama_tokenizer import LlamaHFTokenizer
>>> llm = Llama.from_pretrained(
from llama_cpp import Llama
from llama_cpp.llama_tokenizer import LlamaHFTokenizer
llm = Llama.from_pretrained(
repo_id="meetkai/functionary-small-v2.2-GGUF",
filename="functionary-small-v2.2.q4_0.gguf",
chat_format="functionary-v2",
tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF")
)
```
**NOTE**: There is no need to provide the default system messages used in Functionary as they are added automatically in the Functionary chat handler. Thus, the messages should contain just the chat messages and/or system messages that provide additional context for the model (e.g.: datetime, etc.).
</details>
### Multi-modal Models
`llama-cpp-python` supports the llava1.5 family of multi-modal models which allow the language model to
read information from both text and images.
`llama-cpp-python` supports multi-modal models such as llava1.5, which allow the language model to read information from both text and images.
You'll first need to download one of the available multi-modal models in GGUF format:
- [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
- [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
- [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1)
- [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf)
- [moondream2](https://huggingface.co/vikhyatk/moondream2)
Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
```python
>>> from llama_cpp import Llama
>>> from llama_cpp.llama_chat_format import Llava15ChatHandler
>>> chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
>>> llm = Llama(
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler
chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
llm = Llama(
model_path="./path/to/llava/llama-model.gguf",
chat_handler=chat_handler,
n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
logits_all=True, # needed to make llava work
)
>>> llm.create_chat_completion(
llm.create_chat_completion(
messages = [
{"role": "system", "content": "You are an assistant who perfectly describes images."},
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": "https://.../image.png"}},
{"type" : "text", "text": "Describe this image in detail please."}
{"type" : "text", "text": "What's in this image?"},
{"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" } }
]
}
]
)
```
You can also pull the model from the Hugging Face Hub using the `from_pretrained` method.
```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import MoondreamChatHandler
chat_handler = MoondreamChatHandler.from_pretrained(
repo_id="vikhyatk/moondream2",
filename="*mmproj*",
)
llm = Llama.from_pretrained(
repo_id="vikhyatk/moondream2",
filename="*text-model*",
chat_handler=chat_handler,
n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
)
response = llm.create_chat_completion(
messages = [
{
"role": "user",
"content": [
{"type" : "text", "text": "What's in this image?"},
{"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" } }
]
}
]
)
print(response["choices"][0]["text"])
```
**Note**: Multi-modal models also support tool calling and JSON mode.
<details>
<summary>Loading a Local Image</summary>
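For instance, a local file can be passed as a base64 data URI (a minimal sketch; the helper name, file path, and MIME type are illustrative):

```python
import base64

def image_to_base64_data_uri(file_path: str) -> str:
    # Read the image and encode it as a data URI the chat handler can consume.
    with open(file_path, "rb") as img_file:
        base64_data = base64.b64encode(img_file.read()).decode("utf-8")
    return f"data:image/png;base64,{base64_data}"

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_to_base64_data_uri("./local_image.png")}},
            {"type": "text", "text": "Describe this image in detail please."},
        ],
    }
]
```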
@ -575,7 +619,7 @@ llama = Llama(
### Embeddings
To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding).
To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding) or [`embed`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.embed). Note that you must pass `embedding=True` to the constructor upon model creation for these to work properly.
```python
import llama_cpp
@ -589,6 +633,12 @@ embeddings = llm.create_embedding("Hello, world!")
embeddings = llm.create_embedding(["Hello, world!", "Goodbye, world!"])
```
There are two primary notions of embeddings in a Transformer-style model: *token level* and *sequence level*. Sequence level embeddings are produced by "pooling" token level embeddings together, usually by averaging them or using the first token.
Models that are explicitly geared towards embeddings will usually return sequence level embeddings by default, one for each input string. Non-embedding models such as those designed for text generation will typically return only token level embeddings, one for each token in each sequence. Thus the dimensionality of the return type will be one higher for token level embeddings.
It is possible to control pooling behavior in some cases using the `pooling_type` flag on model creation. You can ensure token level embeddings from any model using `LLAMA_POOLING_TYPE_NONE`. The reverse, getting a generation oriented model to yield sequence level embeddings, is currently not possible, but you can always do the pooling manually.
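As a rough sketch (the model path is a placeholder, and this assumes an embedding-capable GGUF model and a version that exposes the `pooling_type` keyword), token level embeddings can be requested by disabling pooling at load time:

```python
import llama_cpp

llm = llama_cpp.Llama(
    model_path="./models/embedding-model.gguf",  # placeholder path
    embedding=True,
    pooling_type=llama_cpp.LLAMA_POOLING_TYPE_NONE,  # one vector per token
)

token_vectors = llm.embed("Hello, world!")
print(len(token_vectors), len(token_vectors[0]))  # n_tokens x n_embd
```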
### Adjusting the Context Window
The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements.
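For example (model path is a placeholder):

```python
from llama_cpp import Llama

# Load the model with a larger context window.
llm = Llama(model_path="./models/7B/llama-model.gguf", n_ctx=2048)
```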
@ -665,18 +715,18 @@ The entire low-level API can be found in [llama_cpp/llama_cpp.py](https://github
Below is a short example demonstrating how to use the low-level API to tokenize a prompt:
```python
>>> import llama_cpp
>>> import ctypes
>>> llama_cpp.llama_backend_init(False) # Must be called once at the start of each program
>>> params = llama_cpp.llama_context_default_params()
import llama_cpp
import ctypes
llama_cpp.llama_backend_init(False) # Must be called once at the start of each program
params = llama_cpp.llama_context_default_params()
# use bytes for char * params
>>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
>>> ctx = llama_cpp.llama_new_context_with_model(model, params)
>>> max_tokens = params.n_ctx
model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
ctx = llama_cpp.llama_new_context_with_model(model, params)
max_tokens = params.n_ctx
# use ctypes arrays for array params
>>> tokens = (llama_cpp.llama_token * int(max_tokens))()
>>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True))
>>> llama_cpp.llama_free(ctx)
tokens = (llama_cpp.llama_token * int(max_tokens))()
n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True))
llama_cpp.llama_free(ctx)
```
Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.

View file

@ -98,6 +98,8 @@ You'll first need to download one of the available multi-modal models in GGUF fo
- [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
- [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
- [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1)
- [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf)
- [moondream2](https://huggingface.co/vikhyatk/moondream2)
Then, when you run the server, you'll also need to specify the path to the clip model used for image embedding and the `llava-1-5` chat_format.
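A minimal sketch of such an invocation, assuming the `--model`, `--clip_model_path`, and `--chat_format` flags of `llama_cpp.server` (paths are placeholders):

```python
import subprocess
import sys

# Start the OpenAI-compatible server with a llava model and its clip projector.
subprocess.run([
    sys.executable, "-m", "llama_cpp.server",
    "--model", "./models/llava/llama-model.gguf",      # placeholder path
    "--clip_model_path", "./models/llava/mmproj.bin",  # placeholder path
    "--chat_format", "llava-1-5",
])
```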

View file

@ -4,14 +4,16 @@ import llama_cpp
def main(args):
fname_inp = args.fname_inp.encode("utf-8")
fname_out = args.fname_out.encode("utf-8")
if not os.path.exists(fname_inp):
raise RuntimeError(f"Input file does not exist ({fname_inp})")
if os.path.exists(fname_out):
raise RuntimeError(f"Output file already exists ({fname_out})")
fname_inp = args.fname_inp.encode("utf-8")
fname_out = args.fname_out.encode("utf-8")
itype = args.itype
return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, itype)
ftype = args.type
args = llama_cpp.llama_model_quantize_default_params()
args.ftype = ftype
return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, args)
if return_code != 0:
raise RuntimeError("Failed to quantize model")
@ -20,6 +22,7 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("fname_inp", type=str, help="Path to input model")
parser.add_argument("fname_out", type=str, help="Path to output model")
parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1)")
parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1), see llama_cpp.py for enum")
args = parser.parse_args()
main(args)

View file

@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *
__version__ = "0.2.64"
__version__ = "0.2.68"

View file

@ -15,6 +15,7 @@ import numpy.typing as npt
from .llama_types import *
from .llama_grammar import LlamaGrammar
from ._utils import suppress_stdout_stderr
import llama_cpp.llama_cpp as llama_cpp
@ -47,6 +48,7 @@ class _LlamaModel:
if not os.path.exists(path_model):
raise ValueError(f"Model path does not exist: {path_model}")
with suppress_stdout_stderr(disable=verbose):
self.model = llama_cpp.llama_load_model_from_file(
self.path_model.encode("utf-8"), self.params
)
@ -273,6 +275,10 @@ class _LlamaContext:
assert self.ctx is not None
return llama_cpp.llama_n_ctx(self.ctx)
def pooling_type(self) -> int:
assert self.ctx is not None
return llama_cpp.llama_pooling_type(self.ctx)
def kv_cache_clear(self):
assert self.ctx is not None
llama_cpp.llama_kv_cache_clear(self.ctx)
@ -641,6 +647,16 @@ def _should_add_bos(model: _LlamaModel) -> bool:
return llama_cpp.llama_vocab_type(model.model) == llama_cpp.LLAMA_VOCAB_TYPE_SPM
# Embedding functions
def _normalize_embedding(embedding):
norm = float(np.linalg.norm(embedding))
if norm == 0.0:
return embedding
return [v / norm for v in embedding]
# Python wrappers over common/sampling structs

View file

@ -1,13 +1,15 @@
import os
import sys
import sys
from typing import Any, Dict
# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
outnull_file = open(os.devnull, "w")
errnull_file = open(os.devnull, "w")
STDOUT_FILENO = 1
STDERR_FILENO = 2
class suppress_stdout_stderr(object):
# NOTE: these must be "saved" here to avoid exceptions when using
# this context manager inside of a __del__ method
@ -22,12 +24,8 @@ class suppress_stdout_stderr(object):
if self.disable:
return self
# Check if sys.stdout and sys.stderr have fileno method
if not hasattr(self.sys.stdout, 'fileno') or not hasattr(self.sys.stderr, 'fileno'):
return self # Return the instance without making changes
self.old_stdout_fileno_undup = self.sys.stdout.fileno()
self.old_stderr_fileno_undup = self.sys.stderr.fileno()
self.old_stdout_fileno_undup = STDOUT_FILENO
self.old_stderr_fileno_undup = STDERR_FILENO
self.old_stdout_fileno = self.os.dup(self.old_stdout_fileno_undup)
self.old_stderr_fileno = self.os.dup(self.old_stderr_fileno_undup)
@ -47,7 +45,6 @@ class suppress_stdout_stderr(object):
return
# Check if sys.stdout and sys.stderr have fileno method
if hasattr(self.sys.stdout, 'fileno') and hasattr(self.sys.stderr, 'fileno'):
self.sys.stdout = self.old_stdout
self.sys.stderr = self.old_stderr

View file

@ -50,6 +50,7 @@ from ._internals import (
_LlamaTokenDataArray, # type: ignore
_LlamaSamplingParams, # type: ignore
_LlamaSamplingContext, # type: ignore
_normalize_embedding, # type: ignore
)
from ._logger import set_verbose
from ._utils import suppress_stdout_stderr
@ -72,7 +73,7 @@ class Llama:
vocab_only: bool = False,
use_mmap: bool = True,
use_mlock: bool = False,
kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None,
kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None,
# Context Params
seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
n_ctx: int = 512,
@ -91,6 +92,7 @@ class Llama:
logits_all: bool = False,
embedding: bool = False,
offload_kqv: bool = True,
flash_attn: bool = False,
# Sampling Params
last_n_tokens_size: int = 64,
# LoRA Params
@ -167,6 +169,7 @@ class Llama:
logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
embedding: Embedding mode only.
offload_kqv: Offload K, Q, V to GPU.
flash_attn: Use flash attention.
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
lora_path: Path to a LoRA file to apply to the model.
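A minimal usage sketch for the new flag (model path is a placeholder; flash attention also requires a llama.cpp build that supports it):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="./models/7B/llama-model.gguf",  # placeholder path
    n_gpu_layers=-1,
    flash_attn=True,  # newly exposed context parameter
)
```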
@ -253,6 +256,13 @@ class Llama:
elif isinstance(v, float):
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
self._kv_overrides_array[i].value.float_value = v
elif isinstance(v, str): # type: ignore
v_bytes = v.encode("utf-8")
if len(v_bytes) > 128: # TODO: Make this a constant
raise ValueError(f"Value for {k} is too long: {v}")
v_bytes = v_bytes.ljust(128, b"\0")
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
self._kv_overrides_array[i].value.str_value[:128] = v_bytes
else:
raise ValueError(f"Unknown value type for {k}: {v}")
@ -302,6 +312,7 @@ class Llama:
) # Must be set to True for speculative decoding
self.context_params.embeddings = embedding # TODO: Rename to embeddings
self.context_params.offload_kqv = offload_kqv
self.context_params.flash_attn = flash_attn
# KV cache quantization
if type_k is not None:
self.context_params.type_k = type_k
@ -760,7 +771,7 @@ class Llama:
input = input if isinstance(input, list) else [input]
# get numeric embeddings
embeds: List[List[float]]
embeds: Union[List[List[float]], List[List[List[float]]]]
total_tokens: int
embeds, total_tokens = self.embed(input, return_count=True) # type: ignore
@ -787,7 +798,7 @@ class Llama:
def embed(
self,
input: Union[str, List[str]],
normalize: bool = True,
normalize: bool = False,
truncate: bool = True,
return_count: bool = False,
):
@ -803,6 +814,10 @@ class Llama:
n_embd = self.n_embd()
n_batch = self.n_batch
# get pooling information
pooling_type = self.pooling_type()
logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE
if self.context_params.embeddings == False:
raise RuntimeError(
"Llama model must be created with embedding=True to call this method"
@ -820,29 +835,37 @@ class Llama:
self._batch.reset()
# decode and fetch embeddings
data: List[List[float]] = []
data: Union[List[List[float]], List[List[List[float]]]] = []
def decode_batch(n_seq: int):
def decode_batch(seq_sizes: List[int]):
assert self._ctx.ctx is not None
llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
self._ctx.decode(self._batch)
self._batch.reset()
# store embeddings
for i in range(n_seq):
ptr = llama_cpp.llama_get_embeddings_seq(
self._ctx.ctx, i
)
if not ptr:
raise RuntimeError("Failed to get embeddings from sequence pooling type is not set")
if pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE:
pos: int = 0
for i, size in enumerate(seq_sizes):
ptr = llama_cpp.llama_get_embeddings(self._ctx.ctx)
embedding: List[List[float]] = [
ptr[pos + j * n_embd : pos + (j + 1) * n_embd] for j in range(size)
]
if normalize:
embedding = [_normalize_embedding(e) for e in embedding]
data.append(embedding)
pos += size
else:
for i in range(len(seq_sizes)):
ptr = llama_cpp.llama_get_embeddings_seq(self._ctx.ctx, i)
embedding: List[float] = ptr[:n_embd]
if normalize:
norm = float(np.linalg.norm(embedding))
embedding = [v / norm for v in embedding]
embedding = _normalize_embedding(embedding)
data.append(embedding)
# init state
total_tokens = 0
s_batch = []
t_batch = 0
p_batch = 0
@ -863,17 +886,21 @@ class Llama:
# time to eval batch
if t_batch + n_tokens > n_batch:
decode_batch(p_batch)
decode_batch(s_batch)
s_batch = []
t_batch = 0
p_batch = 0
# add to batch
self._batch.add_sequence(tokens, p_batch, False)
self._batch.add_sequence(tokens, p_batch, logits_all)
# update batch stats
s_batch.append(n_tokens)
t_batch += n_tokens
p_batch += 1
# handle last batch
decode_batch(p_batch)
decode_batch(s_batch)
if self.verbose:
llama_cpp.llama_print_timings(self._ctx.ctx)
@ -1750,6 +1777,7 @@ class Llama:
logits_all=self.context_params.logits_all,
embedding=self.context_params.embeddings,
offload_kqv=self.context_params.offload_kqv,
flash_attn=self.context_params.flash_attn,
# Sampling Params
last_n_tokens_size=self.last_n_tokens_size,
# LoRA Params
@ -1845,6 +1873,10 @@ class Llama:
"""Return the newline token."""
return self._model.token_nl()
def pooling_type(self) -> int:
"""Return the pooling type."""
return self._ctx.pooling_type()
@staticmethod
def logits_to_logprobs(
logits: Union[npt.NDArray[np.single], List], axis: int = -1

View file

@ -6,6 +6,8 @@ import ctypes
import dataclasses
import random
import string
from contextlib import ExitStack
from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol, cast
import jinja2
@ -1828,15 +1830,23 @@ def functionary_v1_v2_chat_handler(
version: Literal["v1", "v2"],
functions: Optional[List[llama_types.ChatCompletionFunctions]] = None,
tools: Optional[List[llama_types.ChatCompletionTool]] = None,
tool_choice: Union[Dict, str] = "auto",
):
all_messages: List[llama_types.ChatCompletionRequestMessage] = []
if tool_choice == "none":
all_messages.append(
llama_types.ChatCompletionRequestSystemMessage(
role="system", content=generate_schema_from_functions([])
)
)
else:
if functions is not None:
all_messages.append(
llama_types.ChatCompletionRequestSystemMessage(
role="system", content=generate_schema_from_functions(functions)
)
)
elif tools is not None:
elif tools is not None and tool_choice != "none":
all_messages.append(
llama_types.ChatCompletionRequestSystemMessage(
role="system",
@ -1888,7 +1898,7 @@ def functionary_v1_v2_chat_handler(
function_call = "auto"
prompt = prepare_messages_for_inference(
messages, tokenizer, version, functions, tools
messages, tokenizer, version, functions, tools, function_call
)
# If no tools/functions are provided
@ -1985,17 +1995,12 @@ def functionary_v1_v2_chat_handler(
content = ""
function_calls, function_bodies = [], []
completion_tokens = 0
if version == "v1":
# If no or "auto" tool_choice/function_call
if isinstance(function_call, str) and function_call == "auto":
stops = ["\n", END_ASSISTANT_TOKEN]
# If tool_choice/function_call is "none"
elif isinstance(function_call, str) and function_call == "none":
prompt = prepare_messages_for_inference(
messages, tokenizer, version, [], []
)
stops = END_ASSISTANT_TOKEN
# If tool_choice/function_call is provided
elif isinstance(function_call, dict):
prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n"
@ -2009,12 +2014,15 @@ def functionary_v1_v2_chat_handler(
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
completion_tokens += completion["usage"]["completion_tokens"]
# If the generation does not involve a function call
if (
START_FUNCTION_CALL_TOKEN not in prompt
and START_FUNCTION_CALL_TOKEN not in completion_text
):
completion["usage"]["completion_tokens"] = completion_tokens
return _convert_completion_to_chat(completion, stream=stream) # type: ignore
# If the generation involves a function call in completion, generate the parameters
elif (
@ -2032,23 +2040,14 @@ def functionary_v1_v2_chat_handler(
)
grammar = get_grammar(function_calls[-1])
completion = create_completion(stop=END_FUNCTION_CALL_TOKEN)
completion_tokens += completion["usage"]["completion_tokens"]
function_bodies.append(completion["choices"][0]["text"].strip())
# If the prompt involves a function call, just append generated parameters to function_bodies
else:
function_bodies.append(completion_text.strip())
else:
# If tool_choice/function_call is "none"
if isinstance(function_call, str) and function_call == "none":
prompt = (
prepare_messages_for_inference(messages, tokenizer, version, [], [])
+ "all\n<|content|>"
)
stops = [STOP_TOKEN, FROM_TOKEN]
completion = create_completion(stop=stops)
completion["choices"][0]["text"] = completion["choices"][0]["text"].strip()
return _convert_completion_to_chat(completion, stream=stream) # type: ignore
# If tool_choice/function_call is provided
elif isinstance(function_call, dict):
if isinstance(function_call, dict):
prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
function_call = function_call["name"]
function_calls.append(function_call)
@ -2056,6 +2055,7 @@ def functionary_v1_v2_chat_handler(
stops = [STOP_TOKEN, FROM_TOKEN]
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
completion_tokens += completion["usage"]["completion_tokens"]
function_bodies.append(completion_text.strip())
# If "auto" or no tool_choice/function_call
elif isinstance(function_call, str) and function_call == "auto":
@ -2065,6 +2065,7 @@ def functionary_v1_v2_chat_handler(
stops = CONTENT_TOKEN
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
completion_tokens += completion["usage"]["completion_tokens"]
function_name = completion_text.strip()
if function_name == "all":
prompt += "all\n<|content|>"
@ -2077,12 +2078,23 @@ def functionary_v1_v2_chat_handler(
stops = [RECIPIENT_TOKEN, STOP_TOKEN]
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
completion_tokens += completion["usage"]["completion_tokens"]
if function_name == "all":
content += completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n")
if completion_text.endswith("\n<|from|>assistant\n"):
content += completion_text[:-len("\n<|from|>assistant\n")]
elif completion_text.endswith("\n<|from|> assistant\n"):
content += completion_text[:-len("\n<|from|> assistant\n")]
else:
content += completion_text
content = content.lstrip()
# Check whether the model wants to generate another turn
if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
cleaned_completion_text = completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n").strip()
if completion_text.endswith("\n<|from|>assistant\n"):
cleaned_completion_text = completion_text[:-len("\n<|from|>assistant\n")].strip()
elif completion_text.endswith("\n<|from|> assistant\n"):
cleaned_completion_text = completion_text[:-len("\n<|from|> assistant\n")].strip()
else:
cleaned_completion_text = completion_text.strip()
prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
else:
break
@ -2092,6 +2104,7 @@ def functionary_v1_v2_chat_handler(
prompt += completion_text.strip()
grammar = None
completion = create_completion(stop=stops)
completion_tokens += completion["usage"]["completion_tokens"]
if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
prompt += "\n<|from|>assistant\n<|recipient|>"
else:
@ -2120,12 +2133,16 @@ def functionary_v1_v2_chat_handler(
)
# TODO: support stream mode
function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = {
"function_call": {
function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = {}
if len(tool_calls) > 0:
if tools is not None:
function_call_dict["tool_calls"] = tool_calls
else:
function_call_dict["function_call"] = {
"name": tool_calls[0]["function"]["name"],
"arguments": tool_calls[0]["function"]["arguments"],
}
} if len(tool_calls) == 1 else {}
completion["usage"]["completion_tokens"] = completion_tokens
return llama_types.CreateChatCompletionResponse(
id="chat" + completion["id"],
object="chat.completion",
@ -2138,7 +2155,6 @@ def functionary_v1_v2_chat_handler(
"message": {
"role": "assistant",
"content": None if content == "" else content,
"tool_calls": tool_calls,
**function_call_dict,
},
"finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop",
@ -2149,42 +2165,86 @@ def functionary_v1_v2_chat_handler(
class Llava15ChatHandler:
_clip_free = None
DEFAULT_SYSTEM_MESSAGE = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
def __init__(self, clip_model_path: str, verbose: bool = False):
CHAT_FORMAT = (
"{% for message in messages %}"
"{% if message.role == 'system' %}"
"{{ message.content }}"
"{% endif %}"
"{% if message.role == 'user' %}"
"{% if message.content is string %}"
"\nUSER: {{ message.content }}"
"{% endif %}"
"{% if message.content is iterable %}"
"\nUSER: "
"{% for content in message.content %}"
"{% if content.type == 'image_url' and content.image_url is string %}"
"{{ content.image_url }}"
"{% endif %}"
"{% if content.type == 'image_url' and content.image_url is mapping %}"
"{{ content.image_url.url }}"
"{% endif %}"
"{% endfor %}"
"{% for content in message.content %}"
"{% if content.type == 'text' %}"
"{{ content.text }}"
"{% endif %}"
"{% endfor %}"
"{% endif %}"
"{% endif %}"
"{% if message.role == 'assistant' and message.content is not none %}"
"\nASSISTANT: {{ message.content }}"
"{% endif %}"
"{% endfor %}"
"{% if add_generation_prompt %}"
"\nASSISTANT: "
"{% endif %}"
)
def __init__(self, clip_model_path: str, verbose: bool = True):
import llama_cpp.llava_cpp as llava_cpp
self._llava_cpp = llava_cpp
self.clip_model_path = clip_model_path
self.verbose = verbose
self._clip_free = self._llava_cpp._libllava.clip_free # type: ignore
self._llava_cpp = llava_cpp # TODO: Fix
self._exit_stack = ExitStack()
self._last_image_embed: Optional[llava_cpp.CtypesPointer[llava_cpp.llava_image_embed]] = None
self._last_image_hash: Optional[int] = None
if not os.path.exists(clip_model_path):
raise ValueError(f"Clip model path does not exist: {clip_model_path}")
with suppress_stdout_stderr(disable=self.verbose):
self.clip_ctx = self._llava_cpp.clip_model_load(
clip_ctx = self._llava_cpp.clip_model_load(
self.clip_model_path.encode(), 0
)
def __del__(self):
if clip_ctx is None:
raise ValueError(f"Failed to load clip model: {clip_model_path}")
self.clip_ctx = clip_ctx
def clip_free():
with suppress_stdout_stderr(disable=self.verbose):
if self.clip_ctx is not None and self._clip_free is not None:
self._clip_free(self.clip_ctx)
self.clip_ctx = None
self._llava_cpp.clip_free(self.clip_ctx)
self._exit_stack.callback(clip_free)
def last_image_embed_free():
with suppress_stdout_stderr(disable=self.verbose):
if self._last_image_embed is not None:
self._llava_cpp.llava_image_embed_free(self._last_image_embed)
self._last_image_embed = None
self._exit_stack.callback(last_image_embed_free)
def load_image(self, image_url: str) -> bytes:
if image_url.startswith("data:"):
import base64
image_bytes = base64.b64decode(image_url.split(",")[1])
return image_bytes
else:
import urllib.request
with urllib.request.urlopen(image_url) as f:
image_bytes = f.read()
return image_bytes
return self._load_image(image_url)
def __call__(
self,
@ -2202,6 +2262,7 @@ class Llava15ChatHandler:
typical_p: float = 1.0,
stream: bool = False,
stop: Optional[Union[str, List[str]]] = [],
seed: Optional[int] = None,
response_format: Optional[
llama_types.ChatCompletionRequestResponseFormat
] = None,
@ -2216,68 +2277,54 @@ class Llava15ChatHandler:
model: Optional[str] = None,
logits_processor: Optional[llama.LogitsProcessorList] = None,
grammar: Optional[llama.LlamaGrammar] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
**kwargs, # type: ignore
) -> Union[
llama_types.CreateChatCompletionResponse,
Iterator[llama_types.CreateChatCompletionStreamResponse],
]:
assert (
llama.context_params.logits_all is True
) # BUG: logits_all=True is required for llava
assert self.clip_ctx is not None
system_prompt = _get_system_message(messages)
system_prompt = (
system_prompt
if system_prompt != ""
else "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
)
user_role = "\nUSER:"
assistant_role = "\nASSISTANT:"
llama.reset()
llama.eval(llama.tokenize(system_prompt.encode("utf8"), add_bos=True))
for message in messages:
if message["role"] == "user" and message["content"] is not None:
if isinstance(message["content"], str):
llama.eval(
llama.tokenize(
f"{user_role} {message['content']}".encode("utf8"),
add_bos=False,
)
)
else:
assert isinstance(message["content"], list)
llama.eval(
llama.tokenize(f"{user_role} ".encode("utf8"), add_bos=False)
)
for content in message["content"]:
if content["type"] == "text":
llama.eval(
llama.tokenize(
f"{content['text']}".encode("utf8"), add_bos=False
)
)
if content["type"] == "image_url":
image_bytes = (
self.load_image(content["image_url"]["url"])
if isinstance(content["image_url"], dict)
else self.load_image(content["image_url"])
)
import array
data_array = array.array("B", image_bytes)
c_ubyte_ptr = (
ctypes.c_ubyte * len(data_array)
).from_buffer(data_array)
system_prompt = _get_system_message(messages)
if system_prompt == "":
messages = [llama_types.ChatCompletionRequestSystemMessage(role="system", content=self.DEFAULT_SYSTEM_MESSAGE)] + messages
image_urls = self.get_image_urls(messages)
template = jinja2.Template(self.CHAT_FORMAT)
text = template.render(messages=messages, add_generation_prompt=True)
split_text = self.split_text_on_image_urls(text, image_urls)
def embed_image_bytes(image_bytes: bytes):
if self._last_image_embed is not None and self._last_image_hash is not None and hash(image_bytes) == self._last_image_hash:
return self._last_image_embed
with suppress_stdout_stderr(disable=self.verbose):
embed = (
self._llava_cpp.llava_image_embed_make_with_bytes(
self.clip_ctx,
llama.context_params.n_threads,
c_ubyte_ptr,
llama.context_params.n_threads_batch,
(ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)),
len(image_bytes),
)
)
try:
self._last_image_embed = embed
self._last_image_hash = hash(image_bytes)
return embed
# Evaluate prompt
llama.reset()
for i, (type_, value) in enumerate(split_text):
if type_ == "text":
tokens = llama.tokenize(value.encode("utf8"), add_bos=i == 0)
if llama.n_tokens + len(tokens) > llama.n_ctx():
raise ValueError("Prompt exceeds n_ctx") # TODO: Fix
llama.eval(tokens)
else:
image_bytes = self.load_image(value)
embed = embed_image_bytes(image_bytes)
if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx():
raise ValueError("Prompt exceeds n_ctx") # TODO: Fix
n_past = ctypes.c_int(llama.n_tokens)
n_past_p = ctypes.pointer(n_past)
with suppress_stdout_stderr(disable=self.verbose):
@ -2287,36 +2334,66 @@ class Llava15ChatHandler:
llama.n_batch,
n_past_p,
)
assert llama.n_ctx() >= n_past.value
llama.n_tokens = n_past.value
finally:
with suppress_stdout_stderr(disable=self.verbose):
self._llava_cpp.llava_image_embed_free(embed)
if message["role"] == "assistant" and message["content"] is not None:
llama.eval(
llama.tokenize(
f"ASSISTANT: {message['content']}".encode("utf8"), add_bos=False
)
)
assert llama.n_ctx() >= llama.n_tokens
llama.eval(llama.tokenize(f"{assistant_role}".encode("utf8"), add_bos=False))
assert llama.n_ctx() >= llama.n_tokens
# Get prompt tokens to avoid a cache miss
prompt = llama.input_ids[: llama.n_tokens].tolist()
if response_format is not None and response_format["type"] == "json_object":
grammar = _grammar_for_response_format(response_format)
return _convert_completion_to_chat(
llama.create_completion(
# Convert legacy functions to tools
if functions is not None:
tools = [
{
"type": "function",
"function": function,
}
for function in functions
]
# Convert legacy function_call to tool_choice
if function_call is not None:
if isinstance(function_call, str) and (
function_call == "none" or function_call == "auto"
):
tool_choice = function_call
if isinstance(function_call, dict) and "name" in function_call:
tool_choice = {
"type": "function",
"function": {
"name": function_call["name"],
},
}
tool = None
if tool_choice is not None and isinstance(tool_choice, dict) and tools is not None:
name = tool_choice["function"]["name"]
tool = next((t for t in tools if t["function"]["name"] == name), None)
if tool is None:
raise ValueError(f"Tool choice '{name}' not found in tools.")
schema = tool["function"]["parameters"]
try:
# create grammar from json schema
grammar = llama_grammar.LlamaGrammar.from_json_schema(
json.dumps(schema), verbose=llama.verbose
)
except Exception as e:
grammar = llama_grammar.LlamaGrammar.from_string(
llama_grammar.JSON_GBNF, verbose=llama.verbose
)
completion_or_chunks = llama.create_completion(
prompt=prompt,
temperature=temperature,
top_p=top_p,
top_k=top_k,
min_p=min_p,
typical_p=typical_p,
logprobs=top_logprobs if logprobs else None,
stream=stream,
stop=stop,
seed=seed,
max_tokens=max_tokens,
presence_penalty=presence_penalty,
frequency_penalty=frequency_penalty,
@ -2328,8 +2405,370 @@ class Llava15ChatHandler:
model=model,
logits_processor=logits_processor,
grammar=grammar,
),
stream=stream,
logit_bias=logit_bias,
)
if tool is not None:
tool_name = tool["function"]["name"]
return _convert_completion_to_chat_function(
tool_name, completion_or_chunks, stream
)
return _convert_completion_to_chat(completion_or_chunks, stream=stream)
@staticmethod
def _load_image(image_url: str) -> bytes:
# TODO: Add Pillow support for other image formats beyond (jpg, png)
if image_url.startswith("data:"):
import base64
image_bytes = base64.b64decode(image_url.split(",")[1])
return image_bytes
else:
import urllib.request
with urllib.request.urlopen(image_url) as f:
image_bytes = f.read()
return image_bytes
@staticmethod
def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]):
image_urls: List[str] = []
for message in messages:
if message["role"] == "user":
if message["content"] is None:
continue
for content in message["content"]:
if isinstance(content, dict) and "type" in content:
if content["type"] == "image_url":
if (
isinstance(content["image_url"], dict)
and "url" in content["image_url"]
):
image_urls.append(content["image_url"]["url"])
else:
image_urls.append(content["image_url"])
return image_urls
@staticmethod
def split_text_on_image_urls(text: str, image_urls: List[str]):
def find_first(s: str, substrs: List[str]):
for i, substr in enumerate(substrs):
pos = s.find(substr)
if pos != -1:
return pos, i
return None, None
split_text: List[Tuple[Literal["text", "image_url"], str]] = []
remaining = text
while remaining:
# Find first image_url
pos, i = find_first(remaining, image_urls)
if pos is not None and i is not None:
if pos > 0:
split_text.append(("text", remaining[:pos]))
split_text.append(("image_url", image_urls[i]))
remaining = remaining[pos + len(image_urls[i]) :]
else:
split_text.append(("text", remaining))
remaining = ""
return split_text
@classmethod
def from_pretrained(
cls,
repo_id: str,
filename: Optional[str],
local_dir: Optional[Union[str, os.PathLike[str]]] = None,
local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
cache_dir: Optional[Union[str, os.PathLike[str]]] = None,
**kwargs: Any,
) -> "Llava15ChatHandler":
import fnmatch
from pathlib import Path
try:
from huggingface_hub import hf_hub_download, HfFileSystem # type: ignore
from huggingface_hub.utils import validate_repo_id # type: ignore
except ImportError:
raise ImportError(
"Llama.from_pretrained requires the huggingface-hub package. "
"You can install it with `pip install huggingface-hub`."
)
validate_repo_id(repo_id)
hffs = HfFileSystem()
files = [
file["name"] if isinstance(file, dict) else file
for file in hffs.ls(repo_id) # type: ignore
]
# split each file into repo_id, subfolder, filename
file_list: List[str] = []
for file in files:
rel_path = Path(file).relative_to(repo_id)
file_list.append(str(rel_path))
matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)] # type: ignore
if len(matching_files) == 0:
raise ValueError(
f"No file found in {repo_id} that match {filename}\n\n"
f"Available Files:\n{json.dumps(file_list)}"
)
if len(matching_files) > 1:
raise ValueError(
f"Multiple files found in {repo_id} matching {filename}\n\n"
f"Available Files:\n{json.dumps(files)}"
)
(matching_file,) = matching_files
subfolder = str(Path(matching_file).parent)
filename = Path(matching_file).name
# download the file
hf_hub_download(
repo_id=repo_id,
filename=filename,
subfolder=subfolder,
local_dir=cast(Union[str, Path, None], local_dir),
local_dir_use_symlinks=local_dir_use_symlinks,
cache_dir=cast(Union[str, Path, None], cache_dir),
)
if local_dir is None:
model_path = hf_hub_download(
repo_id=repo_id,
filename=filename,
subfolder=subfolder,
local_dir=local_dir,
local_dir_use_symlinks=local_dir_use_symlinks,
cache_dir=cast(Union[str, Path, None], cache_dir),
local_files_only=True,
)
else:
model_path = os.path.join(local_dir, filename)
return cls(
clip_model_path=model_path,
**kwargs,
)
class ObsidianChatHandler(Llava15ChatHandler):
# Prompt Format
# The model follows the ChatML format, however with ### as the separator
# <|im_start|>user
# What is this sign about?\n<image>
# ###
# <|im_start|>assistant
# The sign is about bullying, and it is placed on a black background with a red background.
# ###
CHAT_FORMAT = (
"{% for message in messages %}"
# System message
"{% if message.role == 'system' %}"
"<|im_start|>system\n"
"{{ message.content }}\n"
"###\n"
"{% endif %}"
# User message
"{% if message.role == 'user' %}"
"<|im_start|>user\n"
"{% if message.content is string %}"
"{{ message.content }}"
"{% endif %}"
"{% if message.content is iterable %}"
"{% for content in message.content %}"
"{% if content.type == 'image_url' and content.image_url is string %}"
"{{ content.image_url }}"
"{% endif %}"
"{% if content.type == 'image_url' and content.image_url is mapping %}"
"{{ content.image_url.url }}"
"{% endif %}"
"{% endfor %}"
"{% for content in message.content %}"
"{% if content.type == 'text' %}"
"{{ content.text }}"
"{% endif %}"
"{% endfor %}"
"{% endif %}"
"###\n"
"{% endif %}"
# Assistant message
"{% if message.role == 'assistant' %}"
"<|im_start|>assistant\n"
"{{ message.content }}"
"###\n"
"{% endif %}"
"{% endfor %}"
# Generation prompt
"{% if add_generation_prompt %}"
"<|im_start|>assistant\n"
"{% endif %}"
)
class MoondreamChatHandler(Llava15ChatHandler):
# Chat Format:
# f"<image>\n\n{chat_history}Question: {question}\n\nAnswer:"
CHAT_FORMAT = (
"{% for message in messages %}"
"{% if message.role == 'user' %}"
"{% if message.content is iterable %}"
# <image>
"{% for content in message.content %}"
"{% if content.type == 'image_url' %}"
"{% if content.image_url is string %}"
"{{ content.image_url }}\n\n"
"{% endif %}"
"{% if content.image_url is mapping %}"
"{{ content.image_url.url }}\n\n"
"{% endif %}"
"{% endif %}"
"{% endfor %}"
# Question:
"{% for content in message.content %}"
"{% if content.type == 'text' %}"
"Question: {{ content.text }}\n\n"
"{% endif %}"
"{% endfor %}"
"{% endif %}"
# Question:
"{% if message.content is string %}"
"Question: {{ message.content }}\n\n"
"{% endif %}"
"{% endif %}"
# Answer:
"{% if message.role == 'assistant' %}"
"Answer:{{ message.content }}\n\n"
"{% endif %}"
"{% endfor %}"
# Generation prompt
"{% if add_generation_prompt %}"
"Answer:"
"{% endif %}"
)
class Llava16ChatHandler(Llava15ChatHandler):
DEFAULT_SYSTEM_MESSAGE = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. "
# Example prompt
# "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nWhat is shown in this image? ASSISTANT:"
CHAT_FORMAT = (
"{% for message in messages %}"
"{% if message.role == 'system' %}"
"{{ message.content }}"
"{% endif %}"
"{% if message.role == 'user' %}"
"{% if message.content is iterable %}"
# <image>
"{% for content in message.content %}"
"{% if content.type == 'image_url' %}"
"{% if content.image_url is string %}"
"{{ content.image_url }}\n"
"{% endif %}"
"{% if content.image_url is mapping %}"
"{{ content.image_url.url }}\n"
"{% endif %}"
"{% endif %}"
"{% endfor %}"
# Question:
"{% for content in message.content %}"
"{% if content.type == 'text' %}"
"{{ content.text }}"
"{% endif %}"
"{% endfor %}"
"{% endif %}"
# Question:
"{% if message.content is string %}"
"{{ message.content }}"
"{% endif %}"
"{% endif %}"
# Answer:
"{% if message.role == 'assistant' %}"
"{{ message.content }}"
"{% endif %}"
"{% endfor %}"
# Generation prompt
"{% if add_generation_prompt %}"
"Answer:"
"{% endif %}"
)
class NanoLlavaChatHandler(Llava15ChatHandler):
# Prompt Format
# The model follows the ChatML standard, however without \n at the end of <|im_end|>:
# <|im_start|>system
# Answer the question<|im_end|><|im_start|>user
# <image>
# What is the picture about?<|im_end|><|im_start|>assistant
CHAT_FORMAT = (
"{% for message in messages %}"
# System message
"{% if message.role == 'system' %}"
"<|im_start|>system\n"
"{{ message.content }}"
"<|im_end|>"
"{% endif %}"
# User message
"{% if message.role == 'user' %}"
"<|im_start|>user\n"
"{% if message.content is string %}"
"{{ message.content }}"
"{% endif %}"
"{% if message.content is iterable %}"
"{% for content in message.content %}"
"{% if content.type == 'image_url' and content.image_url is string %}"
"{{ content.image_url }}"
"{% endif %}"
"{% if content.type == 'image_url' and content.image_url is mapping %}"
"{{ content.image_url.url }}"
"{% endif %}"
"{% endfor %}"
"{% for content in message.content %}"
"{% if content.type == 'text' %}"
"{{ content.text }}"
"{% endif %}"
"{% endfor %}"
"{% endif %}"
"<|im_end|>"
"{% endif %}"
# Assistant message
"{% if message.role == 'assistant' %}"
"<|im_start|>assistant\n"
"{{ message.content }}"
"<|im_end|>"
"{% endif %}"
"{% endfor %}"
# Generation prompt
"{% if add_generation_prompt %}"
"<|im_start|>assistant\n"
"{% endif %}"
)
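# All of the templates above accept message.content either as a plain string or
# as a list of typed parts, where image_url may be a bare string or a mapping
# with a "url" key. A sketch of the message shape they consume (values are
# illustrative, not part of this diff):
example_messages = [
    {"role": "system", "content": "Answer the question"},
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "file:///tmp/example.png"}},
            {"type": "text", "text": "What is the picture about?"},
        ],
    },
    {"role": "assistant", "content": "It shows a cat sitting on a sofa."},
]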

View file

@ -242,8 +242,8 @@ LLAMA_FILE_MAGIC_GGSQ = 0x67677371
# define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
# define LLAMA_SESSION_VERSION 5
LLAMA_SESSION_VERSION = 5
# define LLAMA_SESSION_VERSION 6
LLAMA_SESSION_VERSION = 6
# define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
LLAMA_STATE_SEQ_MAGIC = LLAMA_FILE_MAGIC_GGSQ
@ -284,6 +284,27 @@ LLAMA_VOCAB_TYPE_WPM = 3
"""BERT tokenizer based on WordPiece"""
# // pre-tokenization types
# enum llama_vocab_pre_type {
# LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
# LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
# LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
# LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
# LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
# LLAMA_VOCAB_PRE_TYPE_MPT = 5,
# LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
# LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
# };
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3
LLAMA_VOCAB_PRE_TYPE_FALCON = 4
LLAMA_VOCAB_PRE_TYPE_MPT = 5
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7
# // note: these values should be synchronized with ggml_rope
# // TODO: maybe move this enum to ggml.h (ggml_rope_type)
# enum llama_rope_type {
@ -552,19 +573,25 @@ class llama_batch(ctypes.Structure):
# LLAMA_KV_OVERRIDE_TYPE_INT,
# LLAMA_KV_OVERRIDE_TYPE_FLOAT,
# LLAMA_KV_OVERRIDE_TYPE_BOOL,
# LLAMA_KV_OVERRIDE_TYPE_STR,
# };
LLAMA_KV_OVERRIDE_TYPE_INT = 0
LLAMA_KV_OVERRIDE_TYPE_FLOAT = 1
LLAMA_KV_OVERRIDE_TYPE_BOOL = 2
LLAMA_KV_OVERRIDE_TYPE_STR = 3
# struct llama_model_kv_override {
# char key[128];
# enum llama_model_kv_override_type tag;
# char key[128];
# union {
# int64_t int_value;
# double float_value;
# bool bool_value;
# int64_t val_i64;
# double val_f64;
# bool val_bool;
# char val_str[128];
# };
# };
class llama_model_kv_override_value(ctypes.Union):
@ -572,16 +599,28 @@ class llama_model_kv_override_value(ctypes.Union):
("int_value", ctypes.c_int64),
("float_value", ctypes.c_double),
("bool_value", ctypes.c_bool),
("str_value", ctypes.c_char * 128),
]
if TYPE_CHECKING:
int_value: int
float_value: float
bool_value: bool
str_value: bytes
class llama_model_kv_override(ctypes.Structure):
_fields_ = [
("key", ctypes.c_char * 128),
("tag", ctypes.c_int),
("key", ctypes.c_char * 128),
("value", llama_model_kv_override_value),
]
if TYPE_CHECKING:
tag: int
key: bytes
value: Union[int, float, bool, bytes]
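# A minimal sketch (not part of this diff; the metadata key is hypothetical) of
# building an overrides array from the structures above and attaching it to
# llama_model_params at the ctypes level:
import ctypes
from llama_cpp.llama_cpp import (
    LLAMA_KV_OVERRIDE_TYPE_BOOL,
    llama_model_kv_override,
    llama_model_default_params,
)

# One real override plus a zeroed terminator element (key left empty).
overrides = (llama_model_kv_override * 2)()
overrides[0].key = b"tokenizer.ggml.add_bos_token"   # hypothetical metadata key
overrides[0].tag = LLAMA_KV_OVERRIDE_TYPE_BOOL
overrides[0].value.bool_value = False

params = llama_model_default_params()
params.kv_overrides = ctypes.cast(overrides, ctypes.POINTER(llama_model_kv_override))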
# struct llama_model_params {
# int32_t n_gpu_layers; // number of layers to store in VRAM
@ -612,6 +651,7 @@ class llama_model_kv_override(ctypes.Structure):
# bool vocab_only; // only load the vocabulary, no weights
# bool use_mmap; // use mmap if possible
# bool use_mlock; // force system to keep model in RAM
# bool check_tensors; // validate model tensor data
# };
class llama_model_params(ctypes.Structure):
"""Parameters for llama_model
@ -626,7 +666,8 @@ class llama_model_params(ctypes.Structure):
kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
vocab_only (bool): only load the vocabulary, no weights
use_mmap (bool): use mmap if possible
use_mlock (bool): force system to keep model in RAM"""
use_mlock (bool): force system to keep model in RAM
check_tensors (bool): validate model tensor data"""
if TYPE_CHECKING:
n_gpu_layers: int
@ -639,6 +680,7 @@ class llama_model_params(ctypes.Structure):
vocab_only: bool
use_mmap: bool
use_mlock: bool
check_tensors: bool
_fields_ = [
("n_gpu_layers", ctypes.c_int32),
@ -651,6 +693,7 @@ class llama_model_params(ctypes.Structure):
("vocab_only", ctypes.c_bool),
("use_mmap", ctypes.c_bool),
("use_mlock", ctypes.c_bool),
("check_tensors", ctypes.c_bool),
]
@ -687,6 +730,7 @@ class llama_model_params(ctypes.Structure):
# bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
# bool embeddings; // if true, extract embeddings (together with logits)
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
# bool flash_attn; // whether to use flash attention
# // Abort callback
@ -723,6 +767,7 @@ class llama_context_params(ctypes.Structure):
logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
embeddings (bool): if true, extract embeddings (together with logits)
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
flash_attn (bool): whether to use flash attention
abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
"""
@ -752,6 +797,7 @@ class llama_context_params(ctypes.Structure):
logits_all: bool
embeddings: bool
offload_kqv: bool
flash_attn: bool
abort_callback: Callable[[ctypes.c_void_p], bool]
abort_callback_data: ctypes.c_void_p
@ -780,6 +826,7 @@ class llama_context_params(ctypes.Structure):
("logits_all", ctypes.c_bool),
("embeddings", ctypes.c_bool),
("offload_kqv", ctypes.c_bool),
("flash_attn", ctypes.c_bool),
("abort_callback", ggml_abort_callback),
("abort_callback_data", ctypes.c_void_p),
]
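# A minimal sketch (not part of this diff; the model path is a placeholder) of
# setting the new boolean fields on the default param structs before loading a
# model and creating a context through the low-level bindings:
import llama_cpp.llama_cpp as llama_cpp

llama_cpp.llama_backend_init()

model_params = llama_cpp.llama_model_default_params()
model_params.check_tensors = True            # validate tensor data while loading

ctx_params = llama_cpp.llama_context_default_params()
ctx_params.flash_attn = True                 # new flag added above
ctx_params.offload_kqv = True

model = llama_cpp.llama_load_model_from_file(b"/path/to/model.gguf", model_params)
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)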
@ -811,6 +858,7 @@ It might not exist for progress report where '.' is output repeatedly."""
# bool quantize_output_tensor; // quantize output.weight
# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
# bool pure; // quantize all tensors to the default type
# bool keep_split; // quantize to the same number of shards
# void * imatrix; // pointer to importance matrix data
# void * kv_overrides; // pointer to vector containing overrides
# } llama_model_quantize_params;
@ -826,6 +874,7 @@ class llama_model_quantize_params(ctypes.Structure):
quantize_output_tensor (bool): quantize output.weight
only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
pure (bool): quantize all tensors to the default type
keep_split (bool): quantize to the same number of shards
imatrix (ctypes.c_void_p): pointer to importance matrix data
kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
"""
@ -839,6 +888,7 @@ class llama_model_quantize_params(ctypes.Structure):
quantize_output_tensor: bool
only_copy: bool
pure: bool
keep_split: bool
imatrix: ctypes.c_void_p
kv_overrides: ctypes.c_void_p
@ -851,6 +901,7 @@ class llama_model_quantize_params(ctypes.Structure):
("quantize_output_tensor", ctypes.c_bool),
("only_copy", ctypes.c_bool),
("pure", ctypes.c_bool),
("keep_split", ctypes.c_bool),
("imatrix", ctypes.c_void_p),
("kv_overrides", ctypes.c_void_p),
]
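# A minimal sketch (not part of this diff; file paths are placeholders) of using
# the quantize params, including the new keep_split flag, with the existing
# llama_model_quantize binding:
import ctypes
import llama_cpp.llama_cpp as llama_cpp

qparams = llama_cpp.llama_model_quantize_default_params()
qparams.nthread = 8
qparams.keep_split = True                    # new flag added above
llama_cpp.llama_model_quantize(
    b"/path/to/input.gguf", b"/path/to/output.gguf", ctypes.byref(qparams)
)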
@ -1037,8 +1088,7 @@ GGML_NUMA_STRATEGY_COUNT = 5
[ctypes.c_int],
None,
)
def llama_numa_init(numa: int, /):
...
def llama_numa_init(numa: int, /): ...
# // Call once at the end of the program - currently only used for MPI
@ -1063,8 +1113,7 @@ def llama_backend_free():
)
def llama_load_model_from_file(
path_model: bytes, params: llama_model_params, /
) -> Optional[llama_model_p]:
...
) -> Optional[llama_model_p]: ...
# LLAMA_API void llama_free_model(struct llama_model * model);
@ -1073,8 +1122,7 @@ def llama_load_model_from_file(
[llama_model_p_ctypes],
None,
)
def llama_free_model(model: llama_model_p, /):
...
def llama_free_model(model: llama_model_p, /): ...
# LLAMA_API struct llama_context * llama_new_context_with_model(
@ -1087,8 +1135,7 @@ def llama_free_model(model: llama_model_p, /):
)
def llama_new_context_with_model(
model: llama_model_p, params: llama_context_params, /
) -> Optional[llama_context_p]:
...
) -> Optional[llama_context_p]: ...
# // Frees all allocated memory
@ -1109,98 +1156,87 @@ def llama_free(ctx: llama_context_p, /):
[],
ctypes.c_int64,
)
def llama_time_us() -> int:
...
def llama_time_us() -> int: ...
# LLAMA_API size_t llama_max_devices(void);
@ctypes_function("llama_max_devices", [], ctypes.c_size_t)
def llama_max_devices() -> int:
...
def llama_max_devices() -> int: ...
# LLAMA_API bool llama_supports_mmap (void);
@ctypes_function("llama_supports_mmap", [], ctypes.c_bool)
def llama_supports_mmap() -> bool:
...
def llama_supports_mmap() -> bool: ...
# LLAMA_API bool llama_supports_mlock (void);
@ctypes_function("llama_supports_mlock", [], ctypes.c_bool)
def llama_supports_mlock() -> bool:
...
def llama_supports_mlock() -> bool: ...
# LLAMA_API bool llama_supports_gpu_offload(void);
@ctypes_function("llama_supports_gpu_offload", [], ctypes.c_bool)
def llama_supports_gpu_offload() -> bool:
...
def llama_supports_gpu_offload() -> bool: ...
# LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
@ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes)
def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]:
...
def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ...
# LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
@ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32)
def llama_n_ctx(ctx: llama_context_p, /) -> int:
...
def llama_n_ctx(ctx: llama_context_p, /) -> int: ...
# LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
@ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32)
def llama_n_batch(ctx: llama_context_p, /) -> int:
...
def llama_n_batch(ctx: llama_context_p, /) -> int: ...
# LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
@ctypes_function("llama_n_ubatch", [llama_context_p_ctypes], ctypes.c_uint32)
def llama_n_ubatch(ctx: llama_context_p, /) -> int:
...
def llama_n_ubatch(ctx: llama_context_p, /) -> int: ...
# LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
@ctypes_function("llama_n_seq_max", [llama_context_p_ctypes], ctypes.c_uint32)
def llama_n_seq_max(ctx: llama_context_p, /) -> int:
...
def llama_n_seq_max(ctx: llama_context_p, /) -> int: ...
# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
@ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int)
def llama_pooling_type(ctx: llama_context_p, /) -> int: ...
# LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
@ctypes_function("llama_vocab_type", [llama_model_p_ctypes], ctypes.c_int)
def llama_vocab_type(model: llama_model_p, /) -> int:
...
def llama_vocab_type(model: llama_model_p, /) -> int: ...
# LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
@ctypes_function("llama_rope_type", [llama_model_p_ctypes], ctypes.c_int)
def llama_rope_type(model: llama_model_p, /) -> int:
...
def llama_rope_type(model: llama_model_p, /) -> int: ...
# LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
@ctypes_function("llama_n_vocab", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_vocab(model: llama_model_p, /) -> int:
...
def llama_n_vocab(model: llama_model_p, /) -> int: ...
# LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
@ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_ctx_train(model: llama_model_p, /) -> int:
...
def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
# LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
@ctypes_function("llama_n_embd", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_embd(model: llama_model_p, /) -> int:
...
def llama_n_embd(model: llama_model_p, /) -> int: ...
# LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
@ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_layer(model: llama_model_p, /) -> int:
...
def llama_n_layer(model: llama_model_p, /) -> int: ...
# // Get the model's RoPE frequency scaling factor
@ -1583,7 +1619,7 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:
...
# // Clear the KV cache
# // Clear the KV cache - both cell info is erased and KV data is zeroed
# LLAMA_API void llama_kv_cache_clear(
# struct llama_context * ctx);
@ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None)
@ -1902,8 +1938,7 @@ def llama_state_load_file(
n_token_capacity: Union[ctypes.c_size_t, int],
n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
/,
) -> bool:
...
) -> bool: ...
# LLAMA_API DEPRECATED(bool llama_load_session_file(
@ -1931,8 +1966,7 @@ def llama_load_session_file(
n_token_capacity: Union[ctypes.c_size_t, int],
n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
/,
) -> int:
...
) -> int: ...
# LLAMA_API bool llama_state_save_file(
@ -1956,8 +1990,7 @@ def llama_state_save_file(
tokens: CtypesArray[llama_token],
n_token_count: Union[ctypes.c_size_t, int],
/,
) -> bool:
...
) -> bool: ...
# LLAMA_API DEPRECATED(bool llama_save_session_file(
@ -1982,8 +2015,7 @@ def llama_save_session_file(
tokens: CtypesArray[llama_token],
n_token_count: Union[ctypes.c_size_t, int],
/,
) -> int:
...
) -> int: ...
# // Get the exact size needed to copy the KV cache of a single sequence
@ -2061,8 +2093,7 @@ def llama_state_seq_save_file(
tokens: CtypesArray[llama_token],
n_token_count: Union[ctypes.c_size_t, int],
/,
) -> int:
...
) -> int: ...
# LLAMA_API size_t llama_state_seq_load_file(
@ -2092,8 +2123,7 @@ def llama_state_seq_load_file(
n_token_capacity: Union[ctypes.c_size_t, int],
n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
/,
) -> int:
...
) -> int: ...
# //
@ -2356,8 +2386,7 @@ def llama_get_embeddings_seq(
)
def llama_token_get_text(
model: llama_model_p, token: Union[llama_token, int], /
) -> bytes:
...
) -> bytes: ...
# LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
@ -2366,8 +2395,7 @@ def llama_token_get_text(
)
def llama_token_get_score(
model: llama_model_p, token: Union[llama_token, int], /
) -> float:
...
) -> float: ...
# LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
@ -2376,8 +2404,7 @@ def llama_token_get_score(
)
def llama_token_get_type(
model: llama_model_p, token: Union[llama_token, int], /
) -> int:
...
) -> int: ...
# // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
@ -2385,9 +2412,7 @@ def llama_token_get_type(
@ctypes_function(
"llama_token_is_eog", [llama_model_p_ctypes, llama_token], ctypes.c_bool
)
def llama_token_is_eog(
model: llama_model_p, token: Union[llama_token, int], /
) -> bool:
def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /) -> bool:
"""Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)"""
...
@ -2456,20 +2481,17 @@ def llama_token_prefix(model: llama_model_p) -> int:
# LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
@ctypes_function("llama_token_middle", [llama_model_p_ctypes], llama_token)
def llama_token_middle(model: llama_model_p, /) -> int:
...
def llama_token_middle(model: llama_model_p, /) -> int: ...
# LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
@ctypes_function("llama_token_suffix", [llama_model_p_ctypes], llama_token)
def llama_token_suffix(model: llama_model_p, /) -> int:
...
def llama_token_suffix(model: llama_model_p, /) -> int: ...
# LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token)
def llama_token_eot(model: llama_model_p, /) -> int:
...
def llama_token_eot(model: llama_model_p, /) -> int: ...
# //
@ -2610,8 +2632,7 @@ def llama_chat_apply_template(
chat: CtypesArray[llama_chat_message],
n_msg: int,
/,
) -> int:
...
) -> int: ...
# //
@ -3091,7 +3112,7 @@ def llama_sample_token_greedy(
...
# /// @details Randomly selects a token from the candidates based on their probabilities.
# /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
# LLAMA_API llama_token llama_sample_token(
# struct llama_context * ctx,
# llama_token_data_array * candidates);
@ -3224,8 +3245,7 @@ def llama_beam_search(
n_past: Union[ctypes.c_int, int],
n_predict: Union[ctypes.c_int, int],
/,
):
...
): ...
# /// @details Build a split GGUF final path for this chunk.
@ -3344,5 +3364,4 @@ def llama_log_set(
[ctypes.c_void_p, llama_context_p_ctypes],
None,
)
def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /):
...
def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /): ...

View file

@ -556,17 +556,11 @@ def add_rule(
# }
def decode_utf8(src: const_char_p) -> Tuple[int, const_char_p]:
"""Decodes a UTF-8 character from the source string."""
lookup = (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4)
first_byte = ord(src[0]) # type: int
highbits = first_byte >> 4 # type: int
len = lookup[highbits] # type: int
mask = (1 << (8 - len)) - 1 # type: int
value = first_byte & mask # type: int
end = src + len # type: const_char_p # may overrun!
pos = src + 1 # type: const_char_p
while pos < end and pos[0]:
value = (value << 6) + (ord(pos[0]) & 0x3F)
pos += 1
# Get the codepoint of the first character
value = ord(src[0])
# Move the pointer ahead one character
pos = src + 1
return value, pos

View file

@ -24,7 +24,7 @@ class EmbeddingUsage(TypedDict):
class Embedding(TypedDict):
index: int
object: str
embedding: List[float]
embedding: Union[List[float], List[List[float]]]
class CreateEmbeddingResponse(TypedDict):
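# A small sketch (not part of this diff) of consuming the widened field; the
# nested-list case presumably corresponds to per-token embeddings when pooling
# is disabled:
from llama_cpp.llama_types import Embedding

def embedding_vectors(item: Embedding) -> list:
    """Return a list of vectors whether the item holds one pooled vector or
    one vector per token."""
    emb = item["embedding"]
    if emb and isinstance(emb[0], list):  # List[List[float]]: one vector per token
        return emb
    return [emb]                          # List[float]: single pooled vector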

View file

@ -1,3 +1,5 @@
from __future__ import annotations
import sys
import os
import ctypes
@ -14,10 +16,22 @@ from ctypes import (
Structure,
)
import pathlib
from typing import List, Union, NewType, Optional, TypeVar, Callable, Any
from typing import (
List,
Union,
NewType,
Optional,
TypeVar,
Callable,
Any,
TYPE_CHECKING,
Generic,
)
from typing_extensions import TypeAlias
import llama_cpp.llama_cpp as llama_cpp
# Load the library
def _load_shared_library(lib_base_name: str):
# Construct the paths to the possible shared library names
@ -79,8 +93,27 @@ _libllava = _load_shared_library(_libllava_base_name)
# ctypes helper
if TYPE_CHECKING:
CtypesCData = TypeVar("CtypesCData", bound=ctypes._CData) # type: ignore
CtypesArray: TypeAlias = ctypes.Array[CtypesCData] # type: ignore
CtypesPointer: TypeAlias = ctypes._Pointer[CtypesCData] # type: ignore
CtypesVoidPointer: TypeAlias = ctypes.c_void_p
class CtypesRef(Generic[CtypesCData]):
pass
CtypesPointerOrRef: TypeAlias = Union[
CtypesPointer[CtypesCData], CtypesRef[CtypesCData]
]
CtypesFuncPointer: TypeAlias = ctypes._FuncPointer # type: ignore
F = TypeVar("F", bound=Callable[..., Any])
def ctypes_function_for_shared_library(lib: ctypes.CDLL):
def ctypes_function(
name: str, argtypes: List[Any], restype: Any, enabled: bool = True
@ -111,6 +144,7 @@ ctypes_function = ctypes_function_for_shared_library(_libllava)
clip_ctx_p = NewType("clip_ctx_p", int)
clip_ctx_p_ctypes = c_void_p
# struct llava_image_embed {
# float * embed;
# int n_image_pos;
@ -121,36 +155,72 @@ class llava_image_embed(Structure):
("n_image_pos", c_int),
]
# /** sanity check for clip <-> llava embed size match */
# LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
@ctypes_function("llava_validate_embed_size", [llama_cpp.llama_context_p_ctypes, clip_ctx_p_ctypes], c_bool)
def llava_validate_embed_size(ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, /) -> bool:
...
@ctypes_function(
"llava_validate_embed_size",
[llama_cpp.llama_context_p_ctypes, clip_ctx_p_ctypes],
c_bool,
)
def llava_validate_embed_size(
ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, /
) -> bool: ...
# /** build an image embed from image file bytes */
# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
@ctypes_function("llava_image_embed_make_with_bytes", [clip_ctx_p_ctypes, c_int, POINTER(c_uint8), c_int], POINTER(llava_image_embed))
def llava_image_embed_make_with_bytes(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_bytes: bytes, image_bytes_length: Union[c_int, int], /) -> "_Pointer[llava_image_embed]":
...
@ctypes_function(
"llava_image_embed_make_with_bytes",
[clip_ctx_p_ctypes, c_int, POINTER(c_uint8), c_int],
POINTER(llava_image_embed),
)
def llava_image_embed_make_with_bytes(
ctx_clip: clip_ctx_p,
n_threads: Union[c_int, int],
image_bytes: CtypesArray[c_uint8],
image_bytes_length: Union[c_int, int],
/,
) -> "_Pointer[llava_image_embed]": ...
# /** build an image embed from a path to an image filename */
# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
@ctypes_function("llava_image_embed_make_with_filename", [clip_ctx_p_ctypes, c_int, c_char_p], POINTER(llava_image_embed))
def llava_image_embed_make_with_filename(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, /) -> "_Pointer[llava_image_embed]":
...
@ctypes_function(
"llava_image_embed_make_with_filename",
[clip_ctx_p_ctypes, c_int, c_char_p],
POINTER(llava_image_embed),
)
def llava_image_embed_make_with_filename(
ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, /
) -> "_Pointer[llava_image_embed]": ...
# LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
# /** free an embedding made with llava_image_embed_make_* */
@ctypes_function("llava_image_embed_free", [POINTER(llava_image_embed)], None)
def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /):
...
def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /): ...
# /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
# LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
@ctypes_function("llava_eval_image_embed", [llama_cpp.llama_context_p_ctypes, POINTER(llava_image_embed), c_int, POINTER(c_int)], c_bool)
def llava_eval_image_embed(ctx_llama: llama_cpp.llama_context_p, embed: "_Pointer[llava_image_embed]", n_batch: Union[c_int, int], n_past: "_Pointer[c_int]", /) -> bool:
...
@ctypes_function(
"llava_eval_image_embed",
[
llama_cpp.llama_context_p_ctypes,
POINTER(llava_image_embed),
c_int,
POINTER(c_int),
],
c_bool,
)
def llava_eval_image_embed(
ctx_llama: llama_cpp.llama_context_p,
embed: "_Pointer[llava_image_embed]",
n_batch: Union[c_int, int],
n_past: "_Pointer[c_int]",
/,
) -> bool: ...
################################################
@ -161,11 +231,12 @@ def llava_eval_image_embed(ctx_llama: llama_cpp.llama_context_p, embed: "_Pointe
# /** load mmproj model */
# CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
@ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes)
def clip_model_load(fname: bytes, verbosity: Union[c_int, int], /) -> Optional[clip_ctx_p]:
...
def clip_model_load(
fname: bytes, verbosity: Union[c_int, int], /
) -> Optional[clip_ctx_p]: ...
# /** free mmproj model */
# CLIP_API void clip_free(struct clip_ctx * ctx);
@ctypes_function("clip_free", [clip_ctx_p_ctypes], None)
def clip_free(ctx: clip_ctx_p, /):
...
def clip_free(ctx: clip_ctx_p, /): ...
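# A rough end-to-end sketch (not part of this diff) of the low-level flow these
# declarations expose; paths are placeholders, ctx_llama is assumed to be a
# llama context created elsewhere, and error handling is omitted:
import ctypes
from llama_cpp.llava_cpp import (
    clip_model_load,
    clip_free,
    llava_image_embed_make_with_filename,
    llava_eval_image_embed,
    llava_image_embed_free,
)

ctx_clip = clip_model_load(b"/path/to/mmproj.gguf", 1)    # verbosity=1
embed = llava_image_embed_make_with_filename(ctx_clip, 4, b"/path/to/image.png")

n_past = ctypes.c_int(0)
# ctx_llama: an existing llama_cpp.llama_context_p created elsewhere
llava_eval_image_embed(ctx_llama, embed, 512, ctypes.byref(n_past))

llava_image_embed_free(embed)
clip_free(ctx_clip)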

View file

@ -72,9 +72,74 @@ class LlamaProxy:
chat_handler = None
if settings.chat_format == "llava-1-5":
assert settings.clip_model_path is not None, "clip model not found"
if settings.hf_model_repo_id is not None:
chat_handler = (
llama_cpp.llama_chat_format.Llava15ChatHandler.from_pretrained(
repo_id=settings.hf_model_repo_id,
filename=settings.clip_model_path,
verbose=settings.verbose,
)
)
else:
chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
elif settings.chat_format == "obsidian":
assert settings.clip_model_path is not None, "clip model not found"
if settings.hf_model_repo_id is not None:
chat_handler = (
llama_cpp.llama_chat_format.ObsidianChatHandler.from_pretrained(
repo_id=settings.hf_model_repo_id,
filename=settings.clip_model_path,
verbose=settings.verbose,
)
)
else:
chat_handler = llama_cpp.llama_chat_format.ObsidianChatHandler(
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
elif settings.chat_format == "llava-1-6":
assert settings.clip_model_path is not None, "clip model not found"
if settings.hf_model_repo_id is not None:
chat_handler = (
llama_cpp.llama_chat_format.Llava16ChatHandler.from_pretrained(
repo_id=settings.hf_model_repo_id,
filename=settings.clip_model_path,
verbose=settings.verbose,
)
)
else:
chat_handler = llama_cpp.llama_chat_format.Llava16ChatHandler(
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
elif settings.chat_format == "moondream":
assert settings.clip_model_path is not None, "clip model not found"
if settings.hf_model_repo_id is not None:
chat_handler = (
llama_cpp.llama_chat_format.MoondreamChatHandler.from_pretrained(
repo_id=settings.hf_model_repo_id,
filename=settings.clip_model_path,
verbose=settings.verbose,
)
)
else:
chat_handler = llama_cpp.llama_chat_format.MoondreamChatHandler(
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
elif settings.chat_format == "nanollava":
assert settings.clip_model_path is not None, "clip model not found"
if settings.hf_model_repo_id is not None:
chat_handler = (
llama_cpp.llama_chat_format.NanoLlavaChatHandler.from_pretrained(
repo_id=settings.hf_model_repo_id,
filename=settings.clip_model_path,
verbose=settings.verbose,
)
)
else:
chat_handler = llama_cpp.llama_chat_format.NanoLlavaChatHandler(
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
elif settings.chat_format == "hf-autotokenizer":
assert (
settings.hf_pretrained_model_name_or_path is not None
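# A sketch (not part of this diff) of selecting one of the multimodal formats
# handled above via ModelSettings; paths and the repo id are placeholders. When
# hf_model_repo_id is set, clip_model_path is used as the filename passed to
# from_pretrained, as in the dispatch above.
from llama_cpp.server.settings import ModelSettings

settings = ModelSettings(
    model="models/moondream2-text-model.gguf",   # placeholder path
    chat_format="moondream",
    clip_model_path="moondream2-mmproj.gguf",    # placeholder path / filename
    hf_model_repo_id="vikhyatk/moondream2",      # optional; placeholder repo id
)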

View file

@ -2,8 +2,10 @@ from __future__ import annotations
import multiprocessing
from typing import Optional, List, Literal, Union
from pydantic import Field, root_validator
from typing import Optional, List, Literal, Union, Dict, cast
from typing_extensions import Self
from pydantic import Field, model_validator
from pydantic_settings import BaseSettings
import llama_cpp
@ -94,6 +96,9 @@ class ModelSettings(BaseSettings):
offload_kqv: bool = Field(
default=True, description="Whether to offload kqv to the GPU."
)
flash_attn: bool = Field(
default=False, description="Whether to use flash attention."
)
# Sampling Params
last_n_tokens_size: int = Field(
default=64,
@ -173,15 +178,16 @@ class ModelSettings(BaseSettings):
default=True, description="Whether to print debug information."
)
@root_validator(pre=True) # pre=True to ensure this runs before any other validation
def set_dynamic_defaults(cls, values):
@model_validator(mode="before")  # mode="before" ensures this runs before any other validation
def set_dynamic_defaults(self) -> Self:
# If n_threads or n_threads_batch is -1, set it to multiprocessing.cpu_count()
cpu_count = multiprocessing.cpu_count()
values = cast(Dict[str, int], self)
if values.get('n_threads', 0) == -1:
values['n_threads'] = cpu_count
if values.get('n_threads_batch', 0) == -1:
values['n_threads_batch'] = cpu_count
return values
return self
class ServerSettings(BaseSettings):

View file

@ -6,7 +6,7 @@ from scipy.special import log_softmax
import llama_cpp
MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama.gguf"
MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama-spm.gguf"
def test_llama_cpp_tokenization():

vendor/llama.cpp (vendored, 2 changes)

@ -1 +1 @@
Subproject commit 4e96a812b3ce7322a29a3008db2ed73d9087b176
Subproject commit f364eb6fb5d46118a76fa045f487318de4c24961