This commit is contained in:
baalajimaestro 2024-05-02 18:13:32 +05:30
commit 1d177aaaef
Signed by: baalajimaestro
GPG key ID: F93C394FE9BBAFD5
27 changed files with 1157 additions and 427 deletions

View file

@ -9,3 +9,7 @@ updates:
directory: "/" # Location of package manifests
schedule:
interval: "weekly"
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"

View file

@ -14,12 +14,12 @@ jobs:
os: [ubuntu-20.04, windows-2019, macos-11]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
submodules: "recursive"
# Used to host cibuildwheel
- uses: actions/setup-python@v3
- uses: actions/setup-python@v5
with:
python-version: "3.8"
@ -29,7 +29,7 @@ jobs:
python -m pip install -e .[all]
- name: Build wheels
uses: pypa/cibuildwheel@v2.16.5
uses: pypa/cibuildwheel@v2.17.0
env:
# disable repair
CIBW_REPAIR_WHEEL_COMMAND: ""
@ -37,11 +37,12 @@ jobs:
package-dir: .
output-dir: wheelhouse
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: wheels-${{ matrix.os }}
path: ./wheelhouse/*.whl
build_arm64_wheels:
build_wheels_arm64:
name: Build arm64 wheels
runs-on: ubuntu-latest
steps:
@ -55,30 +56,30 @@ jobs:
platforms: linux/arm64
- name: Build wheels
uses: pypa/cibuildwheel@v2.16.5
uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_SKIP: "*musllinux* pp*"
CIBW_REPAIR_WHEEL_COMMAND: ""
CIBW_ARCHS: "aarch64"
CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*"
with:
output-dir: wheelhouse/
output-dir: wheelhouse
- name: Upload wheels as artifacts
uses: actions/upload-artifact@v4
with:
name: wheels-${{ matrix.version }}
path: wheelhouse/*.whl
name: wheels_arm64
path: ./wheelhouse/*.whl
build_sdist:
name: Build source distribution
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
submodules: "recursive"
- uses: actions/setup-python@v3
- uses: actions/setup-python@v5
with:
python-version: "3.8"
- name: Install dependencies
@ -88,21 +89,23 @@ jobs:
- name: Build source distribution
run: |
python -m build --sdist
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: sdist
path: ./dist/*.tar.gz
release:
name: Release
needs: [build_wheels, build_arm64_wheels, build_sdist]
needs: [build_wheels, build_wheels_arm64, build_sdist]
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v3
- uses: actions/download-artifact@v4
with:
name: artifact
merge-multiple: true
path: dist
- uses: softprops/action-gh-release@v1
- uses: softprops/action-gh-release@v2
with:
files: dist/*
env:

View file

@ -12,18 +12,18 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
uses: docker/setup-buildx-action@v3
- name: Login to GitHub Container Registry
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
@ -31,7 +31,7 @@ jobs:
- name: Build and push
id: docker_build
uses: docker/build-push-action@v4
uses: docker/build-push-action@v5
with:
context: .
file: "docker/simple/Dockerfile"

View file

@ -22,7 +22,7 @@ jobs:
$matrix = @{
'os' = @('ubuntu-20.04', 'windows-latest')
'pyver' = @("3.10", "3.11", "3.12")
'cuda' = @("12.1.1", "12.2.2", "12.3.2")
'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1")
'releasetag' = @("basic")
}
@ -47,12 +47,12 @@ jobs:
with:
submodules: "recursive"
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.pyver }}
- name: Setup Mamba
uses: conda-incubator/setup-miniconda@v2.2.0
uses: conda-incubator/setup-miniconda@v3.0.4
with:
activate-environment: "build"
python-version: ${{ matrix.pyver }}
@ -65,7 +65,7 @@ jobs:
- name: VS Integration Cache
id: vs-integration-cache
if: runner.os == 'Windows'
uses: actions/cache@v3.3.2
uses: actions/cache@v4.0.2
with:
path: ./MSBuildExtensions
key: cuda-${{ matrix.cuda }}-vs-integration
@ -74,7 +74,7 @@ jobs:
if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true'
run: |
if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER}
$links = (Invoke-RestMethod 'https://github.com/Jimver/cuda-toolkit/raw/dc0ca7bb29c5a92f7a963d3d5c93f8d59765136a/src/links/windows-links.ts').Trim().split().where({$_ -ne ''})
$links = (Invoke-RestMethod 'https://raw.githubusercontent.com/Jimver/cuda-toolkit/master/src/links/windows-links.ts').Trim().split().where({$_ -ne ''})
for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}}
Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip'
& 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null
@ -122,7 +122,7 @@ jobs:
# write the build tag to the output
Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV
- uses: softprops/action-gh-release@v1
- uses: softprops/action-gh-release@v2
with:
files: dist/*
# Set tag_name to <tag>-cu<cuda_version>

View file

@ -41,7 +41,7 @@ jobs:
with:
submodules: "recursive"
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.pyver }}
@ -78,7 +78,7 @@ jobs:
VERBOSE=1 python -m build --wheel
fi
- uses: softprops/action-gh-release@v1
- uses: softprops/action-gh-release@v2
with:
files: dist/*
# set release name to <tag>-metal

View file

@ -31,12 +31,14 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Pages
uses: actions/configure-pages@v4
uses: actions/configure-pages@v5
- name: Build
run: |
./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$'
./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$'
./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$'
./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$'
./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$'
- name: Upload artifact
uses: actions/upload-pages-artifact@v3

View file

@ -16,11 +16,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.8"
- name: Append Dev Version to __version__

View file

@ -10,11 +10,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.8"
- name: Install dependencies

View file

@ -8,11 +8,11 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@ -28,11 +28,11 @@ jobs:
runs-on: windows-latest
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@ -48,11 +48,11 @@ jobs:
runs-on: macos-latest
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies

View file

@ -21,7 +21,7 @@ jobs:
with:
submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@ -40,11 +40,11 @@ jobs:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@ -57,17 +57,17 @@ jobs:
build-macos:
runs-on: macos-latest
runs-on: macos-13
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@ -83,11 +83,11 @@ jobs:
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v3
# - uses: actions/checkout@v4
# with:
# submodules: "recursive"
# - name: Set up Python 3.8
# uses: actions/setup-python@v4
# uses: actions/setup-python@v5
# with:
# python-version: "3.8"
# - name: Set up OpenCL & CLBlast
@ -107,14 +107,14 @@ jobs:
build-macos-metal:
runs-on: macos-latest
runs-on: macos-13
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up Python 3.8
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.8"
- name: Install dependencies

View file

@ -7,6 +7,33 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
## [0.2.68]
- feat: Update llama.cpp to ggerganov/llama.cpp@
- feat: Add option to enable flash_attn to Llama params and ModelSettings by @abetlen in 22d77eefd2edaf0148f53374d0cac74d0e25d06e
- fix(ci): Fix build-and-release.yaml by @Smartappli in #1413
## [0.2.67]
- fix: Ensure image renders before text in chat formats regardless of message content order by @abetlen in 3489ef09d3775f4a87fb7114f619e8ba9cb6b656
- fix(ci): Fix bug in use of upload-artifact failing to merge multiple artifacts into a single release by @abetlen in d03f15bb73a1d520970357b702a9e7d4cc2a7a62
## [0.2.66]
- feat: Update llama.cpp to ggerganov/llama.cpp@8843a98c2ba97a25e93319a104f9ddfaf83ce4c4
- feat: Generic Chat Formats, Tool Calling, and Huggingface Pull Support for Multimodal Models (Obsidian, LLaVA1.6, Moondream) by @abetlen in #1147
- ci(fix): Workflow actions updates and fix arm64 wheels not included in release by @Smartappli in #1392
- ci: Add support for pre-built cuda 12.4.1 wheels by @Smartappli in #1388
- feat: Add support for str type kv_overrides by @abetlen in a411612b385cef100d76145da1fbd02a7b7cc894
- fix: Functionary bug fixes by @jeffrey-fong in #1385
- examples: fix quantize example by @iyubondyrev in #1387
- ci: Update dependabot.yml by @Smartappli in #1391
## [0.2.65]
- feat: Update llama.cpp to ggerganov/llama.cpp@46e12c4692a37bdd31a0432fc5153d7d22bc7f72
- feat: Allow for possibly non-pooled embeddings by @iamlemec in #1380
## [0.2.64]
- feat: Update llama.cpp to ggerganov/llama.cpp@4e96a812b3ce7322a29a3008db2ed73d9087b176

README.md (132 changed lines)
View file

@ -121,7 +121,7 @@ CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python
It is also possible to install a pre-built wheel with CUDA support, as long as your system meets the following requirements:
- CUDA Version is 12.1, 12.2 or 12.3
- CUDA Version is 12.1, 12.2, 12.3, or 12.4
- Python Version is 3.10, 3.11 or 3.12
```bash
@ -133,6 +133,7 @@ Where `<cuda-version>` is one of the following:
- `cu121`: CUDA 12.1
- `cu122`: CUDA 12.2
- `cu123`: CUDA 12.3
- `cu124`: CUDA 12.4
For example, to install the CUDA 12.1 wheel:
@ -276,20 +277,26 @@ The high-level API provides a simple managed interface through the [`Llama`](htt
Below is a short example demonstrating how to use the high-level API for basic text completion:
```python
>>> from llama_cpp import Llama
>>> llm = Llama(
from llama_cpp import Llama
llm = Llama(
model_path="./models/7B/llama-model.gguf",
# n_gpu_layers=-1, # Uncomment to use GPU acceleration
# seed=1337, # Uncomment to set a specific seed
# n_ctx=2048, # Uncomment to increase the context window
)
>>> output = llm(
output = llm(
"Q: Name the planets in the solar system? A: ", # Prompt
max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
echo=True # Echo the prompt back in the output
) # Generate a completion, can also call create_completion
>>> print(output)
print(output)
```
By default `llama-cpp-python` generates completions in an OpenAI compatible format:
```python
{
"id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
"object": "text_completion",
@ -344,12 +351,12 @@ The model will format the messages into a single prompt using the following
Set `verbose=True` to see the selected chat format.
```python
>>> from llama_cpp import Llama
>>> llm = Llama(
from llama_cpp import Llama
llm = Llama(
model_path="path/to/llama-2/llama-model.gguf",
chat_format="llama-2"
)
>>> llm.create_chat_completion(
llm.create_chat_completion(
messages = [
{"role": "system", "content": "You are an assistant who perfectly describes images."},
{
@ -374,9 +381,9 @@ To constrain chat responses to only valid JSON or a specific JSON Schema use the
The following example will constrain the response to valid JSON strings only.
```python
>>> from llama_cpp import Llama
>>> llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
>>> llm.create_chat_completion(
from llama_cpp import Llama
llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
llm.create_chat_completion(
messages=[
{
"role": "system",
@ -396,9 +403,9 @@ The following example will constrain the response to valid JSON strings only.
To constrain the response further to a specific JSON Schema add the schema to the `schema` property of the `response_format` argument.
```python
>>> from llama_cpp import Llama
>>> llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
>>> llm.create_chat_completion(
from llama_cpp import Llama
llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
llm.create_chat_completion(
messages=[
{
"role": "system",
@ -423,9 +430,9 @@ To constrain the response further to a specific JSON Schema add the schema to th
The high-level API supports OpenAI compatible function and tool calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format.
```python
>>> from llama_cpp import Llama
>>> llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling")
>>> llm.create_chat_completion(
from llama_cpp import Llama
llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling")
llm.create_chat_completion(
messages = [
{
"role": "system",
@ -475,54 +482,91 @@ The various gguf-converted files for this set of models can be found [here](http
Due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide an HF tokenizer for functionary. The `LlamaHFTokenizer` class can be initialized and passed into the Llama class. This will override the default llama.cpp tokenizer used in the Llama class. The tokenizer files are already included in the respective HF repositories hosting the gguf files.
```python
>>> from llama_cpp import Llama
>>> from llama_cpp.llama_tokenizer import LlamaHFTokenizer
>>> llm = Llama.from_pretrained(
from llama_cpp import Llama
from llama_cpp.llama_tokenizer import LlamaHFTokenizer
llm = Llama.from_pretrained(
repo_id="meetkai/functionary-small-v2.2-GGUF",
filename="functionary-small-v2.2.q4_0.gguf",
chat_format="functionary-v2",
tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF")
)
```
**NOTE**: There is no need to provide the default system messages used in Functionary as they are added automatically in the Functionary chat handler. Thus, the messages should contain just the chat messages and/or system messages that provide additional context for the model (e.g.: datetime, etc.).
</details>
### Multi-modal Models
`llama-cpp-python` supports the llava1.5 family of multi-modal models which allow the language model to
read information from both text and images.
`llama-cpp-python` supports multi-modal models such as llava1.5, which allow the language model to read information from both text and images.
You'll first need to download one of the available multi-modal models in GGUF format:
- [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
- [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
- [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1)
- [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf)
- [moondream2](https://huggingface.co/vikhyatk/moondream2)
Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
```python
>>> from llama_cpp import Llama
>>> from llama_cpp.llama_chat_format import Llava15ChatHandler
>>> chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
>>> llm = Llama(
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler
chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
llm = Llama(
model_path="./path/to/llava/llama-model.gguf",
chat_handler=chat_handler,
n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
logits_all=True, # needed to make llava work
)
>>> llm.create_chat_completion(
llm.create_chat_completion(
messages = [
{"role": "system", "content": "You are an assistant who perfectly describes images."},
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": "https://.../image.png"}},
{"type" : "text", "text": "Describe this image in detail please."}
{"type" : "text", "text": "What's in this image?"},
{"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" } }
]
}
]
)
```
You can also pull the model from the Hugging Face Hub using the `from_pretrained` method.
```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import MoondreamChatHandler
chat_handler = MoondreamChatHandler.from_pretrained(
repo_id="vikhyatk/moondream2",
filename="*mmproj*",
)
llm = Llama.from_pretrained(
repo_id="vikhyatk/moondream2",
filename="*text-model*",
chat_handler=chat_handler,
n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
)
response = llm.create_chat_completion(
messages = [
{
"role": "user",
"content": [
{"type" : "text", "text": "What's in this image?"},
{"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" } }
]
}
]
)
print(response["choices"][0]["text"])
```
**Note**: Multi-modal models also support tool calling and JSON mode.
<details>
<summary>Loading a Local Image</summary>
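For instance, a local file can be passed as a base64 data URI (a minimal sketch; the helper name, file path, and MIME type are illustrative):

```python
import base64

def image_to_base64_data_uri(file_path: str) -> str:
    # Read the image and encode it as a data URI the chat handler can consume.
    with open(file_path, "rb") as img_file:
        base64_data = base64.b64encode(img_file.read()).decode("utf-8")
    return f"data:image/png;base64,{base64_data}"

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_to_base64_data_uri("./local_image.png")}},
            {"type": "text", "text": "Describe this image in detail please."},
        ],
    }
]
```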
@ -575,7 +619,7 @@ llama = Llama(
### Embeddings
To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding).
To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding) or [`embed`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.embed). Note that you must pass `embedding=True` to the constructor upon model creation for these to work properly.
```python
import llama_cpp
@ -589,6 +633,12 @@ embeddings = llm.create_embedding("Hello, world!")
embeddings = llm.create_embedding(["Hello, world!", "Goodbye, world!"])
```
There are two primary notions of embeddings in a Transformer-style model: *token level* and *sequence level*. Sequence level embeddings are produced by "pooling" token level embeddings together, usually by averaging them or using the first token.
Models that are explicitly geared towards embeddings will usually return sequence level embeddings by default, one for each input string. Non-embedding models such as those designed for text generation will typically return only token level embeddings, one for each token in each sequence. Thus the dimensionality of the return type will be one higher for token level embeddings.
It is possible to control pooling behavior in some cases using the `pooling_type` flag on model creation. You can ensure token level embeddings from any model using `LLAMA_POOLING_TYPE_NONE`. The reverse, getting a generation oriented model to yield sequence level embeddings, is currently not possible, but you can always do the pooling manually.
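As a rough sketch (the model path is a placeholder, and this assumes an embedding-capable GGUF model and a version that exposes the `pooling_type` keyword), token level embeddings can be requested by disabling pooling at load time:

```python
import llama_cpp

llm = llama_cpp.Llama(
    model_path="./models/embedding-model.gguf",  # placeholder path
    embedding=True,
    pooling_type=llama_cpp.LLAMA_POOLING_TYPE_NONE,  # one vector per token
)

token_vectors = llm.embed("Hello, world!")
print(len(token_vectors), len(token_vectors[0]))  # n_tokens x n_embd
```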
### Adjusting the Context Window
The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements.
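For example (model path is a placeholder):

```python
from llama_cpp import Llama

# Load the model with a larger context window.
llm = Llama(model_path="./models/7B/llama-model.gguf", n_ctx=2048)
```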
@ -665,18 +715,18 @@ The entire low-level API can be found in [llama_cpp/llama_cpp.py](https://github
Below is a short example demonstrating how to use the low-level API to tokenize a prompt:
```python
>>> import llama_cpp
>>> import ctypes
>>> llama_cpp.llama_backend_init(False) # Must be called once at the start of each program
>>> params = llama_cpp.llama_context_default_params()
import llama_cpp
import ctypes
llama_cpp.llama_backend_init(False) # Must be called once at the start of each program
params = llama_cpp.llama_context_default_params()
# use bytes for char * params
>>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
>>> ctx = llama_cpp.llama_new_context_with_model(model, params)
>>> max_tokens = params.n_ctx
model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
ctx = llama_cpp.llama_new_context_with_model(model, params)
max_tokens = params.n_ctx
# use ctypes arrays for array params
>>> tokens = (llama_cpp.llama_token * int(max_tokens))()
>>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True))
>>> llama_cpp.llama_free(ctx)
tokens = (llama_cpp.llama_token * int(max_tokens))()
n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True))
llama_cpp.llama_free(ctx)
```
Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.

View file

@ -98,6 +98,8 @@ You'll first need to download one of the available multi-modal models in GGUF fo
- [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
- [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
- [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1)
- [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf)
- [moondream2](https://huggingface.co/vikhyatk/moondream2)
Then, when you run the server, you'll also need to specify the path to the clip model used for image embedding and the `llava-1-5` chat_format.
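A minimal sketch of such an invocation, assuming the `--model`, `--clip_model_path`, and `--chat_format` flags of `llama_cpp.server` (paths are placeholders):

```python
import subprocess
import sys

# Start the OpenAI-compatible server with a llava model and its clip projector.
subprocess.run([
    sys.executable, "-m", "llama_cpp.server",
    "--model", "./models/llava/llama-model.gguf",      # placeholder path
    "--clip_model_path", "./models/llava/mmproj.bin",  # placeholder path
    "--chat_format", "llava-1-5",
])
```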

View file

@ -4,14 +4,16 @@ import llama_cpp
def main(args):
fname_inp = args.fname_inp.encode("utf-8")
fname_out = args.fname_out.encode("utf-8")
if not os.path.exists(fname_inp):
raise RuntimeError(f"Input file does not exist ({fname_inp})")
if os.path.exists(fname_out):
raise RuntimeError(f"Output file already exists ({fname_out})")
fname_inp = args.fname_inp.encode("utf-8")
fname_out = args.fname_out.encode("utf-8")
itype = args.itype
return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, itype)
ftype = args.type
args = llama_cpp.llama_model_quantize_default_params()
args.ftype = ftype
return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, args)
if return_code != 0:
raise RuntimeError("Failed to quantize model")
@ -20,6 +22,7 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("fname_inp", type=str, help="Path to input model")
parser.add_argument("fname_out", type=str, help="Path to output model")
parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1)")
parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1), see llama_cpp.py for enum")
args = parser.parse_args()
main(args)

View file

@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *
__version__ = "0.2.64"
__version__ = "0.2.68"

View file

@ -15,6 +15,7 @@ import numpy.typing as npt
from .llama_types import *
from .llama_grammar import LlamaGrammar
from ._utils import suppress_stdout_stderr
import llama_cpp.llama_cpp as llama_cpp
@ -47,6 +48,7 @@ class _LlamaModel:
if not os.path.exists(path_model):
raise ValueError(f"Model path does not exist: {path_model}")
with suppress_stdout_stderr(disable=verbose):
self.model = llama_cpp.llama_load_model_from_file(
self.path_model.encode("utf-8"), self.params
)
@ -273,6 +275,10 @@ class _LlamaContext:
assert self.ctx is not None
return llama_cpp.llama_n_ctx(self.ctx)
def pooling_type(self) -> int:
assert self.ctx is not None
return llama_cpp.llama_pooling_type(self.ctx)
def kv_cache_clear(self):
assert self.ctx is not None
llama_cpp.llama_kv_cache_clear(self.ctx)
@ -641,6 +647,16 @@ def _should_add_bos(model: _LlamaModel) -> bool:
return llama_cpp.llama_vocab_type(model.model) == llama_cpp.LLAMA_VOCAB_TYPE_SPM
# Embedding functions
def _normalize_embedding(embedding):
norm = float(np.linalg.norm(embedding))
if norm == 0.0:
return embedding
return [v / norm for v in embedding]
# Python wrappers over common/sampling structs

View file

@ -1,13 +1,15 @@
import os
import sys
import sys
from typing import Any, Dict
# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
outnull_file = open(os.devnull, "w")
errnull_file = open(os.devnull, "w")
STDOUT_FILENO = 1
STDERR_FILENO = 2
class suppress_stdout_stderr(object):
# NOTE: these must be "saved" here to avoid exceptions when using
# this context manager inside of a __del__ method
@ -22,12 +24,8 @@ class suppress_stdout_stderr(object):
if self.disable:
return self
# Check if sys.stdout and sys.stderr have fileno method
if not hasattr(self.sys.stdout, 'fileno') or not hasattr(self.sys.stderr, 'fileno'):
return self # Return the instance without making changes
self.old_stdout_fileno_undup = self.sys.stdout.fileno()
self.old_stderr_fileno_undup = self.sys.stderr.fileno()
self.old_stdout_fileno_undup = STDOUT_FILENO
self.old_stderr_fileno_undup = STDERR_FILENO
self.old_stdout_fileno = self.os.dup(self.old_stdout_fileno_undup)
self.old_stderr_fileno = self.os.dup(self.old_stderr_fileno_undup)
@ -47,7 +45,6 @@ class suppress_stdout_stderr(object):
return
# Check if sys.stdout and sys.stderr have fileno method
if hasattr(self.sys.stdout, 'fileno') and hasattr(self.sys.stderr, 'fileno'):
self.sys.stdout = self.old_stdout
self.sys.stderr = self.old_stderr

View file

@ -50,6 +50,7 @@ from ._internals import (
_LlamaTokenDataArray, # type: ignore
_LlamaSamplingParams, # type: ignore
_LlamaSamplingContext, # type: ignore
_normalize_embedding, # type: ignore
)
from ._logger import set_verbose
from ._utils import suppress_stdout_stderr
@ -72,7 +73,7 @@ class Llama:
vocab_only: bool = False,
use_mmap: bool = True,
use_mlock: bool = False,
kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None,
kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None,
# Context Params
seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
n_ctx: int = 512,
@ -91,6 +92,7 @@ class Llama:
logits_all: bool = False,
embedding: bool = False,
offload_kqv: bool = True,
flash_attn: bool = False,
# Sampling Params
last_n_tokens_size: int = 64,
# LoRA Params
@ -167,6 +169,7 @@ class Llama:
logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
embedding: Embedding mode only.
offload_kqv: Offload K, Q, V to GPU.
flash_attn: Use flash attention.
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
lora_path: Path to a LoRA file to apply to the model.
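A minimal usage sketch for the new flag (model path is a placeholder; flash attention also requires a llama.cpp build that supports it):

```python
from llama_cpp import Llama

llm = Llama(
    model_path="./models/7B/llama-model.gguf",  # placeholder path
    n_gpu_layers=-1,
    flash_attn=True,  # newly exposed context parameter
)
```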
@ -253,6 +256,13 @@ class Llama:
elif isinstance(v, float):
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
self._kv_overrides_array[i].value.float_value = v
elif isinstance(v, str): # type: ignore
v_bytes = v.encode("utf-8")
if len(v_bytes) > 128: # TODO: Make this a constant
raise ValueError(f"Value for {k} is too long: {v}")
v_bytes = v_bytes.ljust(128, b"\0")
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
self._kv_overrides_array[i].value.str_value[:128] = v_bytes
else:
raise ValueError(f"Unknown value type for {k}: {v}")
@ -302,6 +312,7 @@ class Llama:
) # Must be set to True for speculative decoding
self.context_params.embeddings = embedding # TODO: Rename to embeddings
self.context_params.offload_kqv = offload_kqv
self.context_params.flash_attn = flash_attn
# KV cache quantization
if type_k is not None:
self.context_params.type_k = type_k
@ -760,7 +771,7 @@ class Llama:
input = input if isinstance(input, list) else [input]
# get numeric embeddings
embeds: List[List[float]]
embeds: Union[List[List[float]], List[List[List[float]]]]
total_tokens: int
embeds, total_tokens = self.embed(input, return_count=True) # type: ignore
@ -787,7 +798,7 @@ class Llama:
def embed(
self,
input: Union[str, List[str]],
normalize: bool = True,
normalize: bool = False,
truncate: bool = True,
return_count: bool = False,
):
@ -803,6 +814,10 @@ class Llama:
n_embd = self.n_embd()
n_batch = self.n_batch
# get pooling information
pooling_type = self.pooling_type()
logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE
if self.context_params.embeddings == False:
raise RuntimeError(
"Llama model must be created with embedding=True to call this method"
@ -820,29 +835,37 @@ class Llama:
self._batch.reset()
# decode and fetch embeddings
data: List[List[float]] = []
data: Union[List[List[float]], List[List[List[float]]]] = []
def decode_batch(n_seq: int):
def decode_batch(seq_sizes: List[int]):
assert self._ctx.ctx is not None
llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
self._ctx.decode(self._batch)
self._batch.reset()
# store embeddings
for i in range(n_seq):
ptr = llama_cpp.llama_get_embeddings_seq(
self._ctx.ctx, i
)
if not ptr:
raise RuntimeError("Failed to get embeddings from sequence pooling type is not set")
if pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE:
pos: int = 0
for i, size in enumerate(seq_sizes):
ptr = llama_cpp.llama_get_embeddings(self._ctx.ctx)
embedding: List[List[float]] = [
ptr[pos + j * n_embd : pos + (j + 1) * n_embd] for j in range(size)
]
if normalize:
embedding = [_normalize_embedding(e) for e in embedding]
data.append(embedding)
pos += size
else:
for i in range(len(seq_sizes)):
ptr = llama_cpp.llama_get_embeddings_seq(self._ctx.ctx, i)
embedding: List[float] = ptr[:n_embd]
if normalize:
norm = float(np.linalg.norm(embedding))
embedding = [v / norm for v in embedding]
embedding = _normalize_embedding(embedding)
data.append(embedding)
# init state
total_tokens = 0
s_batch = []
t_batch = 0
p_batch = 0
@ -863,17 +886,21 @@ class Llama:
# time to eval batch
if t_batch + n_tokens > n_batch:
decode_batch(p_batch)
decode_batch(s_batch)
s_batch = []
t_batch = 0
p_batch = 0
# add to batch
self._batch.add_sequence(tokens, p_batch, False)
self._batch.add_sequence(tokens, p_batch, logits_all)
# update batch stats
s_batch.append(n_tokens)
t_batch += n_tokens
p_batch += 1
# handle last batch
decode_batch(p_batch)
decode_batch(s_batch)
if self.verbose:
llama_cpp.llama_print_timings(self._ctx.ctx)
@ -1750,6 +1777,7 @@ class Llama:
logits_all=self.context_params.logits_all,
embedding=self.context_params.embeddings,
offload_kqv=self.context_params.offload_kqv,
flash_attn=self.context_params.flash_attn,
# Sampling Params
last_n_tokens_size=self.last_n_tokens_size,
# LoRA Params
@ -1845,6 +1873,10 @@ class Llama:
"""Return the newline token."""
return self._model.token_nl()
def pooling_type(self) -> int:
"""Return the pooling type."""
return self._ctx.pooling_type()
@staticmethod
def logits_to_logprobs(
logits: Union[npt.NDArray[np.single], List], axis: int = -1

View file

@ -6,6 +6,8 @@ import ctypes
import dataclasses
import random
import string
from contextlib import ExitStack
from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol, cast
import jinja2
@ -1828,15 +1830,23 @@ def functionary_v1_v2_chat_handler(
version: Literal["v1", "v2"],
functions: Optional[List[llama_types.ChatCompletionFunctions]] = None,
tools: Optional[List[llama_types.ChatCompletionTool]] = None,
tool_choice: Union[Dict, str] = "auto",
):
all_messages: List[llama_types.ChatCompletionRequestMessage] = []
if tool_choice == "none":
all_messages.append(
llama_types.ChatCompletionRequestSystemMessage(
role="system", content=generate_schema_from_functions([])
)
)
else:
if functions is not None:
all_messages.append(
llama_types.ChatCompletionRequestSystemMessage(
role="system", content=generate_schema_from_functions(functions)
)
)
elif tools is not None:
elif tools is not None and tool_choice != "none":
all_messages.append(
llama_types.ChatCompletionRequestSystemMessage(
role="system",
@ -1888,7 +1898,7 @@ def functionary_v1_v2_chat_handler(
function_call = "auto"
prompt = prepare_messages_for_inference(
messages, tokenizer, version, functions, tools
messages, tokenizer, version, functions, tools, function_call
)
# If no tools/functions are provided
@ -1985,17 +1995,12 @@ def functionary_v1_v2_chat_handler(
content = ""
function_calls, function_bodies = [], []
completion_tokens = 0
if version == "v1":
# If no or "auto" tool_choice/function_call
if isinstance(function_call, str) and function_call == "auto":
stops = ["\n", END_ASSISTANT_TOKEN]
# If tool_choice/function_call is "none"
elif isinstance(function_call, str) and function_call == "none":
prompt = prepare_messages_for_inference(
messages, tokenizer, version, [], []
)
stops = END_ASSISTANT_TOKEN
# If tool_choice/function_call is provided
elif isinstance(function_call, dict):
prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n"
@ -2009,12 +2014,15 @@ def functionary_v1_v2_chat_handler(
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
completion_tokens += completion["usage"]["completion_tokens"]
# If the generation does not involve a function call
if (
START_FUNCTION_CALL_TOKEN not in prompt
and START_FUNCTION_CALL_TOKEN not in completion_text
):
completion["usage"]["completion_tokens"] = completion_tokens
return _convert_completion_to_chat(completion, stream=stream) # type: ignore
# If the generation involves a function call in completion, generate the parameters
elif (
@ -2032,23 +2040,14 @@ def functionary_v1_v2_chat_handler(
)
grammar = get_grammar(function_calls[-1])
completion = create_completion(stop=END_FUNCTION_CALL_TOKEN)
completion_tokens += completion["usage"]["completion_tokens"]
function_bodies.append(completion["choices"][0]["text"].strip())
# If the prompt involves a function call, just append generated parameters to function_bodies
else:
function_bodies.append(completion_text.strip())
else:
# If tool_choice/function_call is "none"
if isinstance(function_call, str) and function_call == "none":
prompt = (
prepare_messages_for_inference(messages, tokenizer, version, [], [])
+ "all\n<|content|>"
)
stops = [STOP_TOKEN, FROM_TOKEN]
completion = create_completion(stop=stops)
completion["choices"][0]["text"] = completion["choices"][0]["text"].strip()
return _convert_completion_to_chat(completion, stream=stream) # type: ignore
# If tool_choice/function_call is provided
elif isinstance(function_call, dict):
if isinstance(function_call, dict):
prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
function_call = function_call["name"]
function_calls.append(function_call)
@ -2056,6 +2055,7 @@ def functionary_v1_v2_chat_handler(
stops = [STOP_TOKEN, FROM_TOKEN]
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
completion_tokens += completion["usage"]["completion_tokens"]
function_bodies.append(completion_text.strip())
# If "auto" or no tool_choice/function_call
elif isinstance(function_call, str) and function_call == "auto":
@ -2065,6 +2065,7 @@ def functionary_v1_v2_chat_handler(
stops = CONTENT_TOKEN
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
completion_tokens += completion["usage"]["completion_tokens"]
function_name = completion_text.strip()
if function_name == "all":
prompt += "all\n<|content|>"
@ -2077,12 +2078,23 @@ def functionary_v1_v2_chat_handler(
stops = [RECIPIENT_TOKEN, STOP_TOKEN]
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
completion_tokens += completion["usage"]["completion_tokens"]
if function_name == "all":
content += completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n")
if completion_text.endswith("\n<|from|>assistant\n"):
content += completion_text[:-len("\n<|from|>assistant\n")]
elif completion_text.endswith("\n<|from|> assistant\n"):
content += completion_text[:-len("\n<|from|> assistant\n")]
else:
content += completion_text
content = content.lstrip()
# Check whether the model wants to generate another turn
if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
cleaned_completion_text = completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n").strip()
if completion_text.endswith("\n<|from|>assistant\n"):
cleaned_completion_text = completion_text[:-len("\n<|from|>assistant\n")].strip()
elif completion_text.endswith("\n<|from|> assistant\n"):
cleaned_completion_text = completion_text[:-len("\n<|from|> assistant\n")].strip()
else:
cleaned_completion_text = completion_text.strip()
prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
else:
break
@ -2092,6 +2104,7 @@ def functionary_v1_v2_chat_handler(
prompt += completion_text.strip()
grammar = None
completion = create_completion(stop=stops)
completion_tokens += completion["usage"]["completion_tokens"]
if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
prompt += "\n<|from|>assistant\n<|recipient|>"
else:
@ -2120,12 +2133,16 @@ def functionary_v1_v2_chat_handler(
)
# TODO: support stream mode
function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = {
"function_call": {
function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = {}
if len(tool_calls) > 0:
if tools is not None:
function_call_dict["tool_calls"] = tool_calls
else:
function_call_dict["function_call"] = {
"name": tool_calls[0]["function"]["name"],
"arguments": tool_calls[0]["function"]["arguments"],
}
} if len(tool_calls) == 1 else {}
completion["usage"]["completion_tokens"] = completion_tokens
return llama_types.CreateChatCompletionResponse(
id="chat" + completion["id"],
object="chat.completion",
@ -2138,7 +2155,6 @@ def functionary_v1_v2_chat_handler(
"message": {
"role": "assistant",
"content": None if content == "" else content,
"tool_calls": tool_calls,
**function_call_dict,
},
"finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop",
@ -2149,42 +2165,86 @@ def functionary_v1_v2_chat_handler(
class Llava15ChatHandler:
_clip_free = None
DEFAULT_SYSTEM_MESSAGE = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
def __init__(self, clip_model_path: str, verbose: bool = False):
CHAT_FORMAT = (
"{% for message in messages %}"
"{% if message.role == 'system' %}"
"{{ message.content }}"
"{% endif %}"
"{% if message.role == 'user' %}"
"{% if message.content is string %}"
"\nUSER: {{ message.content }}"
"{% endif %}"
"{% if message.content is iterable %}"
"\nUSER: "
"{% for content in message.content %}"
"{% if content.type == 'image_url' and content.image_url is string %}"
"{{ content.image_url }}"
"{% endif %}"
"{% if content.type == 'image_url' and content.image_url is mapping %}"
"{{ content.image_url.url }}"
"{% endif %}"
"{% endfor %}"
"{% for content in message.content %}"
"{% if content.type == 'text' %}"
"{{ content.text }}"
"{% endif %}"
"{% endfor %}"
"{% endif %}"
"{% endif %}"
"{% if message.role == 'assistant' and message.content is not none %}"
"\nASSISTANT: {{ message.content }}"
"{% endif %}"
"{% endfor %}"
"{% if add_generation_prompt %}"
"\nASSISTANT: "
"{% endif %}"
)
def __init__(self, clip_model_path: str, verbose: bool = True):
import llama_cpp.llava_cpp as llava_cpp
self._llava_cpp = llava_cpp
self.clip_model_path = clip_model_path
self.verbose = verbose
self._clip_free = self._llava_cpp._libllava.clip_free # type: ignore
self._llava_cpp = llava_cpp # TODO: Fix
self._exit_stack = ExitStack()
self._last_image_embed: Optional[llava_cpp.CtypesPointer[llava_cpp.llava_image_embed]] = None
self._last_image_hash: Optional[int] = None
if not os.path.exists(clip_model_path):
raise ValueError(f"Clip model path does not exist: {clip_model_path}")
with suppress_stdout_stderr(disable=self.verbose):
self.clip_ctx = self._llava_cpp.clip_model_load(
clip_ctx = self._llava_cpp.clip_model_load(
self.clip_model_path.encode(), 0
)
def __del__(self):
if clip_ctx is None:
raise ValueError(f"Failed to load clip model: {clip_model_path}")
self.clip_ctx = clip_ctx
def clip_free():
with suppress_stdout_stderr(disable=self.verbose):
if self.clip_ctx is not None and self._clip_free is not None:
self._clip_free(self.clip_ctx)
self.clip_ctx = None
self._llava_cpp.clip_free(self.clip_ctx)
self._exit_stack.callback(clip_free)
def last_image_embed_free():
with suppress_stdout_stderr(disable=self.verbose):
if self._last_image_embed is not None:
self._llava_cpp.llava_image_embed_free(self._last_image_embed)
self._last_image_embed = None
self._exit_stack.callback(last_image_embed_free)
def load_image(self, image_url: str) -> bytes:
if image_url.startswith("data:"):
import base64
image_bytes = base64.b64decode(image_url.split(",")[1])
return image_bytes
else:
import urllib.request
with urllib.request.urlopen(image_url) as f:
image_bytes = f.read()
return image_bytes
return self._load_image(image_url)
def __call__(
self,
@ -2202,6 +2262,7 @@ class Llava15ChatHandler:
typical_p: float = 1.0,
stream: bool = False,
stop: Optional[Union[str, List[str]]] = [],
seed: Optional[int] = None,
response_format: Optional[
llama_types.ChatCompletionRequestResponseFormat
] = None,
@ -2216,68 +2277,54 @@ class Llava15ChatHandler:
model: Optional[str] = None,
logits_processor: Optional[llama.LogitsProcessorList] = None,
grammar: Optional[llama.LlamaGrammar] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
**kwargs, # type: ignore
) -> Union[
llama_types.CreateChatCompletionResponse,
Iterator[llama_types.CreateChatCompletionStreamResponse],
]:
assert (
llama.context_params.logits_all is True
) # BUG: logits_all=True is required for llava
assert self.clip_ctx is not None
system_prompt = _get_system_message(messages)
system_prompt = (
system_prompt
if system_prompt != ""
else "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
)
user_role = "\nUSER:"
assistant_role = "\nASSISTANT:"
llama.reset()
llama.eval(llama.tokenize(system_prompt.encode("utf8"), add_bos=True))
for message in messages:
if message["role"] == "user" and message["content"] is not None:
if isinstance(message["content"], str):
llama.eval(
llama.tokenize(
f"{user_role} {message['content']}".encode("utf8"),
add_bos=False,
)
)
else:
assert isinstance(message["content"], list)
llama.eval(
llama.tokenize(f"{user_role} ".encode("utf8"), add_bos=False)
)
for content in message["content"]:
if content["type"] == "text":
llama.eval(
llama.tokenize(
f"{content['text']}".encode("utf8"), add_bos=False
)
)
if content["type"] == "image_url":
image_bytes = (
self.load_image(content["image_url"]["url"])
if isinstance(content["image_url"], dict)
else self.load_image(content["image_url"])
)
import array
data_array = array.array("B", image_bytes)
c_ubyte_ptr = (
ctypes.c_ubyte * len(data_array)
).from_buffer(data_array)
system_prompt = _get_system_message(messages)
if system_prompt == "":
messages = [llama_types.ChatCompletionRequestSystemMessage(role="system", content=self.DEFAULT_SYSTEM_MESSAGE)] + messages
image_urls = self.get_image_urls(messages)
template = jinja2.Template(self.CHAT_FORMAT)
text = template.render(messages=messages, add_generation_prompt=True)
split_text = self.split_text_on_image_urls(text, image_urls)
def embed_image_bytes(image_bytes: bytes):
if self._last_image_embed is not None and self._last_image_hash is not None and hash(image_bytes) == self._last_image_hash:
return self._last_image_embed
with suppress_stdout_stderr(disable=self.verbose):
embed = (
self._llava_cpp.llava_image_embed_make_with_bytes(
self.clip_ctx,
llama.context_params.n_threads,
c_ubyte_ptr,
llama.context_params.n_threads_batch,
(ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)),
len(image_bytes),
)
)
try:
self._last_image_embed = embed
self._last_image_hash = hash(image_bytes)
return embed
# Evaluate prompt
llama.reset()
for i, (type_, value) in enumerate(split_text):
if type_ == "text":
tokens = llama.tokenize(value.encode("utf8"), add_bos=i == 0)
if llama.n_tokens + len(tokens) > llama.n_ctx():
raise ValueError("Prompt exceeds n_ctx") # TODO: Fix
llama.eval(tokens)
else:
image_bytes = self.load_image(value)
embed = embed_image_bytes(image_bytes)
if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx():
raise ValueError("Prompt exceeds n_ctx") # TODO: Fix
n_past = ctypes.c_int(llama.n_tokens)
n_past_p = ctypes.pointer(n_past)
with suppress_stdout_stderr(disable=self.verbose):
@ -2287,36 +2334,66 @@ class Llava15ChatHandler:
llama.n_batch,
n_past_p,
)
assert llama.n_ctx() >= n_past.value
llama.n_tokens = n_past.value
finally:
with suppress_stdout_stderr(disable=self.verbose):
self._llava_cpp.llava_image_embed_free(embed)
if message["role"] == "assistant" and message["content"] is not None:
llama.eval(
llama.tokenize(
f"ASSISTANT: {message['content']}".encode("utf8"), add_bos=False
)
)
assert llama.n_ctx() >= llama.n_tokens
llama.eval(llama.tokenize(f"{assistant_role}".encode("utf8"), add_bos=False))
assert llama.n_ctx() >= llama.n_tokens
# Get prompt tokens to avoid a cache miss
prompt = llama.input_ids[: llama.n_tokens].tolist()
if response_format is not None and response_format["type"] == "json_object":
grammar = _grammar_for_response_format(response_format)
return _convert_completion_to_chat(
llama.create_completion(
# Convert legacy functions to tools
if functions is not None:
tools = [
{
"type": "function",
"function": function,
}
for function in functions
]
# Convert legacy function_call to tool_choice
if function_call is not None:
if isinstance(function_call, str) and (
function_call == "none" or function_call == "auto"
):
tool_choice = function_call
if isinstance(function_call, dict) and "name" in function_call:
tool_choice = {
"type": "function",
"function": {
"name": function_call["name"],
},
}
tool = None
if tool_choice is not None and isinstance(tool_choice, dict) and tools is not None:
name = tool_choice["function"]["name"]
tool = next((t for t in tools if t["function"]["name"] == name), None)
if tool is None:
raise ValueError(f"Tool choice '{name}' not found in tools.")
schema = tool["function"]["parameters"]
try:
# create grammar from json schema
grammar = llama_grammar.LlamaGrammar.from_json_schema(
json.dumps(schema), verbose=llama.verbose
)
except Exception as e:
grammar = llama_grammar.LlamaGrammar.from_string(
llama_grammar.JSON_GBNF, verbose=llama.verbose
)
completion_or_chunks = llama.create_completion(
prompt=prompt,
temperature=temperature,
top_p=top_p,
top_k=top_k,
min_p=min_p,
typical_p=typical_p,
logprobs=top_logprobs if logprobs else None,
stream=stream,
stop=stop,
seed=seed,
max_tokens=max_tokens,
presence_penalty=presence_penalty,
frequency_penalty=frequency_penalty,
@ -2328,8 +2405,370 @@ class Llava15ChatHandler:
model=model,
logits_processor=logits_processor,
grammar=grammar,
),
stream=stream,
logit_bias=logit_bias,
)
if tool is not None:
tool_name = tool["function"]["name"]
return _convert_completion_to_chat_function(
tool_name, completion_or_chunks, stream
)
return _convert_completion_to_chat(completion_or_chunks, stream=stream)
@staticmethod
def _load_image(image_url: str) -> bytes:
# TODO: Add Pillow support for other image formats beyond (jpg, png)
if image_url.startswith("data:"):
import base64
image_bytes = base64.b64decode(image_url.split(",")[1])
return image_bytes
else:
import urllib.request
with urllib.request.urlopen(image_url) as f:
image_bytes = f.read()
return image_bytes
@staticmethod
def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]):
image_urls: List[str] = []
for message in messages:
if message["role"] == "user":
if message["content"] is None:
continue
for content in message["content"]:
if isinstance(content, dict) and "type" in content:
if content["type"] == "image_url":
if (
isinstance(content["image_url"], dict)
and "url" in content["image_url"]
):
image_urls.append(content["image_url"]["url"])
else:
image_urls.append(content["image_url"])
return image_urls
@staticmethod
def split_text_on_image_urls(text: str, image_urls: List[str]):
def find_first(s: str, substrs: List[str]):
for i, substr in enumerate(substrs):
pos = s.find(substr)
if pos != -1:
return pos, i
return None, None
split_text: List[Tuple[Literal["text", "image_url"], str]] = []
remaining = text
while remaining:
# Find first image_url
pos, i = find_first(remaining, image_urls)
if pos is not None and i is not None:
if pos > 0:
split_text.append(("text", remaining[:pos]))
split_text.append(("image_url", image_urls[i]))
remaining = remaining[pos + len(image_urls[i]) :]
else:
split_text.append(("text", remaining))
remaining = ""
return split_text
@classmethod
def from_pretrained(
cls,
repo_id: str,
filename: Optional[str],
local_dir: Optional[Union[str, os.PathLike[str]]] = None,
local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
cache_dir: Optional[Union[str, os.PathLike[str]]] = None,
**kwargs: Any,
) -> "Llava15ChatHandler":
import fnmatch
from pathlib import Path
try:
from huggingface_hub import hf_hub_download, HfFileSystem # type: ignore
from huggingface_hub.utils import validate_repo_id # type: ignore
except ImportError:
raise ImportError(
"Llama.from_pretrained requires the huggingface-hub package. "
"You can install it with `pip install huggingface-hub`."
)
validate_repo_id(repo_id)
hffs = HfFileSystem()
files = [
file["name"] if isinstance(file, dict) else file
for file in hffs.ls(repo_id) # type: ignore
]
# split each file into repo_id, subfolder, filename
file_list: List[str] = []
for file in files:
rel_path = Path(file).relative_to(repo_id)
file_list.append(str(rel_path))
matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)] # type: ignore
if len(matching_files) == 0:
raise ValueError(
f"No file found in {repo_id} that match {filename}\n\n"
f"Available Files:\n{json.dumps(file_list)}"
)
if len(matching_files) > 1:
raise ValueError(
f"Multiple files found in {repo_id} matching {filename}\n\n"
f"Available Files:\n{json.dumps(files)}"
)
(matching_file,) = matching_files
subfolder = str(Path(matching_file).parent)
filename = Path(matching_file).name
# download the file
hf_hub_download(
repo_id=repo_id,
filename=filename,
subfolder=subfolder,
local_dir=cast(Union[str, Path, None], local_dir),
local_dir_use_symlinks=local_dir_use_symlinks,
cache_dir=cast(Union[str, Path, None], cache_dir),
)
if local_dir is None:
model_path = hf_hub_download(
repo_id=repo_id,
filename=filename,
subfolder=subfolder,
local_dir=local_dir,
local_dir_use_symlinks=local_dir_use_symlinks,
cache_dir=cast(Union[str, Path, None], cache_dir),
local_files_only=True,
)
else:
model_path = os.path.join(local_dir, filename)
return cls(
clip_model_path=model_path,
**kwargs,
)
class ObsidianChatHandler(Llava15ChatHandler):
# Prompt Format
# The model follows the ChatML format, however with ### as the separator
# <|im_start|>user
# What is this sign about?\n<image>
# ###
# <|im_start|>assistant
# The sign is about bullying, and it is placed on a black background with a red background.
# ###
CHAT_FORMAT = (
"{% for message in messages %}"
# System message
"{% if message.role == 'system' %}"
"<|im_start|>system\n"
"{{ message.content }}\n"
"###\n"
"{% endif %}"
# User message
"{% if message.role == 'user' %}"
"<|im_start|>user\n"
"{% if message.content is string %}"
"{{ message.content }}"
"{% endif %}"
"{% if message.content is iterable %}"
"{% for content in message.content %}"
"{% if content.type == 'image_url' and content.image_url is string %}"
"{{ content.image_url }}"
"{% endif %}"
"{% if content.type == 'image_url' and content.image_url is mapping %}"
"{{ content.image_url.url }}"
"{% endif %}"
"{% endfor %}"
"{% for content in message.content %}"
"{% if content.type == 'text' %}"
"{{ content.text }}"
"{% endif %}"
"{% endfor %}"
"{% endif %}"
"###\n"
"{% endif %}"
# Assistant message
"{% if message.role == 'assistant' %}"
"<|im_start|>assistant\n"
"{{ message.content }}"
"###\n"
"{% endif %}"
"{% endfor %}"
# Generation prompt
"{% if add_generation_prompt %}"
"<|im_start|>assistant\n"
"{% endif %}"
)
class MoondreamChatHandler(Llava15ChatHandler):
# Chat Format:
# f"<image>\n\n{chat_history}Question: {question}\n\nAnswer:"
CHAT_FORMAT = (
"{% for message in messages %}"
"{% if message.role == 'user' %}"
"{% if message.content is iterable %}"
# <image>
"{% for content in message.content %}"
"{% if content.type == 'image_url' %}"
"{% if content.image_url is string %}"
"{{ content.image_url }}\n\n"
"{% endif %}"
"{% if content.image_url is mapping %}"
"{{ content.image_url.url }}\n\n"
"{% endif %}"
"{% endif %}"
"{% endfor %}"
# Question:
"{% for content in message.content %}"
"{% if content.type == 'text' %}"
"Question: {{ content.text }}\n\n"
"{% endif %}"
"{% endfor %}"
"{% endif %}"
# Question:
"{% if message.content is string %}"
"Question: {{ message.content }}\n\n"
"{% endif %}"
"{% endif %}"
# Answer:
"{% if message.role == 'assistant' %}"
"Answer:{{ message.content }}\n\n"
"{% endif %}"
"{% endfor %}"
# Generation prompt
"{% if add_generation_prompt %}"
"Answer:"
"{% endif %}"
)
class Llava16ChatHandler(Llava15ChatHandler):
DEFAULT_SYSTEM_MESSAGE = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. "
# Example prompt
# "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nWhat is shown in this image? ASSISTANT:"
CHAT_FORMAT = (
"{% for message in messages %}"
"{% if message.role == 'system' %}"
"{{ message.content }}"
"{% endif %}"
"{% if message.role == 'user' %}"
"{% if message.content is iterable %}"
# <image>
"{% for content in message.content %}"
"{% if content.type == 'image_url' %}"
"{% if content.image_url is string %}"
"{{ content.image_url }}\n"
"{% endif %}"
"{% if content.image_url is mapping %}"
"{{ content.image_url.url }}\n"
"{% endif %}"
"{% endif %}"
"{% endfor %}"
# Question:
"{% for content in message.content %}"
"{% if content.type == 'text' %}"
"{{ content.text }}"
"{% endif %}"
"{% endfor %}"
"{% endif %}"
# Question:
"{% if message.content is string %}"
"{{ message.content }}"
"{% endif %}"
"{% endif %}"
# Answer:
"{% if message.role == 'assistant' %}"
"{{ message.content }}"
"{% endif %}"
"{% endfor %}"
# Generation prompt
"{% if add_generation_prompt %}"
"Answer:"
"{% endif %}"
)
class NanoLlavaChatHandler(Llava15ChatHandler):
# Prompt Format
# The model follows the ChatML standard, however without \n at the end of <|im_end|>:
# <|im_start|>system
# Answer the question<|im_end|><|im_start|>user
# <image>
# What is the picture about?<|im_end|><|im_start|>assistant
CHAT_FORMAT = (
"{% for message in messages %}"
# System message
"{% if message.role == 'system' %}"
"<|im_start|>system\n"
"{{ message.content }}"
"<|im_end|>"
"{% endif %}"
# User message
"{% if message.role == 'user' %}"
"<|im_start|>user\n"
"{% if message.content is string %}"
"{{ message.content }}"
"{% endif %}"
"{% if message.content is iterable %}"
"{% for content in message.content %}"
"{% if content.type == 'image_url' and content.image_url is string %}"
"{{ content.image_url }}"
"{% endif %}"
"{% if content.type == 'image_url' and content.image_url is mapping %}"
"{{ content.image_url.url }}"
"{% endif %}"
"{% endfor %}"
"{% for content in message.content %}"
"{% if content.type == 'text' %}"
"{{ content.text }}"
"{% endif %}"
"{% endfor %}"
"{% endif %}"
"<|im_end|>"
"{% endif %}"
# Assistant message
"{% if message.role == 'assistant' %}"
"<|im_start|>assistant\n"
"{{ message.content }}"
"<|im_end|>"
"{% endif %}"
"{% endfor %}"
# Generation prompt
"{% if add_generation_prompt %}"
"<|im_start|>assistant\n"
"{% endif %}"
)
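# All of the templates above accept message.content either as a plain string or
# as a list of typed parts, where image_url may be a bare string or a mapping
# with a "url" key. A sketch of the message shape they consume (values are
# illustrative, not part of this diff):
example_messages = [
    {"role": "system", "content": "Answer the question"},
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "file:///tmp/example.png"}},
            {"type": "text", "text": "What is the picture about?"},
        ],
    },
    {"role": "assistant", "content": "It shows a cat sitting on a sofa."},
]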

View file

@ -242,8 +242,8 @@ LLAMA_FILE_MAGIC_GGSQ = 0x67677371
# define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
# define LLAMA_SESSION_VERSION 5
LLAMA_SESSION_VERSION = 5
# define LLAMA_SESSION_VERSION 6
LLAMA_SESSION_VERSION = 6
# define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
LLAMA_STATE_SEQ_MAGIC = LLAMA_FILE_MAGIC_GGSQ
@ -284,6 +284,27 @@ LLAMA_VOCAB_TYPE_WPM = 3
"""BERT tokenizer based on WordPiece"""
# // pre-tokenization types
# enum llama_vocab_pre_type {
# LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
# LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
# LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
# LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
# LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
# LLAMA_VOCAB_PRE_TYPE_MPT = 5,
# LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
# LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
# };
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3
LLAMA_VOCAB_PRE_TYPE_FALCON = 4
LLAMA_VOCAB_PRE_TYPE_MPT = 5
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7
# // note: these values should be synchronized with ggml_rope
# // TODO: maybe move this enum to ggml.h (ggml_rope_type)
# enum llama_rope_type {
@ -552,19 +573,25 @@ class llama_batch(ctypes.Structure):
# LLAMA_KV_OVERRIDE_TYPE_INT,
# LLAMA_KV_OVERRIDE_TYPE_FLOAT,
# LLAMA_KV_OVERRIDE_TYPE_BOOL,
# LLAMA_KV_OVERRIDE_TYPE_STR,
# };
LLAMA_KV_OVERRIDE_TYPE_INT = 0
LLAMA_KV_OVERRIDE_TYPE_FLOAT = 1
LLAMA_KV_OVERRIDE_TYPE_BOOL = 2
LLAMA_KV_OVERRIDE_TYPE_STR = 3
# struct llama_model_kv_override {
# char key[128];
# enum llama_model_kv_override_type tag;
# char key[128];
# union {
# int64_t int_value;
# double float_value;
# bool bool_value;
# int64_t val_i64;
# double val_f64;
# bool val_bool;
# char val_str[128];
# };
# };
class llama_model_kv_override_value(ctypes.Union):
@ -572,16 +599,28 @@ class llama_model_kv_override_value(ctypes.Union):
("int_value", ctypes.c_int64),
("float_value", ctypes.c_double),
("bool_value", ctypes.c_bool),
("str_value", ctypes.c_char * 128),
]
if TYPE_CHECKING:
int_value: int
float_value: float
bool_value: bool
str_value: bytes
class llama_model_kv_override(ctypes.Structure):
_fields_ = [
("key", ctypes.c_char * 128),
("tag", ctypes.c_int),
("key", ctypes.c_char * 128),
("value", llama_model_kv_override_value),
]
if TYPE_CHECKING:
tag: int
key: bytes
value: Union[int, float, bool, bytes]
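# A minimal sketch (not part of this diff; the metadata key is hypothetical) of
# building an overrides array from the structures above and attaching it to
# llama_model_params at the ctypes level:
import ctypes
from llama_cpp.llama_cpp import (
    LLAMA_KV_OVERRIDE_TYPE_BOOL,
    llama_model_kv_override,
    llama_model_default_params,
)

# One real override plus a zeroed terminator element (key left empty).
overrides = (llama_model_kv_override * 2)()
overrides[0].key = b"tokenizer.ggml.add_bos_token"   # hypothetical metadata key
overrides[0].tag = LLAMA_KV_OVERRIDE_TYPE_BOOL
overrides[0].value.bool_value = False

params = llama_model_default_params()
params.kv_overrides = ctypes.cast(overrides, ctypes.POINTER(llama_model_kv_override))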
# struct llama_model_params {
# int32_t n_gpu_layers; // number of layers to store in VRAM
@ -612,6 +651,7 @@ class llama_model_kv_override(ctypes.Structure):
# bool vocab_only; // only load the vocabulary, no weights
# bool use_mmap; // use mmap if possible
# bool use_mlock; // force system to keep model in RAM
# bool check_tensors; // validate model tensor data
# };
class llama_model_params(ctypes.Structure):
"""Parameters for llama_model
@ -626,7 +666,8 @@ class llama_model_params(ctypes.Structure):
kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
vocab_only (bool): only load the vocabulary, no weights
use_mmap (bool): use mmap if possible
use_mlock (bool): force system to keep model in RAM"""
use_mlock (bool): force system to keep model in RAM
check_tensors (bool): validate model tensor data"""
if TYPE_CHECKING:
n_gpu_layers: int
@ -639,6 +680,7 @@ class llama_model_params(ctypes.Structure):
vocab_only: bool
use_mmap: bool
use_mlock: bool
check_tensors: bool
_fields_ = [
("n_gpu_layers", ctypes.c_int32),
@ -651,6 +693,7 @@ class llama_model_params(ctypes.Structure):
("vocab_only", ctypes.c_bool),
("use_mmap", ctypes.c_bool),
("use_mlock", ctypes.c_bool),
("check_tensors", ctypes.c_bool),
]
@ -687,6 +730,7 @@ class llama_model_params(ctypes.Structure):
# bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
# bool embeddings; // if true, extract embeddings (together with logits)
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
# bool flash_attn; // whether to use flash attention
# // Abort callback
@ -723,6 +767,7 @@ class llama_context_params(ctypes.Structure):
logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
embeddings (bool): if true, extract embeddings (together with logits)
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
flash_attn (bool): whether to use flash attention
abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
"""
@ -752,6 +797,7 @@ class llama_context_params(ctypes.Structure):
logits_all: bool
embeddings: bool
offload_kqv: bool
flash_attn: bool
abort_callback: Callable[[ctypes.c_void_p], bool]
abort_callback_data: ctypes.c_void_p
@ -780,6 +826,7 @@ class llama_context_params(ctypes.Structure):
("logits_all", ctypes.c_bool),
("embeddings", ctypes.c_bool),
("offload_kqv", ctypes.c_bool),
("flash_attn", ctypes.c_bool),
("abort_callback", ggml_abort_callback),
("abort_callback_data", ctypes.c_void_p),
]
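# A minimal sketch (not part of this diff; the model path is a placeholder) of
# setting the new boolean fields on the default param structs before loading a
# model and creating a context through the low-level bindings:
import llama_cpp.llama_cpp as llama_cpp

llama_cpp.llama_backend_init()

model_params = llama_cpp.llama_model_default_params()
model_params.check_tensors = True            # validate tensor data while loading

ctx_params = llama_cpp.llama_context_default_params()
ctx_params.flash_attn = True                 # new flag added above
ctx_params.offload_kqv = True

model = llama_cpp.llama_load_model_from_file(b"/path/to/model.gguf", model_params)
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)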
@ -811,6 +858,7 @@ It might not exist for progress report where '.' is output repeatedly."""
# bool quantize_output_tensor; // quantize output.weight
# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
# bool pure; // quantize all tensors to the default type
# bool keep_split; // quantize to the same number of shards
# void * imatrix; // pointer to importance matrix data
# void * kv_overrides; // pointer to vector containing overrides
# } llama_model_quantize_params;
@ -826,6 +874,7 @@ class llama_model_quantize_params(ctypes.Structure):
quantize_output_tensor (bool): quantize output.weight
only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
pure (bool): quantize all tensors to the default type
keep_split (bool): quantize to the same number of shards
imatrix (ctypes.c_void_p): pointer to importance matrix data
kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
"""
@ -839,6 +888,7 @@ class llama_model_quantize_params(ctypes.Structure):
quantize_output_tensor: bool
only_copy: bool
pure: bool
keep_split: bool
imatrix: ctypes.c_void_p
kv_overrides: ctypes.c_void_p
@ -851,6 +901,7 @@ class llama_model_quantize_params(ctypes.Structure):
("quantize_output_tensor", ctypes.c_bool),
("only_copy", ctypes.c_bool),
("pure", ctypes.c_bool),
("keep_split", ctypes.c_bool),
("imatrix", ctypes.c_void_p),
("kv_overrides", ctypes.c_void_p),
]
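# A minimal sketch (not part of this diff; file paths are placeholders) of using
# the quantize params, including the new keep_split flag, with the existing
# llama_model_quantize binding:
import ctypes
import llama_cpp.llama_cpp as llama_cpp

qparams = llama_cpp.llama_model_quantize_default_params()
qparams.nthread = 8
qparams.keep_split = True                    # new flag added above
llama_cpp.llama_model_quantize(
    b"/path/to/input.gguf", b"/path/to/output.gguf", ctypes.byref(qparams)
)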
@ -1037,8 +1088,7 @@ GGML_NUMA_STRATEGY_COUNT = 5
[ctypes.c_int],
None,
)
def llama_numa_init(numa: int, /):
...
def llama_numa_init(numa: int, /): ...
# // Call once at the end of the program - currently only used for MPI
@ -1063,8 +1113,7 @@ def llama_backend_free():
)
def llama_load_model_from_file(
path_model: bytes, params: llama_model_params, /
) -> Optional[llama_model_p]:
...
) -> Optional[llama_model_p]: ...
# LLAMA_API void llama_free_model(struct llama_model * model);
@ -1073,8 +1122,7 @@ def llama_load_model_from_file(
[llama_model_p_ctypes],
None,
)
def llama_free_model(model: llama_model_p, /):
...
def llama_free_model(model: llama_model_p, /): ...
# LLAMA_API struct llama_context * llama_new_context_with_model(
@ -1087,8 +1135,7 @@ def llama_free_model(model: llama_model_p, /):
)
def llama_new_context_with_model(
model: llama_model_p, params: llama_context_params, /
) -> Optional[llama_context_p]:
...
) -> Optional[llama_context_p]: ...
# // Frees all allocated memory
@ -1109,98 +1156,87 @@ def llama_free(ctx: llama_context_p, /):
[],
ctypes.c_int64,
)
def llama_time_us() -> int:
...
def llama_time_us() -> int: ...
# LLAMA_API size_t llama_max_devices(void);
@ctypes_function("llama_max_devices", [], ctypes.c_size_t)
def llama_max_devices() -> int:
...
def llama_max_devices() -> int: ...
# LLAMA_API bool llama_supports_mmap (void);
@ctypes_function("llama_supports_mmap", [], ctypes.c_bool)
def llama_supports_mmap() -> bool:
...
def llama_supports_mmap() -> bool: ...
# LLAMA_API bool llama_supports_mlock (void);
@ctypes_function("llama_supports_mlock", [], ctypes.c_bool)
def llama_supports_mlock() -> bool:
...
def llama_supports_mlock() -> bool: ...
# LLAMA_API bool llama_supports_gpu_offload(void);
@ctypes_function("llama_supports_gpu_offload", [], ctypes.c_bool)
def llama_supports_gpu_offload() -> bool:
...
def llama_supports_gpu_offload() -> bool: ...
# LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
@ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes)
def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]:
...
def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ...
# LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
@ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32)
def llama_n_ctx(ctx: llama_context_p, /) -> int:
...
def llama_n_ctx(ctx: llama_context_p, /) -> int: ...
# LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
@ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32)
def llama_n_batch(ctx: llama_context_p, /) -> int:
...
def llama_n_batch(ctx: llama_context_p, /) -> int: ...
# LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
@ctypes_function("llama_n_ubatch", [llama_context_p_ctypes], ctypes.c_uint32)
def llama_n_ubatch(ctx: llama_context_p, /) -> int:
...
def llama_n_ubatch(ctx: llama_context_p, /) -> int: ...
# LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
@ctypes_function("llama_n_seq_max", [llama_context_p_ctypes], ctypes.c_uint32)
def llama_n_seq_max(ctx: llama_context_p, /) -> int:
...
def llama_n_seq_max(ctx: llama_context_p, /) -> int: ...
# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
@ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int)
def llama_pooling_type(ctx: llama_context_p, /) -> int: ...
# LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
@ctypes_function("llama_vocab_type", [llama_model_p_ctypes], ctypes.c_int)
def llama_vocab_type(model: llama_model_p, /) -> int:
...
def llama_vocab_type(model: llama_model_p, /) -> int: ...
# LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
@ctypes_function("llama_rope_type", [llama_model_p_ctypes], ctypes.c_int)
def llama_rope_type(model: llama_model_p, /) -> int:
...
def llama_rope_type(model: llama_model_p, /) -> int: ...
# LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
@ctypes_function("llama_n_vocab", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_vocab(model: llama_model_p, /) -> int:
...
def llama_n_vocab(model: llama_model_p, /) -> int: ...
# LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
@ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_ctx_train(model: llama_model_p, /) -> int:
...
def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
# LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
@ctypes_function("llama_n_embd", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_embd(model: llama_model_p, /) -> int:
...
def llama_n_embd(model: llama_model_p, /) -> int: ...
# LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
@ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_layer(model: llama_model_p, /) -> int:
...
def llama_n_layer(model: llama_model_p, /) -> int: ...
# // Get the model's RoPE frequency scaling factor
@ -1583,7 +1619,7 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:
...
# // Clear the KV cache
# // Clear the KV cache - both cell info is erased and KV data is zeroed
# LLAMA_API void llama_kv_cache_clear(
# struct llama_context * ctx);
@ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None)
@ -1902,8 +1938,7 @@ def llama_state_load_file(
n_token_capacity: Union[ctypes.c_size_t, int],
n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
/,
) -> bool:
...
) -> bool: ...
# LLAMA_API DEPRECATED(bool llama_load_session_file(
@ -1931,8 +1966,7 @@ def llama_load_session_file(
n_token_capacity: Union[ctypes.c_size_t, int],
n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
/,
) -> int:
...
) -> int: ...
# LLAMA_API bool llama_state_save_file(
@ -1956,8 +1990,7 @@ def llama_state_save_file(
tokens: CtypesArray[llama_token],
n_token_count: Union[ctypes.c_size_t, int],
/,
) -> bool:
...
) -> bool: ...
# LLAMA_API DEPRECATED(bool llama_save_session_file(
@ -1982,8 +2015,7 @@ def llama_save_session_file(
tokens: CtypesArray[llama_token],
n_token_count: Union[ctypes.c_size_t, int],
/,
) -> int:
...
) -> int: ...
# // Get the exact size needed to copy the KV cache of a single sequence
@ -2061,8 +2093,7 @@ def llama_state_seq_save_file(
tokens: CtypesArray[llama_token],
n_token_count: Union[ctypes.c_size_t, int],
/,
) -> int:
...
) -> int: ...
# LLAMA_API size_t llama_state_seq_load_file(
@ -2092,8 +2123,7 @@ def llama_state_seq_load_file(
n_token_capacity: Union[ctypes.c_size_t, int],
n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
/,
) -> int:
...
) -> int: ...
# //
@ -2356,8 +2386,7 @@ def llama_get_embeddings_seq(
)
def llama_token_get_text(
model: llama_model_p, token: Union[llama_token, int], /
) -> bytes:
...
) -> bytes: ...
# LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
@ -2366,8 +2395,7 @@ def llama_token_get_text(
)
def llama_token_get_score(
model: llama_model_p, token: Union[llama_token, int], /
) -> float:
...
) -> float: ...
# LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
@ -2376,8 +2404,7 @@ def llama_token_get_score(
)
def llama_token_get_type(
model: llama_model_p, token: Union[llama_token, int], /
) -> int:
...
) -> int: ...
# // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
@ -2385,9 +2412,7 @@ def llama_token_get_type(
@ctypes_function(
"llama_token_is_eog", [llama_model_p_ctypes, llama_token], ctypes.c_bool
)
def llama_token_is_eog(
model: llama_model_p, token: Union[llama_token, int], /
) -> bool:
def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /) -> bool:
"""Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)"""
...
@ -2456,20 +2481,17 @@ def llama_token_prefix(model: llama_model_p) -> int:
# LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
@ctypes_function("llama_token_middle", [llama_model_p_ctypes], llama_token)
def llama_token_middle(model: llama_model_p, /) -> int:
...
def llama_token_middle(model: llama_model_p, /) -> int: ...
# LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
@ctypes_function("llama_token_suffix", [llama_model_p_ctypes], llama_token)
def llama_token_suffix(model: llama_model_p, /) -> int:
...
def llama_token_suffix(model: llama_model_p, /) -> int: ...
# LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token)
def llama_token_eot(model: llama_model_p, /) -> int:
...
def llama_token_eot(model: llama_model_p, /) -> int: ...
# //
@ -2610,8 +2632,7 @@ def llama_chat_apply_template(
chat: CtypesArray[llama_chat_message],
n_msg: int,
/,
) -> int:
...
) -> int: ...
# //
@ -3091,7 +3112,7 @@ def llama_sample_token_greedy(
...
# /// @details Randomly selects a token from the candidates based on their probabilities.
# /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
# LLAMA_API llama_token llama_sample_token(
# struct llama_context * ctx,
# llama_token_data_array * candidates);
@ -3224,8 +3245,7 @@ def llama_beam_search(
n_past: Union[ctypes.c_int, int],
n_predict: Union[ctypes.c_int, int],
/,
):
...
): ...
# /// @details Build a split GGUF final path for this chunk.
@ -3344,5 +3364,4 @@ def llama_log_set(
[ctypes.c_void_p, llama_context_p_ctypes],
None,
)
def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /):
...
def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /): ...

View file

@ -556,17 +556,11 @@ def add_rule(
# }
def decode_utf8(src: const_char_p) -> Tuple[int, const_char_p]:
"""Decodes a UTF-8 character from the source string."""
lookup = (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4)
first_byte = ord(src[0]) # type: int
highbits = first_byte >> 4 # type: int
len = lookup[highbits] # type: int
mask = (1 << (8 - len)) - 1 # type: int
value = first_byte & mask # type: int
end = src + len # type: const_char_p # may overrun!
pos = src + 1 # type: const_char_p
while pos < end and pos[0]:
value = (value << 6) + (ord(pos[0]) & 0x3F)
pos += 1
# Get the codepoint of the first character
value = ord(src[0])
# Move the pointer ahead one character
pos = src + 1
return value, pos

View file

@ -24,7 +24,7 @@ class EmbeddingUsage(TypedDict):
class Embedding(TypedDict):
index: int
object: str
embedding: List[float]
embedding: Union[List[float], List[List[float]]]
class CreateEmbeddingResponse(TypedDict):
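# A small sketch (not part of this diff) of consuming the widened field; the
# nested-list case presumably corresponds to per-token embeddings when pooling
# is disabled:
from llama_cpp.llama_types import Embedding

def embedding_vectors(item: Embedding) -> list:
    """Return a list of vectors whether the item holds one pooled vector or
    one vector per token."""
    emb = item["embedding"]
    if emb and isinstance(emb[0], list):  # List[List[float]]: one vector per token
        return emb
    return [emb]                          # List[float]: single pooled vector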

View file

@ -1,3 +1,5 @@
from __future__ import annotations
import sys
import os
import ctypes
@ -14,10 +16,22 @@ from ctypes import (
Structure,
)
import pathlib
from typing import List, Union, NewType, Optional, TypeVar, Callable, Any
from typing import (
List,
Union,
NewType,
Optional,
TypeVar,
Callable,
Any,
TYPE_CHECKING,
Generic,
)
from typing_extensions import TypeAlias
import llama_cpp.llama_cpp as llama_cpp
# Load the library
def _load_shared_library(lib_base_name: str):
# Construct the paths to the possible shared library names
@ -79,8 +93,27 @@ _libllava = _load_shared_library(_libllava_base_name)
# ctypes helper
if TYPE_CHECKING:
CtypesCData = TypeVar("CtypesCData", bound=ctypes._CData) # type: ignore
CtypesArray: TypeAlias = ctypes.Array[CtypesCData] # type: ignore
CtypesPointer: TypeAlias = ctypes._Pointer[CtypesCData] # type: ignore
CtypesVoidPointer: TypeAlias = ctypes.c_void_p
class CtypesRef(Generic[CtypesCData]):
pass
CtypesPointerOrRef: TypeAlias = Union[
CtypesPointer[CtypesCData], CtypesRef[CtypesCData]
]
CtypesFuncPointer: TypeAlias = ctypes._FuncPointer # type: ignore
F = TypeVar("F", bound=Callable[..., Any])
def ctypes_function_for_shared_library(lib: ctypes.CDLL):
def ctypes_function(
name: str, argtypes: List[Any], restype: Any, enabled: bool = True
@ -111,6 +144,7 @@ ctypes_function = ctypes_function_for_shared_library(_libllava)
clip_ctx_p = NewType("clip_ctx_p", int)
clip_ctx_p_ctypes = c_void_p
# struct llava_image_embed {
# float * embed;
# int n_image_pos;
@ -121,36 +155,72 @@ class llava_image_embed(Structure):
("n_image_pos", c_int),
]
# /** sanity check for clip <-> llava embed size match */
# LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
@ctypes_function("llava_validate_embed_size", [llama_cpp.llama_context_p_ctypes, clip_ctx_p_ctypes], c_bool)
def llava_validate_embed_size(ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, /) -> bool:
...
@ctypes_function(
"llava_validate_embed_size",
[llama_cpp.llama_context_p_ctypes, clip_ctx_p_ctypes],
c_bool,
)
def llava_validate_embed_size(
ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, /
) -> bool: ...
# /** build an image embed from image file bytes */
# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
@ctypes_function("llava_image_embed_make_with_bytes", [clip_ctx_p_ctypes, c_int, POINTER(c_uint8), c_int], POINTER(llava_image_embed))
def llava_image_embed_make_with_bytes(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_bytes: bytes, image_bytes_length: Union[c_int, int], /) -> "_Pointer[llava_image_embed]":
...
@ctypes_function(
"llava_image_embed_make_with_bytes",
[clip_ctx_p_ctypes, c_int, POINTER(c_uint8), c_int],
POINTER(llava_image_embed),
)
def llava_image_embed_make_with_bytes(
ctx_clip: clip_ctx_p,
n_threads: Union[c_int, int],
image_bytes: CtypesArray[c_uint8],
image_bytes_length: Union[c_int, int],
/,
) -> "_Pointer[llava_image_embed]": ...
# /** build an image embed from a path to an image filename */
# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
@ctypes_function("llava_image_embed_make_with_filename", [clip_ctx_p_ctypes, c_int, c_char_p], POINTER(llava_image_embed))
def llava_image_embed_make_with_filename(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, /) -> "_Pointer[llava_image_embed]":
...
@ctypes_function(
"llava_image_embed_make_with_filename",
[clip_ctx_p_ctypes, c_int, c_char_p],
POINTER(llava_image_embed),
)
def llava_image_embed_make_with_filename(
ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, /
) -> "_Pointer[llava_image_embed]": ...
# LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
# /** free an embedding made with llava_image_embed_make_* */
@ctypes_function("llava_image_embed_free", [POINTER(llava_image_embed)], None)
def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /):
...
def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /): ...
# /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
# LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
@ctypes_function("llava_eval_image_embed", [llama_cpp.llama_context_p_ctypes, POINTER(llava_image_embed), c_int, POINTER(c_int)], c_bool)
def llava_eval_image_embed(ctx_llama: llama_cpp.llama_context_p, embed: "_Pointer[llava_image_embed]", n_batch: Union[c_int, int], n_past: "_Pointer[c_int]", /) -> bool:
...
@ctypes_function(
"llava_eval_image_embed",
[
llama_cpp.llama_context_p_ctypes,
POINTER(llava_image_embed),
c_int,
POINTER(c_int),
],
c_bool,
)
def llava_eval_image_embed(
ctx_llama: llama_cpp.llama_context_p,
embed: "_Pointer[llava_image_embed]",
n_batch: Union[c_int, int],
n_past: "_Pointer[c_int]",
/,
) -> bool: ...
################################################
@ -161,11 +231,12 @@ def llava_eval_image_embed(ctx_llama: llama_cpp.llama_context_p, embed: "_Pointe
# /** load mmproj model */
# CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
@ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes)
def clip_model_load(fname: bytes, verbosity: Union[c_int, int], /) -> Optional[clip_ctx_p]:
...
def clip_model_load(
fname: bytes, verbosity: Union[c_int, int], /
) -> Optional[clip_ctx_p]: ...
# /** free mmproj model */
# CLIP_API void clip_free(struct clip_ctx * ctx);
@ctypes_function("clip_free", [clip_ctx_p_ctypes], None)
def clip_free(ctx: clip_ctx_p, /):
...
def clip_free(ctx: clip_ctx_p, /): ...
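# A rough end-to-end sketch (not part of this diff) of the low-level flow these
# declarations expose; paths are placeholders, ctx_llama is assumed to be a
# llama context created elsewhere, and error handling is omitted:
import ctypes
from llama_cpp.llava_cpp import (
    clip_model_load,
    clip_free,
    llava_image_embed_make_with_filename,
    llava_eval_image_embed,
    llava_image_embed_free,
)

ctx_clip = clip_model_load(b"/path/to/mmproj.gguf", 1)    # verbosity=1
embed = llava_image_embed_make_with_filename(ctx_clip, 4, b"/path/to/image.png")

n_past = ctypes.c_int(0)
# ctx_llama: an existing llama_cpp.llama_context_p created elsewhere
llava_eval_image_embed(ctx_llama, embed, 512, ctypes.byref(n_past))

llava_image_embed_free(embed)
clip_free(ctx_clip)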

View file

@ -72,9 +72,74 @@ class LlamaProxy:
chat_handler = None
if settings.chat_format == "llava-1-5":
assert settings.clip_model_path is not None, "clip model not found"
if settings.hf_model_repo_id is not None:
chat_handler = (
llama_cpp.llama_chat_format.Llava15ChatHandler.from_pretrained(
repo_id=settings.hf_model_repo_id,
filename=settings.clip_model_path,
verbose=settings.verbose,
)
)
else:
chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
elif settings.chat_format == "obsidian":
assert settings.clip_model_path is not None, "clip model not found"
if settings.hf_model_repo_id is not None:
chat_handler = (
llama_cpp.llama_chat_format.ObsidianChatHandler.from_pretrained(
repo_id=settings.hf_model_repo_id,
filename=settings.clip_model_path,
verbose=settings.verbose,
)
)
else:
chat_handler = llama_cpp.llama_chat_format.ObsidianChatHandler(
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
elif settings.chat_format == "llava-1-6":
assert settings.clip_model_path is not None, "clip model not found"
if settings.hf_model_repo_id is not None:
chat_handler = (
llama_cpp.llama_chat_format.Llava16ChatHandler.from_pretrained(
repo_id=settings.hf_model_repo_id,
filename=settings.clip_model_path,
verbose=settings.verbose,
)
)
else:
chat_handler = llama_cpp.llama_chat_format.Llava16ChatHandler(
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
elif settings.chat_format == "moondream":
assert settings.clip_model_path is not None, "clip model not found"
if settings.hf_model_repo_id is not None:
chat_handler = (
llama_cpp.llama_chat_format.MoondreamChatHandler.from_pretrained(
repo_id=settings.hf_model_repo_id,
filename=settings.clip_model_path,
verbose=settings.verbose,
)
)
else:
chat_handler = llama_cpp.llama_chat_format.MoondreamChatHandler(
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
elif settings.chat_format == "nanollava":
assert settings.clip_model_path is not None, "clip model not found"
if settings.hf_model_repo_id is not None:
chat_handler = (
llama_cpp.llama_chat_format.NanoLlavaChatHandler.from_pretrained(
repo_id=settings.hf_model_repo_id,
filename=settings.clip_model_path,
verbose=settings.verbose,
)
)
else:
chat_handler = llama_cpp.llama_chat_format.NanoLlavaChatHandler(
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
elif settings.chat_format == "hf-autotokenizer":
assert (
settings.hf_pretrained_model_name_or_path is not None
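# A sketch (not part of this diff) of selecting one of the multimodal formats
# handled above via ModelSettings; paths and the repo id are placeholders. When
# hf_model_repo_id is set, clip_model_path is used as the filename passed to
# from_pretrained, as in the dispatch above.
from llama_cpp.server.settings import ModelSettings

settings = ModelSettings(
    model="models/moondream2-text-model.gguf",   # placeholder path
    chat_format="moondream",
    clip_model_path="moondream2-mmproj.gguf",    # placeholder path / filename
    hf_model_repo_id="vikhyatk/moondream2",      # optional; placeholder repo id
)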

View file

@ -2,8 +2,10 @@ from __future__ import annotations
import multiprocessing
from typing import Optional, List, Literal, Union
from pydantic import Field, root_validator
from typing import Optional, List, Literal, Union, Dict, cast
from typing_extensions import Self
from pydantic import Field, model_validator
from pydantic_settings import BaseSettings
import llama_cpp
@ -94,6 +96,9 @@ class ModelSettings(BaseSettings):
offload_kqv: bool = Field(
default=True, description="Whether to offload kqv to the GPU."
)
flash_attn: bool = Field(
default=False, description="Whether to use flash attention."
)
# Sampling Params
last_n_tokens_size: int = Field(
default=64,
@ -173,15 +178,16 @@ class ModelSettings(BaseSettings):
default=True, description="Whether to print debug information."
)
@root_validator(pre=True) # pre=True to ensure this runs before any other validation
def set_dynamic_defaults(cls, values):
@model_validator(mode="before")  # mode="before" ensures this runs before any other validation
def set_dynamic_defaults(self) -> Self:
# If n_threads or n_threads_batch is -1, set it to multiprocessing.cpu_count()
cpu_count = multiprocessing.cpu_count()
values = cast(Dict[str, int], self)
if values.get('n_threads', 0) == -1:
values['n_threads'] = cpu_count
if values.get('n_threads_batch', 0) == -1:
values['n_threads_batch'] = cpu_count
return values
return self
class ServerSettings(BaseSettings):

View file

@ -6,7 +6,7 @@ from scipy.special import log_softmax
import llama_cpp
MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama.gguf"
MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama-spm.gguf"
def test_llama_cpp_tokenization():

vendor/llama.cpp (vendored, 2 changes)

@ -1 +1 @@
Subproject commit 4e96a812b3ce7322a29a3008db2ed73d9087b176
Subproject commit f364eb6fb5d46118a76fa045f487318de4c24961