This commit is contained in:
baalajimaestro 2024-04-05 10:38:53 +05:30
commit 8b9cd38c0d
Signed by: baalajimaestro
GPG key ID: F93C394FE9BBAFD5
18 changed files with 569 additions and 85 deletions

View file

@ -11,7 +11,7 @@ jobs:
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
strategy: strategy:
matrix: matrix:
os: [ubuntu-latest, windows-latest, macOS-latest] os: [ubuntu-20.04, windows-2019, macos-11]
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v3
@ -23,19 +23,19 @@ jobs:
with: with:
python-version: "3.8" python-version: "3.8"
- name: Install cibuildwheel
run: python -m pip install cibuildwheel==2.12.1
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip
python -m pip install -e .[all] python -m pip install -e .[all]
- name: Build wheels - name: Build wheels
run: python -m cibuildwheel --output-dir wheelhouse uses: pypa/cibuildwheel@v2.16.5
env: env:
# disable repair # disable repair
CIBW_REPAIR_WHEEL_COMMAND: "" CIBW_REPAIR_WHEEL_COMMAND: ""
with:
package-dir: .
output-dir: wheelhouse
- uses: actions/upload-artifact@v3 - uses: actions/upload-artifact@v3
with: with:

131
.github/workflows/build-wheels-cuda.yaml vendored Normal file
View file

@ -0,0 +1,131 @@
# Manually dispatched workflow: builds one wheel per OS / Python / CUDA
# combination and uploads the result to a release tagged <ref>-cu<version>.
name: Build Wheels (CUDA)

on: workflow_dispatch

# Write access to repository contents is granted to the workflow token;
# presumably required by the release-upload step at the end (confirm against
# the softprops/action-gh-release documentation).
permissions:
  contents: write

jobs:
  # Produces the build matrix as a JSON string so that build_wheels can expand
  # it with fromJSON().
  define_matrix:
    name: Define Build Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    defaults:
      run:
        shell: pwsh

    steps:
      - name: Define Job Output
        id: set-matrix
        run: |
          $matrix = @{
              'os' = @('ubuntu-20.04', 'windows-latest')
              'pyver' = @("3.10", "3.11", "3.12")
              'cuda' = @("12.1.1", "12.2.2", "12.3.2")
              'releasetag' = @("basic")
          }
          $matrixOut = ConvertTo-Json $matrix -Compress
          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT

  build_wheels:
    name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }}
    needs: define_matrix
    runs-on: ${{ matrix.os }}
    strategy:
      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
    defaults:
      run:
        shell: pwsh
    # Matrix values are exposed to the scripts below as environment variables.
    env:
      CUDAVER: ${{ matrix.cuda }}
      AVXVER: ${{ matrix.releasetag }}

    steps:
      - uses: actions/checkout@v4
        with:
          submodules: 'recursive'

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.pyver }}

      # Creates and activates the "build" environment; later steps read its
      # location from CONDA_PREFIX.
      - name: Setup Mamba
        uses: conda-incubator/setup-miniconda@v2.2.0
        with:
          activate-environment: 'build'
          python-version: ${{ matrix.pyver }}
          miniforge-variant: Mambaforge
          miniforge-version: latest
          use-mamba: true
          add-pip-as-python-dependency: true
          auto-activate-base: false

      # Windows only: the extracted MSBuildExtensions directory is cached per
      # CUDA version.
      - name: VS Integration Cache
        id: vs-integration-cache
        if: runner.os == 'Windows'
        uses: actions/cache@v3.3.2
        with:
          path: ./MSBuildExtensions
          key: cuda-${{ matrix.cuda }}-vs-integration

      # Windows only, and only on a cache miss: look up the installer URL for
      # this CUDA version in the pinned links file (12.1.1 is looked up as
      # 12.1.0), download it and extract just the MSBuildExtensions files.
      - name: Get Visual Studio Integration
        if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true'
        run: |
          if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER}
          $links = (Invoke-RestMethod 'https://github.com/Jimver/cuda-toolkit/raw/dc0ca7bb29c5a92f7a963d3d5c93f8d59765136a/src/links/windows-links.ts').Trim().split().where({$_ -ne ''})
          for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}}
          Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip'
          & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null
          Remove-Item 'cudainstaller.zip'

      # Windows only: copy the extension files into every BuildCustomizations
      # directory of the VS 2022 Enterprise install, then export
      # CUDA_PATH_V<major>_<minor> pointing at the conda environment.
      - name: Install Visual Studio Integration
        if: runner.os == 'Windows'
        run: |
          $y = (gi '.\MSBuildExtensions').fullname + '\*'
          (gi 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_})
          $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_')
          echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV

      - name: Install Dependencies
        env:
          MAMBA_DOWNLOAD_FAILFAST: '0'
          MAMBA_NO_LOW_SPEED_LIMIT: '1'
        run: |
          $cudaVersion = $env:CUDAVER
          mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion
          python -m pip install build wheel

      # $cudaVersion is CUDAVER with the patch component dropped and the dot
      # removed (e.g. 12.1.1 -> 121); it is exported as CUDA_VERSION for the
      # release step. CMAKE_ARGS is extended according to AVXVER.
      - name: Build Wheel
        run: |
          $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','')
          $env:CUDA_PATH = $env:CONDA_PREFIX
          $env:CUDA_HOME = $env:CONDA_PREFIX
          $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX
          if ($IsLinux) {
              $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH
          }
          $env:VERBOSE = '1'
          $env:CMAKE_ARGS = '-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all'
          $env:CMAKE_ARGS = "-DLLAMA_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS"
          if ($env:AVXVER -eq 'AVX') {
              $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
          }
          if ($env:AVXVER -eq 'AVX512') {
              $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX512=on'
          }
          if ($env:AVXVER -eq 'basic') {
              $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
          }
          python -m build --wheel
          # write the build tag to the output
          Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV

      - uses: softprops/action-gh-release@v1
        with:
          files: dist/*
          # Set tag_name to <tag>-cu<cuda_version>
          tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View file

@ -0,0 +1,87 @@
# Manually dispatched workflow: builds Metal-enabled macOS wheels for each
# runner image / Python version and uploads them to a release tagged
# <ref>-metal.
name: Build Wheels (Metal)

on: workflow_dispatch

# Write access to repository contents is granted to the workflow token;
# presumably required by the release-upload step at the end (confirm against
# the softprops/action-gh-release documentation).
permissions:
  contents: write

jobs:
  # Produces the build matrix as a JSON string so that build_wheels can expand
  # it with fromJSON().
  define_matrix:
    name: Define Build Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    defaults:
      run:
        shell: pwsh

    steps:
      - name: Define Job Output
        id: set-matrix
        run: |
          $matrix = @{
              'os' = @('macos-11', 'macos-12', 'macos-13')
              'pyver' = @('3.10', '3.11', '3.12')
          }
          $matrixOut = ConvertTo-Json $matrix -Compress
          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT

  build_wheels:
    name: ${{ matrix.os }} Python ${{ matrix.pyver }}
    needs: define_matrix
    runs-on: ${{ matrix.os }}
    strategy:
      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
    # The runner image name is exposed to the build script, which branches on it.
    env:
      OSVER: ${{ matrix.os }}

    steps:
      - uses: actions/checkout@v4
        with:
          submodules: 'recursive'

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.pyver }}

      - name: Install Dependencies
        run: |
          python -m pip install build wheel cmake

      # Builds an arm64 wheel, then an x86_64 wheel with AVX/AVX2/FMA/F16C
      # switched off. The deployment target follows OSVER; on macos-13 each
      # architecture is built a second time against the Xcode 15 MacOSX14.0
      # SDK with a 14.0 target. Between the two passes every wheel in dist/
      # is copied with arm64.whl renamed to aarch64.whl.
      - name: Build Wheel
        run: |
          XCODE15PATH="/Applications/Xcode_15.0.app/Contents/Developer"
          XCODE15BINPATH="${XCODE15PATH}/Toolchains/XcodeDefault.xctoolchain/usr/bin"
          export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_METAL=on"
          [[ "$OSVER" == "macos-13" ]] && export CC="${XCODE15BINPATH}/cc" && export CXX="${XCODE15BINPATH}/c++" && export MACOSX_DEPLOYMENT_TARGET="13.0"
          [[ "$OSVER" == "macos-12" ]] && export MACOSX_DEPLOYMENT_TARGET="12.0"
          [[ "$OSVER" == "macos-11" ]] && export MACOSX_DEPLOYMENT_TARGET="11.0"
          export CMAKE_OSX_ARCHITECTURES="arm64" && export ARCHFLAGS="-arch arm64"
          VERBOSE=1 python -m build --wheel
          if [[ "$OSVER" == "macos-13" ]]; then
              export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
              export MACOSX_DEPLOYMENT_TARGET="14.0"
              VERBOSE=1 python -m build --wheel
          fi
          for file in ./dist/*.whl; do cp "$file" "${file/arm64.whl/aarch64.whl}"; done
          export CMAKE_OSX_ARCHITECTURES="x86_64" && export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_METAL=on" && export ARCHFLAGS="-arch x86_64"
          VERBOSE=1 python -m build --wheel
          if [[ "$OSVER" == "macos-13" ]]; then
              export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
              export MACOSX_DEPLOYMENT_TARGET="14.0"
              VERBOSE=1 python -m build --wheel
          fi

      - uses: softprops/action-gh-release@v1
        with:
          files: dist/*
          # set release name to <tag>-metal
          tag_name: ${{ github.ref_name }}-metal
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View file

@ -0,0 +1,48 @@
# Generates the package index directories for the pre-built wheels (one per
# release-tag flavour) and deploys them to GitHub Pages.
name: Wheels Index

on:
  # Trigger on any new release
  release:
    types: [published]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
permissions:
  contents: read
  pages: write
  id-token: write

# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
concurrency:
  group: "pages"
  cancel-in-progress: false

jobs:
  # Single deploy job since we're just deploying
  deploy:
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup Pages
        uses: actions/configure-pages@v4
      # One index directory per wheel flavour; the second argument is the
      # regex handed to the script for selecting release tags. The CUDA
      # entries must cover every CUDA version the CUDA wheel workflow
      # publishes (tags end in -cu121, -cu122, -cu123), otherwise the
      # corresponding whl/<cuda-version> index URL has nothing behind it.
      - name: Build
        run: |
          ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$'
          ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$'
          ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$'
          ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$'
          ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$'
      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          # Upload only the generated index directory
          path: 'index'
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4

View file

@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ## [Unreleased]
## [0.2.59]
- feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c
- feat: Binary wheels for CPU, CUDA (12.1 - 12.3), Metal by @abetlen, @jllllll, and @oobabooga in #1247
- fix: segfault when logits_all=False by @abetlen in 8649d7671bd1a7c0d9cc6a5ad91c6ca286512ab3
- fix: last tokens passing to sample_repetition_penalties function by @ymikhailov in #1295
## [0.2.58]
- feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c
- feat: add support for KV cache quantization options by @Limour-dev in #1307
- feat: Add logprobs support to chat completions by @windspirit95 in #1311
- fix: set LLAMA_METAL_EMBED_LIBRARY=on on MacOS arm64 by @bretello in #1289
- feat: Add tools/functions variables to Jinja2ChatFormatter, add function response formatting for all simple chat formats by @CISC in #1273
- fix: Changed local API doc references to hosted by @lawfordp2017 in #1317
## [0.2.57] ## [0.2.57]
- feat: Update llama.cpp to ggerganov/llama.cpp@ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1 - feat: Update llama.cpp to ggerganov/llama.cpp@ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1
@ -24,7 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [0.2.55] ## [0.2.55]
- feat: Update llama.cpp to ggerganov/9731134296af3a6839cd682e51d9c2109a871de5 - feat: Update llama.cpp to ggerganov/llama.cpp@9731134296af3a6839cd682e51d9c2109a871de5
- docs: fix small typo in README: 'model know how' -> 'model knows how' by @boegel in #1244 - docs: fix small typo in README: 'model know how' -> 'model knows how' by @boegel in #1244
## [0.2.54] ## [0.2.54]

View file

@ -6,6 +6,7 @@
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
[![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
[![Github All Releases](https://img.shields.io/github/downloads/abetlen/llama-cpp-python/total.svg?label=Github%20Downloads)]()
Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library. Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library.
This package provides: This package provides:
@ -43,6 +44,15 @@ This will also build `llama.cpp` from source and install it alongside this pytho
If this fails, add `--verbose` to the `pip install` see the full cmake build log. If this fails, add `--verbose` to the `pip install` see the full cmake build log.
**Pre-built Wheel (New)**
It is also possible to install a pre-built wheel with basic CPU support.
```bash
pip install llama-cpp-python \
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
```
### Installation Configuration ### Installation Configuration
`llama.cpp` supports a number of hardware acceleration backends to speed up inference as well as backend specific options. See the [llama.cpp README](https://github.com/ggerganov/llama.cpp#build) for a full list. `llama.cpp` supports a number of hardware acceleration backends to speed up inference as well as backend specific options. See the [llama.cpp README](https://github.com/ggerganov/llama.cpp#build) for a full list.
@ -99,12 +109,36 @@ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-
</details> </details>
<details> <details>
<summary>cuBLAS (CUDA)</summary> <summary>CUDA</summary>
To install with cuBLAS, set the `LLAMA_CUBLAS=on` environment variable before installing: To install with CUDA support, set the `LLAMA_CUDA=on` environment variable before installing:
```bash ```bash
CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python
```
**Pre-built Wheel (New)**
It is also possible to install a pre-built wheel with CUDA support, as long as your system meets the following requirements:
- CUDA Version is 12.1, 12.2 or 12.3
- Python Version is 3.10, 3.11 or 3.12
```bash
pip install llama-cpp-python \
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/<cuda-version>
```
Where `<cuda-version>` is one of the following:
- `cu121`: CUDA 12.1
- `cu122`: CUDA 12.2
- `cu123`: CUDA 12.3
For example, to install the CUDA 12.1 wheel:
```bash
pip install llama-cpp-python \
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
``` ```
</details> </details>
@ -118,6 +152,18 @@ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable befor
CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python
``` ```
**Pre-built Wheel (New)**
It is also possible to install a pre-built wheel with Metal support, as long as your system meets the following requirements:
- MacOS Version is 11.0 or later
- Python Version is 3.10, 3.11 or 3.12
```bash
pip install llama-cpp-python \
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
```
</details> </details>
<details> <details>
@ -321,7 +367,7 @@ For OpenAI API v1 compatibility, you use the [`create_chat_completion_openai_v1`
### JSON and JSON Schema Mode ### JSON and JSON Schema Mode
To constrain chat responses to only valid JSON or a specific JSON Schema use the `response_format` argument in [`create_chat_completion`](http://localhost:8000/api-reference/#llama_cpp.Llama.create_chat_completion). To constrain chat responses to only valid JSON or a specific JSON Schema use the `response_format` argument in [`create_chat_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
#### JSON Mode #### JSON Mode
@ -529,7 +575,7 @@ llama = Llama(
### Embeddings ### Embeddings
To generate text embeddings use [`create_embedding`](http://localhost:8000/api-reference/#llama_cpp.Llama.create_embedding). To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding).
```python ```python
import llama_cpp import llama_cpp
@ -568,7 +614,7 @@ python3 -m llama_cpp.server --model models/7B/llama-model.gguf
Similar to Hardware Acceleration section above, you can also install with GPU (cuBLAS) support like this: Similar to Hardware Acceleration section above, you can also install with GPU (cuBLAS) support like this:
```bash ```bash
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]' CMAKE_ARGS="-DLLAMA_CUDA=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]'
python3 -m llama_cpp.server --model models/7B/llama-model.gguf --n_gpu_layers 35 python3 -m llama_cpp.server --model models/7B/llama-model.gguf --n_gpu_layers 35
``` ```

View file

@ -1,4 +1,4 @@
from .llama_cpp import * from .llama_cpp import *
from .llama import * from .llama import *
__version__ = "0.2.57" __version__ = "0.2.59"

View file

@ -730,12 +730,14 @@ class _LlamaSamplingContext:
if len(self.prev) > 0: if len(self.prev) > 0:
nl_token = ctx_main.model.token_nl() nl_token = ctx_main.model.token_nl()
nl_logit = logits_array[nl_token] nl_logit = logits_array[nl_token]
if self.params.penalty_last_n > 0: last_tokens = self.prev[-self.params.penalty_last_n:]
last_tokens_size = min(len(last_tokens), self.params.penalty_last_n)
if last_tokens_size > 0:
last_tokens_p = (llama_cpp.llama_token * len(last_tokens))(*last_tokens)
ctx_main.sample_repetition_penalties( ctx_main.sample_repetition_penalties(
token_data_array, token_data_array,
# TODO: Only create this once last_tokens_p,
(llama_cpp.llama_token * len(self.prev))(*self.prev), last_tokens_size,
self.params.penalty_last_n,
self.params.penalty_repeat, self.params.penalty_repeat,
self.params.penalty_freq, self.params.penalty_freq,
self.params.penalty_present, self.params.penalty_present,

View file

@ -105,6 +105,9 @@ class Llama:
draft_model: Optional[LlamaDraftModel] = None, draft_model: Optional[LlamaDraftModel] = None,
# Tokenizer Override # Tokenizer Override
tokenizer: Optional[BaseLlamaTokenizer] = None, tokenizer: Optional[BaseLlamaTokenizer] = None,
# KV cache quantization
type_k: Optional[int] = None,
type_v: Optional[int] = None,
# Misc # Misc
verbose: bool = True, verbose: bool = True,
# Extra Params # Extra Params
@ -172,6 +175,8 @@ class Llama:
draft_model: Optional draft model to use for speculative decoding. draft_model: Optional draft model to use for speculative decoding.
tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp. tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp.
verbose: Print verbose output to stderr. verbose: Print verbose output to stderr.
type_k: KV cache data type for K (default: f16)
type_v: KV cache data type for V (default: f16)
Raises: Raises:
ValueError: If the model path does not exist. ValueError: If the model path does not exist.
@ -298,7 +303,11 @@ class Llama:
) # Must be set to True for speculative decoding ) # Must be set to True for speculative decoding
self.context_params.embeddings = embedding # TODO: Rename to embeddings self.context_params.embeddings = embedding # TODO: Rename to embeddings
self.context_params.offload_kqv = offload_kqv self.context_params.offload_kqv = offload_kqv
# KV cache quantization
if type_k is not None:
self.context_params.type_k = type_k
if type_v is not None:
self.context_params.type_v = type_v
# Sampling Params # Sampling Params
self.last_n_tokens_size = last_n_tokens_size self.last_n_tokens_size = last_n_tokens_size
@ -526,14 +535,16 @@ class Llama:
# Save tokens # Save tokens
self.input_ids[n_past : n_past + n_tokens] = batch self.input_ids[n_past : n_past + n_tokens] = batch
# Save logits # Save logits
rows = n_tokens if self.context_params.logits_all:
cols = self._n_vocab rows = n_tokens
offset = ( cols = self._n_vocab
0 if self.context_params.logits_all else n_tokens - 1 logits = self._ctx.get_logits()[: rows * cols]
) # NOTE: Only save the last token logits if logits_all is False self.scores[n_past : n_past + n_tokens, :].reshape(-1)[: :] = logits
self.scores[n_past + offset : n_past + n_tokens, :].reshape(-1)[ else:
: rows = 1
] = self._ctx.get_logits()[offset * cols : rows * cols] cols = self._n_vocab
logits = self._ctx.get_logits()[: rows * cols]
self.scores[n_past + n_tokens - 1, :].reshape(-1)[: :] = logits
# Update n_tokens # Update n_tokens
self.n_tokens += n_tokens self.n_tokens += n_tokens
@ -1653,6 +1664,7 @@ class Llama:
top_k=top_k, top_k=top_k,
min_p=min_p, min_p=min_p,
typical_p=typical_p, typical_p=typical_p,
logprobs=top_logprobs if logprobs else None,
stream=stream, stream=stream,
stop=stop, stop=stop,
seed=seed, seed=seed,
@ -1723,6 +1735,7 @@ class Llama:
n_threads=self.context_params.n_threads, n_threads=self.context_params.n_threads,
n_threads_batch=self.context_params.n_threads_batch, n_threads_batch=self.context_params.n_threads_batch,
rope_scaling_type=self.context_params.rope_scaling_type, rope_scaling_type=self.context_params.rope_scaling_type,
pooling_type=self.context_params.pooling_type,
rope_freq_base=self.context_params.rope_freq_base, rope_freq_base=self.context_params.rope_freq_base,
rope_freq_scale=self.context_params.rope_freq_scale, rope_freq_scale=self.context_params.rope_freq_scale,
yarn_ext_factor=self.context_params.yarn_ext_factor, yarn_ext_factor=self.context_params.yarn_ext_factor,
@ -1732,6 +1745,7 @@ class Llama:
yarn_orig_ctx=self.context_params.yarn_orig_ctx, yarn_orig_ctx=self.context_params.yarn_orig_ctx,
logits_all=self.context_params.logits_all, logits_all=self.context_params.logits_all,
embedding=self.context_params.embeddings, embedding=self.context_params.embeddings,
offload_kqv=self.context_params.offload_kqv,
# Sampling Params # Sampling Params
last_n_tokens_size=self.last_n_tokens_size, last_n_tokens_size=self.last_n_tokens_size,
# LoRA Params # LoRA Params
@ -1743,51 +1757,17 @@ class Llama:
# Chat Format Params # Chat Format Params
chat_format=self.chat_format, chat_format=self.chat_format,
chat_handler=self.chat_handler, chat_handler=self.chat_handler,
# Speculative Decoding
draft_model=self.draft_model,
# KV cache quantization
type_k=self.context_params.type_k,
type_v=self.context_params.type_v,
# Misc # Misc
verbose=self.verbose, verbose=self.verbose,
) )
def __setstate__(self, state): def __setstate__(self, state):
self.__init__( self.__init__(**state)
model_path=state["model_path"],
# Model Params
n_gpu_layers=state["n_gpu_layers"],
split_mode=state["split_mode"],
main_gpu=state["main_gpu"],
tensor_split=state["tensor_split"],
vocab_only=state["vocab_only"],
use_mmap=state["use_mmap"],
use_mlock=state["use_mlock"],
kv_overrides=state["kv_overrides"],
# Context Params
seed=state["seed"],
n_ctx=state["n_ctx"],
n_batch=state["n_batch"],
n_threads=state["n_threads"],
n_threads_batch=state["n_threads_batch"],
rope_freq_base=state["rope_freq_base"],
rope_freq_scale=state["rope_freq_scale"],
rope_scaling_type=state["rope_scaling_type"],
yarn_ext_factor=state["yarn_ext_factor"],
yarn_attn_factor=state["yarn_attn_factor"],
yarn_beta_fast=state["yarn_beta_fast"],
yarn_beta_slow=state["yarn_beta_slow"],
yarn_orig_ctx=state["yarn_orig_ctx"],
logits_all=state["logits_all"],
embedding=state["embedding"],
# Sampling Params
last_n_tokens_size=state["last_n_tokens_size"],
# LoRA Params
lora_base=state["lora_base"],
lora_path=state["lora_path"],
# Backend Params
numa=state["numa"],
# Chat Format Params
chat_format=state["chat_format"],
chat_handler=state["chat_handler"],
# Misc
verbose=state["verbose"],
)
def save_state(self) -> LlamaState: def save_state(self) -> LlamaState:
assert self._ctx.ctx is not None assert self._ctx.ctx is not None

View file

@ -231,6 +231,7 @@ def _convert_text_completion_to_chat(
"role": "assistant", "role": "assistant",
"content": completion["choices"][0]["text"], "content": completion["choices"][0]["text"],
}, },
"logprobs": completion["choices"][0]["logprobs"],
"finish_reason": completion["choices"][0]["finish_reason"], "finish_reason": completion["choices"][0]["finish_reason"],
} }
], ],
@ -254,6 +255,7 @@ def _convert_text_completion_chunks_to_chat(
"delta": { "delta": {
"role": "assistant", "role": "assistant",
}, },
"logprobs": None,
"finish_reason": None, "finish_reason": None,
} }
], ],
@ -273,6 +275,7 @@ def _convert_text_completion_chunks_to_chat(
if chunk["choices"][0]["finish_reason"] is None if chunk["choices"][0]["finish_reason"] is None
else {} else {}
), ),
"logprobs": chunk["choices"][0]["logprobs"],
"finish_reason": chunk["choices"][0]["finish_reason"], "finish_reason": chunk["choices"][0]["finish_reason"],
} }
], ],
@ -487,6 +490,7 @@ def chat_formatter_to_chat_completion_handler(
temperature: float = 0.2, temperature: float = 0.2,
top_p: float = 0.95, top_p: float = 0.95,
top_k: int = 40, top_k: int = 40,
logprobs: int = 0,
min_p: float = 0.05, min_p: float = 0.05,
typical_p: float = 1.0, typical_p: float = 1.0,
stream: bool = False, stream: bool = False,
@ -576,6 +580,7 @@ def chat_formatter_to_chat_completion_handler(
top_k=top_k, top_k=top_k,
min_p=min_p, min_p=min_p,
typical_p=typical_p, typical_p=typical_p,
logprobs=logprobs,
stream=stream, stream=stream,
stop=stop, stop=stop,
seed=seed, seed=seed,

View file

@ -141,6 +141,70 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
byref = ctypes.byref # type: ignore byref = ctypes.byref # type: ignore
# from ggml.h
# // NOTE: always add types at the end of the enum to keep backward compatibility
# enum ggml_type {
# GGML_TYPE_F32 = 0,
# GGML_TYPE_F16 = 1,
# GGML_TYPE_Q4_0 = 2,
# GGML_TYPE_Q4_1 = 3,
# // GGML_TYPE_Q4_2 = 4, support has been removed
# // GGML_TYPE_Q4_3 = 5, support has been removed
# GGML_TYPE_Q5_0 = 6,
# GGML_TYPE_Q5_1 = 7,
# GGML_TYPE_Q8_0 = 8,
# GGML_TYPE_Q8_1 = 9,
# GGML_TYPE_Q2_K = 10,
# GGML_TYPE_Q3_K = 11,
# GGML_TYPE_Q4_K = 12,
# GGML_TYPE_Q5_K = 13,
# GGML_TYPE_Q6_K = 14,
# GGML_TYPE_Q8_K = 15,
# GGML_TYPE_IQ2_XXS = 16,
# GGML_TYPE_IQ2_XS = 17,
# GGML_TYPE_IQ3_XXS = 18,
# GGML_TYPE_IQ1_S = 19,
# GGML_TYPE_IQ4_NL = 20,
# GGML_TYPE_IQ3_S = 21,
# GGML_TYPE_IQ2_S = 22,
# GGML_TYPE_IQ4_XS = 23,
# GGML_TYPE_I8 = 24,
# GGML_TYPE_I16 = 25,
# GGML_TYPE_I32 = 26,
# GGML_TYPE_I64 = 27,
# GGML_TYPE_F64 = 28,
# GGML_TYPE_IQ1_M = 29,
# GGML_TYPE_COUNT,
# };
GGML_TYPE_F32 = 0
GGML_TYPE_F16 = 1
GGML_TYPE_Q4_0 = 2
GGML_TYPE_Q4_1 = 3
GGML_TYPE_Q5_0 = 6
GGML_TYPE_Q5_1 = 7
GGML_TYPE_Q8_0 = 8
GGML_TYPE_Q8_1 = 9
GGML_TYPE_Q2_K = 10
GGML_TYPE_Q3_K = 11
GGML_TYPE_Q4_K = 12
GGML_TYPE_Q5_K = 13
GGML_TYPE_Q6_K = 14
GGML_TYPE_Q8_K = 15
GGML_TYPE_IQ2_XXS = 16
GGML_TYPE_IQ2_XS = 17
GGML_TYPE_IQ3_XXS = 18
GGML_TYPE_IQ1_S = 19
GGML_TYPE_IQ4_NL = 20
GGML_TYPE_IQ3_S = 21
GGML_TYPE_IQ2_S = 22
GGML_TYPE_IQ4_XS = 23
GGML_TYPE_I8 = 24
GGML_TYPE_I16 = 25
GGML_TYPE_I32 = 26
GGML_TYPE_I64 = 27
GGML_TYPE_F64 = 28
GGML_TYPE_IQ1_M = 29
GGML_TYPE_COUNT = 30
# from ggml-backend.h # from ggml-backend.h
# typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
@ -175,8 +239,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E
# define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
# define LLAMA_SESSION_VERSION 4 # define LLAMA_SESSION_VERSION 5
LLAMA_SESSION_VERSION = 4 LLAMA_SESSION_VERSION = 5
# struct llama_model; # struct llama_model;
@ -199,14 +263,18 @@ llama_seq_id = ctypes.c_int32
# enum llama_vocab_type { # enum llama_vocab_type {
# LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab # LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
# LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece # LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
# LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding # LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
# LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece # LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
# }; # };
LLAMA_VOCAB_TYPE_NONE = 0 LLAMA_VOCAB_TYPE_NONE = 0
"""For models without vocab"""
LLAMA_VOCAB_TYPE_SPM = 1 LLAMA_VOCAB_TYPE_SPM = 1
"""LLaMA tokenizer based on byte-level BPE with byte fallback"""
LLAMA_VOCAB_TYPE_BPE = 2 LLAMA_VOCAB_TYPE_BPE = 2
"""GPT-2 tokenizer based on byte-level BPE"""
LLAMA_VOCAB_TYPE_WPM = 3 LLAMA_VOCAB_TYPE_WPM = 3
"""BERT tokenizer based on WordPiece"""
# // note: these values should be synchronized with ggml_rope # // note: these values should be synchronized with ggml_rope
@ -274,6 +342,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
# LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors # LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors # LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors # LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
# LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
# }; # };
@ -677,6 +746,7 @@ It might not exist for progress report where '.' is output repeatedly."""
# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
# bool pure; // quantize all tensors to the default type # bool pure; // quantize all tensors to the default type
# void * imatrix; // pointer to importance matrix data # void * imatrix; // pointer to importance matrix data
# void * kv_overrides; // pointer to vector containing overrides
# } llama_model_quantize_params; # } llama_model_quantize_params;
class llama_model_quantize_params(ctypes.Structure): class llama_model_quantize_params(ctypes.Structure):
"""Parameters for llama_model_quantize """Parameters for llama_model_quantize
@ -691,6 +761,7 @@ class llama_model_quantize_params(ctypes.Structure):
only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
pure (bool): quantize all tensors to the default type pure (bool): quantize all tensors to the default type
imatrix (ctypes.c_void_p): pointer to importance matrix data imatrix (ctypes.c_void_p): pointer to importance matrix data
kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
""" """
_fields_ = [ _fields_ = [
@ -703,6 +774,7 @@ class llama_model_quantize_params(ctypes.Structure):
("only_copy", ctypes.c_bool), ("only_copy", ctypes.c_bool),
("pure", ctypes.c_bool), ("pure", ctypes.c_bool),
("imatrix", ctypes.c_void_p), ("imatrix", ctypes.c_void_p),
("kv_overrides", ctypes.c_void_p),
] ]
@ -1838,9 +1910,9 @@ def llama_synchronize(ctx: llama_context_p, /):
# // Token logits obtained from the last call to llama_decode() # // Token logits obtained from the last call to llama_decode()
# // The logits for the last token are stored in the last row # // The logits for which llama_batch.logits[i] != 0 are stored contiguously
# // Logits for which llama_batch.logits[i] == 0 are undefined # // in the order they have appeared in the batch.
# // Rows: n_tokens provided with llama_batch # // Rows: number of tokens for which llama_batch.logits[i] != 0
# // Cols: n_vocab # // Cols: n_vocab
# LLAMA_API float * llama_get_logits(struct llama_context * ctx); # LLAMA_API float * llama_get_logits(struct llama_context * ctx);
@ctypes_function( @ctypes_function(
@ -1859,7 +1931,8 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]:
# // Logits for the ith token. Equivalent to: # // Logits for the ith token. Equivalent to:
# // llama_get_logits(ctx) + i*n_vocab # // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
# // returns NULL for invalid ids.
# LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i); # LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
@ctypes_function( @ctypes_function(
"llama_get_logits_ith", "llama_get_logits_ith",
@ -1874,8 +1947,12 @@ def llama_get_logits_ith(
... ...
# // Get all output token embeddings # // Get all output token embeddings.
# // shape: [n_tokens*n_embd] (1-dimensional) # // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
# // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
# // in the order they have appeared in the batch.
# // shape: [n_outputs*n_embd]
# // Otherwise, returns NULL.
# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
@ctypes_function( @ctypes_function(
"llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
@ -1886,9 +1963,10 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]
... ...
# // Get the embeddings for the ith token # // Get the embeddings for the ith token. Equivalent to:
# // llama_get_embeddings(ctx) + i*n_embd # // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
# // shape: [n_embd] (1-dimensional) # // shape: [n_embd] (1-dimensional)
# // returns NULL for invalid ids.
# LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); # LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
@ctypes_function( @ctypes_function(
"llama_get_embeddings_ith", "llama_get_embeddings_ith",

View file

@ -84,6 +84,7 @@ class ChatCompletionFunction(TypedDict):
class ChatCompletionResponseChoice(TypedDict): class ChatCompletionResponseChoice(TypedDict):
index: int index: int
message: "ChatCompletionResponseMessage" message: "ChatCompletionResponseMessage"
logprobs: Optional[CompletionLogprobs]
finish_reason: Optional[str] finish_reason: Optional[str]

View file

@ -405,6 +405,18 @@ async def create_chat_completion(
} }
}, },
}, },
"logprobs": {
"summary": "Logprobs",
"value": {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is the capital of France?"},
],
"logprobs": True,
"top_logprobs": 10
},
},
} }
), ),
llama_proxy: LlamaProxy = Depends(get_llama_proxy), llama_proxy: LlamaProxy = Depends(get_llama_proxy),
@ -493,7 +505,7 @@ async def tokenize(
) -> TokenizeInputResponse: ) -> TokenizeInputResponse:
tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
return {"tokens": tokens} return TokenizeInputResponse(tokens=tokens)
@router.post( @router.post(
@ -508,7 +520,7 @@ async def count_query_tokens(
) -> TokenizeInputCountResponse: ) -> TokenizeInputCountResponse:
tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
return {"count": len(tokens)} return TokenizeInputCountResponse(count=len(tokens))
@router.post( @router.post(
@ -523,4 +535,4 @@ async def detokenize(
) -> DetokenizeInputResponse: ) -> DetokenizeInputResponse:
text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8") text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8")
return {"text": text} return DetokenizeInputResponse(text=text)

View file

@ -175,6 +175,9 @@ class LlamaProxy:
chat_handler=chat_handler, chat_handler=chat_handler,
# Speculative Decoding # Speculative Decoding
draft_model=draft_model, draft_model=draft_model,
# KV Cache Quantization
type_k=settings.type_k,
type_v=settings.type_v,
# Tokenizer # Tokenizer
tokenizer=tokenizer, tokenizer=tokenizer,
# Misc # Misc

View file

@ -159,6 +159,15 @@ class ModelSettings(BaseSettings):
default=10, default=10,
description="Number of tokens to predict using the draft model.", description="Number of tokens to predict using the draft model.",
) )
# KV Cache Quantization
type_k: Optional[int] = Field(
default=None,
description="Type of the key cache quantization.",
)
type_v: Optional[int] = Field(
default=None,
description="Type of the value cache quantization.",
)
# Misc # Misc
verbose: bool = Field( verbose: bool = Field(
default=True, description="Whether to print debug information." default=True, description="Whether to print debug information."

View file

@ -130,7 +130,6 @@ class CreateCompletionRequest(BaseModel):
presence_penalty: Optional[float] = presence_penalty_field presence_penalty: Optional[float] = presence_penalty_field
frequency_penalty: Optional[float] = frequency_penalty_field frequency_penalty: Optional[float] = frequency_penalty_field
logit_bias: Optional[Dict[str, float]] = Field(None) logit_bias: Optional[Dict[str, float]] = Field(None)
logprobs: Optional[int] = Field(None)
seed: Optional[int] = Field(None) seed: Optional[int] = Field(None)
# ignored or currently unsupported # ignored or currently unsupported
@ -209,6 +208,15 @@ class CreateChatCompletionRequest(BaseModel):
default=None, default=None,
description="The maximum number of tokens to generate. Defaults to inf", description="The maximum number of tokens to generate. Defaults to inf",
) )
# Opt-in switch for returning token log-probabilities with the chat completion.
logprobs: Optional[bool] = Field(
    default=False,
    description="Whether to output the logprobs or not. Default is False",
)
# Number of alternatives reported per token; validated as non-negative (ge=0).
top_logprobs: Optional[int] = Field(
    default=None,
    ge=0,
    description="The number of logprobs to generate. If None, no logprobs are generated. logprobs must be set to True.",
)
temperature: float = temperature_field temperature: float = temperature_field
top_p: float = top_p_field top_p: float = top_p_field
min_p: float = min_p_field min_p: float = min_p_field
@ -268,7 +276,7 @@ class ModelList(TypedDict):
class TokenizeInputRequest(BaseModel): class TokenizeInputRequest(BaseModel):
model: Optional[str] = model_field model: Optional[str] = model_field
input: Optional[str] = Field(description="The input to tokenize.") input: str = Field(description="The input to tokenize.")
model_config = { model_config = {
"json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]} "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]}

58
scripts/releases-to-pep-503.sh Executable file
View file

@ -0,0 +1,58 @@
#!/bin/bash
# Generate a PEP 503 style "simple" package index for llama-cpp-python from
# the wheel assets attached to the project's GitHub releases.
#
# Usage: releases-to-pep-503.sh [output_dir] [tag_pattern]
#   output_dir   directory the index is written to (default: index/whl/cpu)
#   tag_pattern  extended regex a release tag must match to be listed
#                (default: plain version tags such as v0.2.59)
#
# Requires curl and jq on PATH.

# Abort on the first failing command, unset variable or broken pipeline so a
# failed `cd` or API call can never leave a half-written index in the wrong
# directory.
set -euo pipefail

releases_api="https://api.github.com/repos/abetlen/llama-cpp-python/releases"

# Get output directory or default to index/whl/cpu
output_dir=${1:-"index/whl/cpu"}
# Get pattern from second arg or default to valid python package version pattern
pattern=${2:-"^[v]?[0-9]+\.[0-9]+\.[0-9]+$"}

# Create output directory and change into it
mkdir -p "$output_dir"
cd "$output_dir"

# Create the root index html file, which links to the single project page
{
    echo "<!DOCTYPE html>"
    echo "<html>"
    echo " <head></head>"
    echo " <body>"
    echo " <a href=\"llama-cpp-python/\">llama-cpp-python</a>"
    echo " <br>"
    echo " </body>"
    echo "</html>"
    echo ""
} > index.html

# Create llama-cpp-python directory and change into it
mkdir -p llama-cpp-python
cd llama-cpp-python

# Start the project index html file
{
    echo "<!DOCTYPE html>"
    echo "<html>"
    echo " <body>"
    echo " <h1>Links for llama-cpp-python</h1>"
} > index.html

# Get all release tags. -f makes curl fail on HTTP errors (e.g. rate limiting)
# instead of handing an error document to jq. per_page=100 raises the API's
# default page size of 30; releases beyond the first 100 are still not listed.
releases=$(curl -fsSL "${releases_api}?per_page=100" | jq -r '.[].tag_name')

# Filter releases by pattern. grep exits non-zero when nothing matches, which
# is a valid (empty) result here rather than an error.
releases=$(grep -E -- "$pattern" <<< "$releases" || true)

# For each release, list every wheel asset as a download link
while IFS= read -r release; do
    # An empty filter result still yields one blank line from the here-string
    [[ -n $release ]] || continue

    assets=$(curl -fsSL "${releases_api}/tags/${release}" | jq -r '.assets[].browser_download_url')
    echo " <h2>$release</h2>" >> index.html

    while IFS= read -r asset; do
        if [[ $asset == *.whl ]]; then
            echo " <a href=\"$asset\">$asset</a>" >> index.html
            echo " <br>" >> index.html
        fi
    done <<< "$assets"
done <<< "$releases"

# Close the project index html file
{
    echo " </body>"
    echo "</html>"
    echo ""
} >> index.html

2
vendor/llama.cpp vendored

@ -1 +1 @@
Subproject commit 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652 Subproject commit 60cdf40cc32f0ad4cb11e0ca8fd38f3b93d8d640