commit 8b9cd38c0d

18 changed files with 569 additions and 85 deletions
.github/workflows/build-and-release.yaml (vendored, 10 changes)

@@ -11,7 +11,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, windows-latest, macOS-latest]
+        os: [ubuntu-20.04, windows-2019, macos-11]

     steps:
       - uses: actions/checkout@v3
@@ -23,19 +23,19 @@ jobs:
         with:
           python-version: "3.8"

-      - name: Install cibuildwheel
-        run: python -m pip install cibuildwheel==2.12.1
-
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           python -m pip install -e .[all]

       - name: Build wheels
-        run: python -m cibuildwheel --output-dir wheelhouse
+        uses: pypa/cibuildwheel@v2.16.5
         env:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""
+        with:
+          package-dir: .
+          output-dir: wheelhouse

       - uses: actions/upload-artifact@v3
         with:
.github/workflows/build-wheels-cuda.yaml (vendored, new file, 131 lines)

@@ -0,0 +1,131 @@
name: Build Wheels (CUDA)

on: workflow_dispatch

permissions:
  contents: write

jobs:
  define_matrix:
    name: Define Build Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    defaults:
      run:
        shell: pwsh

    steps:
      - name: Define Job Output
        id: set-matrix
        run: |
          $matrix = @{
              'os' = @('ubuntu-20.04', 'windows-latest')
              'pyver' = @("3.10", "3.11", "3.12")
              'cuda' = @("12.1.1", "12.2.2", "12.3.2")
              'releasetag' = @("basic")
          }

          $matrixOut = ConvertTo-Json $matrix -Compress
          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT

  build_wheels:
    name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }}
    needs: define_matrix
    runs-on: ${{ matrix.os }}
    strategy:
      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
    defaults:
      run:
        shell: pwsh
    env:
      CUDAVER: ${{ matrix.cuda }}
      AVXVER: ${{ matrix.releasetag }}

    steps:
      - uses: actions/checkout@v4
        with:
          submodules: "recursive"

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.pyver }}

      - name: Setup Mamba
        uses: conda-incubator/setup-miniconda@v2.2.0
        with:
          activate-environment: "build"
          python-version: ${{ matrix.pyver }}
          miniforge-variant: Mambaforge
          miniforge-version: latest
          use-mamba: true
          add-pip-as-python-dependency: true
          auto-activate-base: false

      - name: VS Integration Cache
        id: vs-integration-cache
        if: runner.os == 'Windows'
        uses: actions/cache@v3.3.2
        with:
          path: ./MSBuildExtensions
          key: cuda-${{ matrix.cuda }}-vs-integration

      - name: Get Visual Studio Integration
        if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true'
        run: |
          if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER}
          $links = (Invoke-RestMethod 'https://github.com/Jimver/cuda-toolkit/raw/dc0ca7bb29c5a92f7a963d3d5c93f8d59765136a/src/links/windows-links.ts').Trim().split().where({$_ -ne ''})
          for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}}
          Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip'
          & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null
          Remove-Item 'cudainstaller.zip'

      - name: Install Visual Studio Integration
        if: runner.os == 'Windows'
        run: |
          $y = (gi '.\MSBuildExtensions').fullname + '\*'
          (gi 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_})
          $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_')
          echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV

      - name: Install Dependencies
        env:
          MAMBA_DOWNLOAD_FAILFAST: "0"
          MAMBA_NO_LOW_SPEED_LIMIT: "1"
        run: |
          $cudaVersion = $env:CUDAVER
          mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion
          python -m pip install build wheel

      - name: Build Wheel
        run: |
          $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','')
          $env:CUDA_PATH = $env:CONDA_PREFIX
          $env:CUDA_HOME = $env:CONDA_PREFIX
          $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX
          if ($IsLinux) {
            $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH
          }
          $env:VERBOSE = '1'
          $env:CMAKE_ARGS = '-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all'
          $env:CMAKE_ARGS = "-DLLAMA_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS"
          if ($env:AVXVER -eq 'AVX') {
            $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
          }
          if ($env:AVXVER -eq 'AVX512') {
            $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX512=on'
          }
          if ($env:AVXVER -eq 'basic') {
            $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
          }
          python -m build --wheel
          # write the build tag to the output
          Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV

      - uses: softprops/action-gh-release@v1
        with:
          files: dist/*
          # Set tag_name to <tag>-cu<cuda_version>
          tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/build-wheels-metal.yaml (vendored, new file, 87 lines)

@@ -0,0 +1,87 @@
name: Build Wheels (Metal)

on: workflow_dispatch

permissions:
  contents: write

jobs:
  define_matrix:
    name: Define Build Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    defaults:
      run:
        shell: pwsh

    steps:
      - name: Define Job Output
        id: set-matrix
        run: |
          $matrix = @{
              'os' = @('macos-11', 'macos-12', 'macos-13')
              'pyver' = @('3.10', '3.11', '3.12')
          }

          $matrixOut = ConvertTo-Json $matrix -Compress
          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT

  build_wheels:
    name: ${{ matrix.os }} Python ${{ matrix.pyver }}
    needs: define_matrix
    runs-on: ${{ matrix.os }}
    strategy:
      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
    env:
      OSVER: ${{ matrix.os }}

    steps:
      - uses: actions/checkout@v4
        with:
          submodules: "recursive"

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.pyver }}

      - name: Install Dependencies
        run: |
          python -m pip install build wheel cmake

      - name: Build Wheel
        run: |
          XCODE15PATH="/Applications/Xcode_15.0.app/Contents/Developer"
          XCODE15BINPATH="${XCODE15PATH}/Toolchains/XcodeDefault.xctoolchain/usr/bin"
          export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_METAL=on"
          [[ "$OSVER" == "macos-13" ]] && export CC="${XCODE15BINPATH}/cc" && export CXX="${XCODE15BINPATH}/c++" && export MACOSX_DEPLOYMENT_TARGET="13.0"
          [[ "$OSVER" == "macos-12" ]] && export MACOSX_DEPLOYMENT_TARGET="12.0"
          [[ "$OSVER" == "macos-11" ]] && export MACOSX_DEPLOYMENT_TARGET="11.0"

          export CMAKE_OSX_ARCHITECTURES="arm64" && export ARCHFLAGS="-arch arm64"
          VERBOSE=1 python -m build --wheel

          if [[ "$OSVER" == "macos-13" ]]; then
            export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
            export MACOSX_DEPLOYMENT_TARGET="14.0"
            VERBOSE=1 python -m build --wheel
          fi

          for file in ./dist/*.whl; do cp "$file" "${file/arm64.whl/aarch64.whl}"; done

          export CMAKE_OSX_ARCHITECTURES="x86_64" && export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_METAL=on" && export ARCHFLAGS="-arch x86_64"
          VERBOSE=1 python -m build --wheel

          if [[ "$OSVER" == "macos-13" ]]; then
            export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
            export MACOSX_DEPLOYMENT_TARGET="14.0"
            VERBOSE=1 python -m build --wheel
          fi

      - uses: softprops/action-gh-release@v1
        with:
          files: dist/*
          # set release name to <tag>-metal
          tag_name: ${{ github.ref_name }}-metal
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/generate-index-from-release.yaml (vendored, new file, 48 lines)

@@ -0,0 +1,48 @@
name: Wheels Index

on:
  # Trigger on any new release
  release:
    types: [published]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
permissions:
  contents: read
  pages: write
  id-token: write

# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
concurrency:
  group: "pages"
  cancel-in-progress: false

jobs:
  # Single deploy job since we're just deploying
  deploy:
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup Pages
        uses: actions/configure-pages@v4
      - name: Build
        run: |
          ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$'
          ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$'
          ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$'
          ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$'
      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          # Upload entire repository
          path: 'index'
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4
CHANGELOG.md (18 changes)

@@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]

+## [0.2.59]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c
+- feat: Binary wheels for CPU, CUDA (12.1 - 12.3), Metal by @abetlen, @jllllll, and @oobabooga in #1247
+- fix: segfault when logits_all=False by @abetlen in 8649d7671bd1a7c0d9cc6a5ad91c6ca286512ab3
+- fix: last tokens passing to sample_repetition_penalties function by @ymikhailov in #1295
+
+## [0.2.58]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c
+- feat: add support for KV cache quantization options by @Limour-dev in #1307
+- feat: Add logprobs support to chat completions by @windspirit95 in #1311
+- fix: set LLAMA_METAL_EMBED_LIBRARY=on on MacOS arm64 by @bretello in #1289
+- feat: Add tools/functions variables to Jinja2ChatFormatter, add function response formatting for all simple chat formats by @CISC in #1273
+- fix: Changed local API doc references to hosted by by @lawfordp2017 in #1317
+
 ## [0.2.57]

 - feat: Update llama.cpp to ggerganov/llama.cpp@ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1
@@ -24,7 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [0.2.55]

-- feat: Update llama.cpp to ggerganov/9731134296af3a6839cd682e51d9c2109a871de5
+- feat: Update llama.cpp to ggerganov/llama.cpp@9731134296af3a6839cd682e51d9c2109a871de5
 - docs: fix small typo in README: 'model know how' -> 'model knows how' by @boegel in #1244

 ## [0.2.54]

README.md (58 changes)

@@ -6,6 +6,7 @@
 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
+[![Github All Releases](https://img.shields.io/github/downloads/abetlen/llama-cpp-python/total.svg?label=Github%20Downloads)]()

 Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library.
 This package provides:
@@ -43,6 +44,15 @@ This will also build `llama.cpp` from source and install it alongside this pytho

 If this fails, add `--verbose` to the `pip install` see the full cmake build log.

+**Pre-built Wheel (New)**
+
+It is also possible to install a pre-built wheel with basic CPU support.
+
+```bash
+pip install llama-cpp-python \
+  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+```
+
 ### Installation Configuration

 `llama.cpp` supports a number of hardware acceleration backends to speed up inference as well as backend specific options. See the [llama.cpp README](https://github.com/ggerganov/llama.cpp#build) for a full list.
@@ -99,12 +109,36 @@ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-
 </details>

 <details>
-<summary>cuBLAS (CUDA)</summary>
+<summary>CUDA</summary>

-To install with cuBLAS, set the `LLAMA_CUBLAS=on` environment variable before installing:
+To install with CUDA support, set the `LLAMA_CUDA=on` environment variable before installing:

 ```bash
-CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python
+```
+
+**Pre-built Wheel (New)**
+
+It is also possible to install a pre-built wheel with CUDA support. As long as your system meets some requirements:
+
+- CUDA Version is 12.1, 12.2 or 12.3
+- Python Version is 3.10, 3.11 or 3.12
+
+```bash
+pip install llama-cpp-python \
+  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/<cuda-version>
+```
+
+Where `<cuda-version>` is one of the following:
+- `cu121`: CUDA 12.1
+- `cu122`: CUDA 12.2
+- `cu123`: CUDA 12.3
+
+For example, to install the CUDA 12.1 wheel:
+
+```bash
+pip install llama-cpp-python \
+  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
 ```

 </details>
@@ -118,6 +152,18 @@ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable befor
 CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python
 ```
+
+**Pre-built Wheel (New)**
+
+It is also possible to install a pre-built wheel with Metal support. As long as your system meets some requirements:
+
+- MacOS Version is 11.0 or later
+- Python Version is 3.10, 3.11 or 3.12
+
+```bash
+pip install llama-cpp-python \
+  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
+```
+
 </details>
 <details>

@@ -321,7 +367,7 @@ For OpenAI API v1 compatibility, you use the [`create_chat_completion_openai_v1`

 ### JSON and JSON Schema Mode

-To constrain chat responses to only valid JSON or a specific JSON Schema use the `response_format` argument in [`create_chat_completion`](http://localhost:8000/api-reference/#llama_cpp.Llama.create_chat_completion).
+To constrain chat responses to only valid JSON or a specific JSON Schema use the `response_format` argument in [`create_chat_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).

 #### JSON Mode

@@ -529,7 +575,7 @@ llama = Llama(

 ### Embeddings

-To generate text embeddings use [`create_embedding`](http://localhost:8000/api-reference/#llama_cpp.Llama.create_embedding).
+To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding).

 ```python
 import llama_cpp

@@ -568,7 +614,7 @@ python3 -m llama_cpp.server --model models/7B/llama-model.gguf
 Similar to Hardware Acceleration section above, you can also install with GPU (cuBLAS) support like this:

 ```bash
-CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]'
+CMAKE_ARGS="-DLLAMA_CUDA=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]'
 python3 -m llama_cpp.server --model models/7B/llama-model.gguf --n_gpu_layers 35
 ```

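A quick way to confirm that one of the pre-built wheels described above was picked up is to import the package and check its version. This is only a sanity-check sketch, not something the README itself prescribes.

```python
# Minimal post-install check; assumes llama-cpp-python was installed from one
# of the wheel indexes shown in the README section above.
import llama_cpp

print(llama_cpp.__version__)  # this commit bumps the package version to 0.2.59
```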
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.57"
+__version__ = "0.2.59"
@@ -730,12 +730,14 @@ class _LlamaSamplingContext:
         if len(self.prev) > 0:
             nl_token = ctx_main.model.token_nl()
             nl_logit = logits_array[nl_token]
-            if self.params.penalty_last_n > 0:
+            last_tokens = self.prev[-self.params.penalty_last_n:]
+            last_tokens_size = min(len(last_tokens), self.params.penalty_last_n)
+            if last_tokens_size > 0:
+                last_tokens_p = (llama_cpp.llama_token * len(last_tokens))(*last_tokens)
                 ctx_main.sample_repetition_penalties(
                     token_data_array,
-                    # TODO: Only create this once
-                    (llama_cpp.llama_token * len(self.prev))(*self.prev),
-                    self.params.penalty_last_n,
+                    last_tokens_p,
+                    last_tokens_size,
                     self.params.penalty_repeat,
                     self.params.penalty_freq,
                     self.params.penalty_present,
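The #1295 fix above is, at its core, list slicing plus a matching size argument: only the most recent `penalty_last_n` tokens are materialized into the ctypes array, and the size passed alongside can never exceed that array's length. A standalone sketch of that bookkeeping with made-up values (plain Python, no llama.cpp types):

```python
# Illustrative values only; in the library, `prev` is the sampling context's token history.
prev = [10, 42, 42, 7, 99, 7]   # 6 tokens seen so far
penalty_last_n = 64             # repetition-penalty window from the sampling params

last_tokens = prev[-penalty_last_n:]                      # the whole history: only 6 tokens exist
last_tokens_size = min(len(last_tokens), penalty_last_n)  # 6, not 64

# The size handed to sample_repetition_penalties now always matches the array
# that is actually built, so the C side cannot be told to read past its end.
assert last_tokens_size == len(last_tokens)
```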
@@ -105,6 +105,9 @@ class Llama:
         draft_model: Optional[LlamaDraftModel] = None,
         # Tokenizer Override
         tokenizer: Optional[BaseLlamaTokenizer] = None,
+        # KV cache quantization
+        type_k: Optional[int] = None,
+        type_v: Optional[int] = None,
         # Misc
         verbose: bool = True,
         # Extra Params
@@ -172,6 +175,8 @@ class Llama:
             draft_model: Optional draft model to use for speculative decoding.
             tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp.
             verbose: Print verbose output to stderr.
+            type_k: KV cache data type for K (default: f16)
+            type_v: KV cache data type for V (default: f16)

         Raises:
             ValueError: If the model path does not exist.
@@ -298,7 +303,11 @@ class Llama:
         ) # Must be set to True for speculative decoding
         self.context_params.embeddings = embedding # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
+        # KV cache quantization
+        if type_k is not None:
+            self.context_params.type_k = type_k
+        if type_v is not None:
+            self.context_params.type_v = type_v
         # Sampling Params
         self.last_n_tokens_size = last_n_tokens_size

@@ -526,14 +535,16 @@ class Llama:
         # Save tokens
         self.input_ids[n_past : n_past + n_tokens] = batch
         # Save logits
-        rows = n_tokens
-        cols = self._n_vocab
-        offset = (
-            0 if self.context_params.logits_all else n_tokens - 1
-        )  # NOTE: Only save the last token logits if logits_all is False
-        self.scores[n_past + offset : n_past + n_tokens, :].reshape(-1)[
-            :
-        ] = self._ctx.get_logits()[offset * cols : rows * cols]
+        if self.context_params.logits_all:
+            rows = n_tokens
+            cols = self._n_vocab
+            logits = self._ctx.get_logits()[: rows * cols]
+            self.scores[n_past : n_past + n_tokens, :].reshape(-1)[: :] = logits
+        else:
+            rows = 1
+            cols = self._n_vocab
+            logits = self._ctx.get_logits()[: rows * cols]
+            self.scores[n_past + n_tokens - 1, :].reshape(-1)[: :] = logits
         # Update n_tokens
         self.n_tokens += n_tokens

@@ -1653,6 +1664,7 @@ class Llama:
             top_k=top_k,
             min_p=min_p,
             typical_p=typical_p,
+            logprobs=top_logprobs if logprobs else None,
             stream=stream,
             stop=stop,
             seed=seed,
@@ -1723,6 +1735,7 @@ class Llama:
             n_threads=self.context_params.n_threads,
             n_threads_batch=self.context_params.n_threads_batch,
             rope_scaling_type=self.context_params.rope_scaling_type,
+            pooling_type=self.context_params.pooling_type,
             rope_freq_base=self.context_params.rope_freq_base,
             rope_freq_scale=self.context_params.rope_freq_scale,
             yarn_ext_factor=self.context_params.yarn_ext_factor,
@@ -1732,6 +1745,7 @@ class Llama:
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
             logits_all=self.context_params.logits_all,
             embedding=self.context_params.embeddings,
+            offload_kqv=self.context_params.offload_kqv,
             # Sampling Params
             last_n_tokens_size=self.last_n_tokens_size,
             # LoRA Params
@@ -1743,51 +1757,17 @@ class Llama:
             # Chat Format Params
             chat_format=self.chat_format,
             chat_handler=self.chat_handler,
+            # Speculative Decidng
+            draft_model=self.draft_model,
+            # KV cache quantization
+            type_k=self.context_params.type_k,
+            type_v=self.context_params.type_v,
             # Misc
             verbose=self.verbose,
         )

     def __setstate__(self, state):
-        self.__init__(
-            model_path=state["model_path"],
-            # Model Params
-            n_gpu_layers=state["n_gpu_layers"],
-            split_mode=state["split_mode"],
-            main_gpu=state["main_gpu"],
-            tensor_split=state["tensor_split"],
-            vocab_only=state["vocab_only"],
-            use_mmap=state["use_mmap"],
-            use_mlock=state["use_mlock"],
-            kv_overrides=state["kv_overrides"],
-            # Context Params
-            seed=state["seed"],
-            n_ctx=state["n_ctx"],
-            n_batch=state["n_batch"],
-            n_threads=state["n_threads"],
-            n_threads_batch=state["n_threads_batch"],
-            rope_freq_base=state["rope_freq_base"],
-            rope_freq_scale=state["rope_freq_scale"],
-            rope_scaling_type=state["rope_scaling_type"],
-            yarn_ext_factor=state["yarn_ext_factor"],
-            yarn_attn_factor=state["yarn_attn_factor"],
-            yarn_beta_fast=state["yarn_beta_fast"],
-            yarn_beta_slow=state["yarn_beta_slow"],
-            yarn_orig_ctx=state["yarn_orig_ctx"],
-            logits_all=state["logits_all"],
-            embedding=state["embedding"],
-            # Sampling Params
-            last_n_tokens_size=state["last_n_tokens_size"],
-            # LoRA Params
-            lora_base=state["lora_base"],
-            lora_path=state["lora_path"],
-            # Backend Params
-            numa=state["numa"],
-            # Chat Format Params
-            chat_format=state["chat_format"],
-            chat_handler=state["chat_handler"],
-            # Misc
-            verbose=state["verbose"],
-        )
+        self.__init__(**state)

     def save_state(self) -> LlamaState:
         assert self._ctx.ctx is not None
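The reworked logits bookkeeping in `Llama` splits cleanly on `logits_all`: either every decoded token's row is copied into `self.scores`, or only the last token's row is. A small NumPy sketch of the two branches, with toy sizes and a stand-in for `self._ctx.get_logits()`; nothing here is the library's API, it only mirrors the indexing above.

```python
import numpy as np

n_vocab, n_ctx = 8, 16           # toy sizes, not real model dimensions
n_past, n_tokens = 3, 4          # pretend we just decoded 4 tokens starting at position 3
scores = np.zeros((n_ctx, n_vocab), dtype=np.single)

def get_logits(rows: int) -> np.ndarray:
    # stand-in for self._ctx.get_logits(): one contiguous row of n_vocab
    # floats per token that requested logits
    return np.arange(rows * n_vocab, dtype=np.single)

logits_all = False
if logits_all:
    rows, cols = n_tokens, n_vocab
    logits = get_logits(rows)[: rows * cols]
    scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits
else:
    rows, cols = 1, n_vocab
    logits = get_logits(rows)[: rows * cols]
    scores[n_past + n_tokens - 1, :].reshape(-1)[::] = logits  # only the last token's row

print(scores[n_past + n_tokens - 1])  # the row that sampling reads next
```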
@@ -231,6 +231,7 @@ def _convert_text_completion_to_chat(
                 "role": "assistant",
                 "content": completion["choices"][0]["text"],
             },
+            "logprobs": completion["choices"][0]["logprobs"],
             "finish_reason": completion["choices"][0]["finish_reason"],
         }
     ],
@@ -254,6 +255,7 @@ def _convert_text_completion_chunks_to_chat(
                     "delta": {
                         "role": "assistant",
                     },
+                    "logprobs": None,
                     "finish_reason": None,
                 }
             ],
@@ -273,6 +275,7 @@ def _convert_text_completion_chunks_to_chat(
                         if chunk["choices"][0]["finish_reason"] is None
                         else {}
                     ),
+                    "logprobs": chunk["choices"][0]["logprobs"],
                    "finish_reason": chunk["choices"][0]["finish_reason"],
                 }
             ],
@@ -487,6 +490,7 @@ def chat_formatter_to_chat_completion_handler(
         temperature: float = 0.2,
         top_p: float = 0.95,
         top_k: int = 40,
+        logprobs: int = 0,
         min_p: float = 0.05,
         typical_p: float = 1.0,
         stream: bool = False,
@@ -576,6 +580,7 @@ def chat_formatter_to_chat_completion_handler(
         top_k=top_k,
         min_p=min_p,
         typical_p=typical_p,
+        logprobs=logprobs,
         stream=stream,
         stop=stop,
         seed=seed,
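With `logprobs` now threaded from the chat-completion handler into the underlying completion call and copied into each converted chat choice, token log-probabilities can be requested through the high-level chat API. A usage sketch follows; the model path is a placeholder, and the response layout follows the OpenAI-style dicts built above.

```python
from llama_cpp import Llama

llm = Llama(model_path="./models/7B/llama-model.gguf")  # placeholder path

result = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
    logprobs=True,     # ask for log-probabilities of the sampled tokens
    top_logprobs=10,   # alternatives reported per token
)

choice = result["choices"][0]
print(choice["message"]["content"])
print(choice["logprobs"])  # filled from the wrapped completion's logprobs
```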
@@ -141,6 +141,70 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa

     byref = ctypes.byref  # type: ignore

+# from ggml.h
+# // NOTE: always add types at the end of the enum to keep backward compatibility
+# enum ggml_type {
+#     GGML_TYPE_F32 = 0,
+#     GGML_TYPE_F16 = 1,
+#     GGML_TYPE_Q4_0 = 2,
+#     GGML_TYPE_Q4_1 = 3,
+#     // GGML_TYPE_Q4_2 = 4, support has been removed
+#     // GGML_TYPE_Q4_3 = 5, support has been removed
+#     GGML_TYPE_Q5_0 = 6,
+#     GGML_TYPE_Q5_1 = 7,
+#     GGML_TYPE_Q8_0 = 8,
+#     GGML_TYPE_Q8_1 = 9,
+#     GGML_TYPE_Q2_K = 10,
+#     GGML_TYPE_Q3_K = 11,
+#     GGML_TYPE_Q4_K = 12,
+#     GGML_TYPE_Q5_K = 13,
+#     GGML_TYPE_Q6_K = 14,
+#     GGML_TYPE_Q8_K = 15,
+#     GGML_TYPE_IQ2_XXS = 16,
+#     GGML_TYPE_IQ2_XS = 17,
+#     GGML_TYPE_IQ3_XXS = 18,
+#     GGML_TYPE_IQ1_S = 19,
+#     GGML_TYPE_IQ4_NL = 20,
+#     GGML_TYPE_IQ3_S = 21,
+#     GGML_TYPE_IQ2_S = 22,
+#     GGML_TYPE_IQ4_XS = 23,
+#     GGML_TYPE_I8 = 24,
+#     GGML_TYPE_I16 = 25,
+#     GGML_TYPE_I32 = 26,
+#     GGML_TYPE_I64 = 27,
+#     GGML_TYPE_F64 = 28,
+#     GGML_TYPE_IQ1_M = 29,
+#     GGML_TYPE_COUNT,
+# };
+GGML_TYPE_F32 = 0
+GGML_TYPE_F16 = 1
+GGML_TYPE_Q4_0 = 2
+GGML_TYPE_Q4_1 = 3
+GGML_TYPE_Q5_0 = 6
+GGML_TYPE_Q5_1 = 7
+GGML_TYPE_Q8_0 = 8
+GGML_TYPE_Q8_1 = 9
+GGML_TYPE_Q2_K = 10
+GGML_TYPE_Q3_K = 11
+GGML_TYPE_Q4_K = 12
+GGML_TYPE_Q5_K = 13
+GGML_TYPE_Q6_K = 14
+GGML_TYPE_Q8_K = 15
+GGML_TYPE_IQ2_XXS = 16
+GGML_TYPE_IQ2_XS = 17
+GGML_TYPE_IQ3_XXS = 18
+GGML_TYPE_IQ1_S = 19
+GGML_TYPE_IQ4_NL = 20
+GGML_TYPE_IQ3_S = 21
+GGML_TYPE_IQ2_S = 22
+GGML_TYPE_IQ4_XS = 23
+GGML_TYPE_I8 = 24
+GGML_TYPE_I16 = 25
+GGML_TYPE_I32 = 26
+GGML_TYPE_I64 = 27
+GGML_TYPE_F64 = 28
+GGML_TYPE_IQ1_M = 29
+GGML_TYPE_COUNT = 30
+
 # from ggml-backend.h
 # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
@@ -175,8 +239,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E

 # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 4
-LLAMA_SESSION_VERSION = 4
+# define LLAMA_SESSION_VERSION 5
+LLAMA_SESSION_VERSION = 5


 # struct llama_model;
@@ -199,14 +263,18 @@ llama_seq_id = ctypes.c_int32

 # enum llama_vocab_type {
 #     LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-#     LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece
-#     LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding
-#     LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece
+#     LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+#     LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
+#     LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
 # };
 LLAMA_VOCAB_TYPE_NONE = 0
+"""For models without vocab"""
 LLAMA_VOCAB_TYPE_SPM = 1
+"""LLaMA tokenizer based on byte-level BPE with byte fallback"""
 LLAMA_VOCAB_TYPE_BPE = 2
+"""GPT-2 tokenizer based on byte-level BPE"""
 LLAMA_VOCAB_TYPE_WPM = 3
+"""BERT tokenizer based on WordPiece"""


 # // note: these values should be synchronized with ggml_rope
@@ -274,6 +342,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
 # LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors

 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -677,6 +746,7 @@ It might not exist for progress report where '.' is output repeatedly."""
 # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
 # bool pure; // quantize all tensors to the default type
 # void * imatrix; // pointer to importance matrix data
+# void * kv_overrides; // pointer to vector containing overrides
 # } llama_model_quantize_params;
 class llama_model_quantize_params(ctypes.Structure):
     """Parameters for llama_model_quantize
@@ -691,6 +761,7 @@ class llama_model_quantize_params(ctypes.Structure):
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         pure (bool): quantize all tensors to the default type
         imatrix (ctypes.c_void_p): pointer to importance matrix data
+        kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
     """

     _fields_ = [
@@ -703,6 +774,7 @@ class llama_model_quantize_params(ctypes.Structure):
         ("only_copy", ctypes.c_bool),
         ("pure", ctypes.c_bool),
         ("imatrix", ctypes.c_void_p),
+        ("kv_overrides", ctypes.c_void_p),
     ]


@@ -1838,9 +1910,9 @@ def llama_synchronize(ctx: llama_context_p, /):


 # // Token logits obtained from the last call to llama_decode()
-# // The logits for the last token are stored in the last row
-# // Logits for which llama_batch.logits[i] == 0 are undefined
-# // Rows: n_tokens provided with llama_batch
+# // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+# // in the order they have appeared in the batch.
+# // Rows: number of tokens for which llama_batch.logits[i] != 0
 # // Cols: n_vocab
 # LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 @ctypes_function(
@@ -1859,7 +1931,8 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]:


 # // Logits for the ith token. Equivalent to:
-# // llama_get_logits(ctx) + i*n_vocab
+# // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+# // returns NULL for invalid ids.
 # LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 @ctypes_function(
     "llama_get_logits_ith",
@@ -1874,8 +1947,12 @@ def llama_get_logits_ith(
     ...


-# // Get all output token embeddings
-# // shape: [n_tokens*n_embd] (1-dimensional)
+# // Get all output token embeddings.
+# // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+# // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+# // in the order they have appeared in the batch.
+# // shape: [n_outputs*n_embd]
+# // Otherwise, returns NULL.
 # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 @ctypes_function(
     "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
@@ -1886,9 +1963,10 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]
     ...


-# // Get the embeddings for the ith token
-# // llama_get_embeddings(ctx) + i*n_embd
+# // Get the embeddings for the ith token. Equivalent to:
+# // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
 # // shape: [n_embd] (1-dimensional)
+# // returns NULL for invalid ids.
 # LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 @ctypes_function(
     "llama_get_embeddings_ith",
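The new `GGML_TYPE_*` constants are the integers that the KV-cache options added elsewhere in this commit (`type_k`, `type_v`) expect. A hedged sketch of quantizing the KV cache to Q8_0; the model path is a placeholder, and whether a particular K/V type combination is accepted is ultimately up to the bundled llama.cpp build.

```python
import llama_cpp

llm = llama_cpp.Llama(
    model_path="./models/7B/llama-model.gguf",   # placeholder path
    type_k=llama_cpp.GGML_TYPE_Q8_0,  # K cache quantization (default is f16)
    type_v=llama_cpp.GGML_TYPE_Q8_0,  # V cache quantization (default is f16)
)
```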
@@ -84,6 +84,7 @@ class ChatCompletionFunction(TypedDict):
 class ChatCompletionResponseChoice(TypedDict):
     index: int
     message: "ChatCompletionResponseMessage"
+    logprobs: Optional[CompletionLogprobs]
     finish_reason: Optional[str]

@@ -405,6 +405,18 @@ async def create_chat_completion(
                     }
                 },
             },
+            "logprobs": {
+                "summary": "Logprobs",
+                "value": {
+                    "model": "gpt-3.5-turbo",
+                    "messages": [
+                        {"role": "system", "content": "You are a helpful assistant."},
+                        {"role": "user", "content": "What is the capital of France?"},
+                    ],
+                    "logprobs": True,
+                    "top_logprobs": 10
+                },
+            },
         }
     ),
     llama_proxy: LlamaProxy = Depends(get_llama_proxy),
@@ -493,7 +505,7 @@ async def tokenize(
 ) -> TokenizeInputResponse:
     tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)

-    return {"tokens": tokens}
+    return TokenizeInputResponse(tokens=tokens)


 @router.post(
@@ -508,7 +520,7 @@ async def count_query_tokens(
 ) -> TokenizeInputCountResponse:
     tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)

-    return {"count": len(tokens)}
+    return TokenizeInputCountResponse(count=len(tokens))


 @router.post(
@@ -523,4 +535,4 @@ async def detokenize(
 ) -> DetokenizeInputResponse:
     text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8")

-    return {"text": text}
+    return DetokenizeInputResponse(text=text)
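The new "Logprobs" request example above is exactly the shape an OpenAI-compatible client would send to the bundled server. A sketch against a locally running `llama_cpp.server` instance, assuming the `openai` v1 Python client; the base URL, api_key, and model alias are placeholders.

```python
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",  # local llama-cpp-python server (assumed address)
    api_key="sk-no-key-required",         # placeholder value
)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",  # model alias, matching the request example above
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
    logprobs=True,
    top_logprobs=10,
)
print(response.choices[0].logprobs)
```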
@@ -175,6 +175,9 @@ class LlamaProxy:
             chat_handler=chat_handler,
             # Speculative Decoding
             draft_model=draft_model,
+            # KV Cache Quantization
+            type_k=settings.type_k,
+            type_v=settings.type_v,
             # Tokenizer
             tokenizer=tokenizer,
             # Misc
@@ -159,6 +159,15 @@ class ModelSettings(BaseSettings):
         default=10,
         description="Number of tokens to predict using the draft model.",
     )
+    # KV Cache Quantization
+    type_k: Optional[int] = Field(
+        default=None,
+        description="Type of the key cache quantization.",
+    )
+    type_v: Optional[int] = Field(
+        default=None,
+        description="Type of the value cache quantization.",
+    )
     # Misc
     verbose: bool = Field(
         default=True, description="Whether to print debug information."
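Since `ModelSettings` is a pydantic settings class, the new fields can be set programmatically just like the existing ones. A sketch, assuming the usual `llama_cpp.server.settings` module location for `ModelSettings`; the model path is a placeholder, and the integer values follow the `GGML_TYPE_*` constants added in this commit (8 is `GGML_TYPE_Q8_0`).

```python
from llama_cpp.server.settings import ModelSettings  # module path assumed

settings = ModelSettings(
    model="./models/7B/llama-model.gguf",  # placeholder path
    type_k=8,  # GGML_TYPE_Q8_0 for the key cache
    type_v=8,  # GGML_TYPE_Q8_0 for the value cache
)
print(settings.model_dump(exclude_none=True))
```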
@@ -130,7 +130,6 @@ class CreateCompletionRequest(BaseModel):
     presence_penalty: Optional[float] = presence_penalty_field
     frequency_penalty: Optional[float] = frequency_penalty_field
     logit_bias: Optional[Dict[str, float]] = Field(None)
-    logprobs: Optional[int] = Field(None)
     seed: Optional[int] = Field(None)

     # ignored or currently unsupported
@@ -209,6 +208,15 @@ class CreateChatCompletionRequest(BaseModel):
         default=None,
         description="The maximum number of tokens to generate. Defaults to inf",
     )
+    logprobs: Optional[bool] = Field(
+        default=False,
+        description="Whether to output the logprobs or not. Default is True"
+    )
+    top_logprobs: Optional[int] = Field(
+        default=None,
+        ge=0,
+        description="The number of logprobs to generate. If None, no logprobs are generated. logprobs need to set to True.",
+    )
     temperature: float = temperature_field
     top_p: float = top_p_field
     min_p: float = min_p_field
@@ -268,7 +276,7 @@ class ModelList(TypedDict):

 class TokenizeInputRequest(BaseModel):
     model: Optional[str] = model_field
-    input: Optional[str] = Field(description="The input to tokenize.")
+    input: str = Field(description="The input to tokenize.")

     model_config = {
         "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]}
scripts/releases-to-pep-503.sh (new executable file, 58 lines)

@@ -0,0 +1,58 @@
#!/bin/bash

# Get output directory or default to index/whl/cpu
output_dir=${1:-"index/whl/cpu"}

# Create output directory
mkdir -p $output_dir

# Change to output directory
pushd $output_dir

# Create an index html file
echo "<!DOCTYPE html>" > index.html
echo "<html>" >> index.html
echo " <head></head>" >> index.html
echo " <body>" >> index.html
echo "   <a href=\"llama-cpp-python/\">llama-cpp-python</a>" >> index.html
echo "   <br>" >> index.html
echo " </body>" >> index.html
echo "</html>" >> index.html
echo "" >> index.html

# Create llama-cpp-python directory
mkdir -p llama-cpp-python

# Change to llama-cpp-python directory
pushd llama-cpp-python

# Create an index html file
echo "<!DOCTYPE html>" > index.html
echo "<html>" >> index.html
echo " <body>" >> index.html
echo "   <h1>Links for llama-cpp-python</h1>" >> index.html

# Get all releases
releases=$(curl -s https://api.github.com/repos/abetlen/llama-cpp-python/releases | jq -r .[].tag_name)

# Get pattern from second arg or default to valid python package version pattern
pattern=${2:-"^[v]?[0-9]+\.[0-9]+\.[0-9]+$"}

# Filter releases by pattern
releases=$(echo $releases | tr ' ' '\n' | grep -E $pattern)

# For each release, get all assets
for release in $releases; do
    assets=$(curl -s https://api.github.com/repos/abetlen/llama-cpp-python/releases/tags/$release | jq -r .assets)
    echo "   <h2>$release</h2>" >> index.html
    for asset in $(echo $assets | jq -r .[].browser_download_url); do
        if [[ $asset == *".whl" ]]; then
            echo "   <a href=\"$asset\">$asset</a>" >> index.html
            echo "   <br>" >> index.html
        fi
    done
done

echo " </body>" >> index.html
echo "</html>" >> index.html
echo "" >> index.html
vendor/llama.cpp (vendored, 2 changes)

@@ -1 +1 @@
-Subproject commit 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652
+Subproject commit 60cdf40cc32f0ad4cb11e0ca8fd38f3b93d8d640