This commit is contained in:
commit
8b9cd38c0d
18 changed files with 569 additions and 85 deletions
10
.github/workflows/build-and-release.yaml
vendored
10
.github/workflows/build-and-release.yaml
vendored
|
@ -11,7 +11,7 @@ jobs:
|
|||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, windows-latest, macOS-latest]
|
||||
os: [ubuntu-20.04, windows-2019, macos-11]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
@ -23,19 +23,19 @@ jobs:
|
|||
with:
|
||||
python-version: "3.8"
|
||||
|
||||
- name: Install cibuildwheel
|
||||
run: python -m pip install cibuildwheel==2.12.1
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install -e .[all]
|
||||
|
||||
- name: Build wheels
|
||||
run: python -m cibuildwheel --output-dir wheelhouse
|
||||
uses: pypa/cibuildwheel@v2.16.5
|
||||
env:
|
||||
# disable repair
|
||||
CIBW_REPAIR_WHEEL_COMMAND: ""
|
||||
with:
|
||||
package-dir: .
|
||||
output-dir: wheelhouse
|
||||
|
||||
- uses: actions/upload-artifact@v3
|
||||
with:
|
||||
|
|
131
.github/workflows/build-wheels-cuda.yaml
vendored
Normal file
131
.github/workflows/build-wheels-cuda.yaml
vendored
Normal file
|
@ -0,0 +1,131 @@
|
|||
# Builds CUDA-enabled wheels for every os / Python / CUDA combination in the
# matrix and attaches them to a GitHub release tagged <ref>-cu<major><minor>.
name: Build Wheels (CUDA)

# Manual trigger only.
on: workflow_dispatch

# The release step at the end uploads assets using GITHUB_TOKEN.
permissions:
  contents: write

jobs:
  # Emits the build matrix as a JSON job output so that build_wheels can
  # consume it through fromJSON().
  define_matrix:
    name: Define Build Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    defaults:
      run:
        shell: pwsh

    steps:
      - name: Define Job Output
        id: set-matrix
        run: |
          $matrix = @{
              'os' = @('ubuntu-20.04', 'windows-latest')
              'pyver' = @("3.10", "3.11", "3.12")
              'cuda' = @("12.1.1", "12.2.2", "12.3.2")
              'releasetag' = @("basic")
          }

          $matrixOut = ConvertTo-Json $matrix -Compress
          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT

  build_wheels:
    name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }}
    needs: define_matrix
    runs-on: ${{ matrix.os }}
    strategy:
      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
    defaults:
      run:
        shell: pwsh
    env:
      # Full CUDA version (e.g. 12.1.1) and CPU feature set, read by the
      # PowerShell steps below.
      CUDAVER: ${{ matrix.cuda }}
      AVXVER: ${{ matrix.releasetag }}

    steps:
      - uses: actions/checkout@v4
        with:
          submodules: "recursive"

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.pyver }}

      # Conda environment "build"; the CUDA toolkit is installed into it later
      # and CONDA_PREFIX is used as the CUDA root.
      - name: Setup Mamba
        uses: conda-incubator/setup-miniconda@v2.2.0
        with:
          activate-environment: "build"
          python-version: ${{ matrix.pyver }}
          miniforge-variant: Mambaforge
          miniforge-version: latest
          use-mamba: true
          add-pip-as-python-dependency: true
          auto-activate-base: false

      # Windows only: cache the extracted MSBuildExtensions per CUDA version so
      # the installer archive is only downloaded on a cache miss.
      - name: VS Integration Cache
        id: vs-integration-cache
        if: runner.os == 'Windows'
        uses: actions/cache@v3.3.2
        with:
          path: ./MSBuildExtensions
          key: cuda-${{ matrix.cuda }}-vs-integration

      # Looks the CUDA version up in the pinned Jimver/cuda-toolkit
      # windows-links.ts (12.1.1 is looked up as 12.1.0). The loop stops after
      # the second token equal to '<version>', and the token that follows it is
      # used as the installer URL. Only the MSBuildExtensions files are
      # extracted from the downloaded archive.
      - name: Get Visual Studio Integration
        if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true'
        run: |
          if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER}
          $links = (Invoke-RestMethod 'https://github.com/Jimver/cuda-toolkit/raw/dc0ca7bb29c5a92f7a963d3d5c93f8d59765136a/src/links/windows-links.ts').Trim().split().where({$_ -ne ''})
          for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}}
          Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip'
          & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null
          Remove-Item 'cudainstaller.zip'

      # Copies the extensions into every VC\*\BuildCustomizations directory of
      # the Visual Studio 2022 Enterprise install and exports
      # CUDA_PATH_V<major>_<minor> pointing at the conda prefix.
      - name: Install Visual Studio Integration
        if: runner.os == 'Windows'
        run: |
          $y = (gi '.\MSBuildExtensions').fullname + '\*'
          (gi 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_})
          $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_')
          echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV

      - name: Install Dependencies
        env:
          MAMBA_DOWNLOAD_FAILFAST: "0"
          MAMBA_NO_LOW_SPEED_LIMIT: "1"
        run: |
          $cudaVersion = $env:CUDAVER
          mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion
          python -m pip install build wheel

      # $cudaVersion becomes <major><minor> (12.1.1 -> 121) and is exported as
      # CUDA_VERSION for the release tag. With the current matrix AVXVER is
      # always 'basic', so only that branch adds flags.
      - name: Build Wheel
        run: |
          $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','')
          $env:CUDA_PATH = $env:CONDA_PREFIX
          $env:CUDA_HOME = $env:CONDA_PREFIX
          $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX
          if ($IsLinux) {
            $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH
          }
          $env:VERBOSE = '1'
          $env:CMAKE_ARGS = '-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all'
          $env:CMAKE_ARGS = "-DLLAMA_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS"
          if ($env:AVXVER -eq 'AVX') {
            $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
          }
          if ($env:AVXVER -eq 'AVX512') {
            $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX512=on'
          }
          if ($env:AVXVER -eq 'basic') {
            $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
          }
          python -m build --wheel
          # export the short CUDA version to later steps via GITHUB_ENV
          Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV

      - uses: softprops/action-gh-release@v1
        with:
          files: dist/*
          # Set tag_name to <ref>-cu<cuda_version>
          tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
87
.github/workflows/build-wheels-metal.yaml
vendored
Normal file
87
.github/workflows/build-wheels-metal.yaml
vendored
Normal file
|
@ -0,0 +1,87 @@
|
|||
# Builds Metal-enabled macOS wheels (arm64 and x86_64) for every os / Python
# combination in the matrix and attaches them to a GitHub release tagged
# <ref>-metal.
name: Build Wheels (Metal)

# Manual trigger only.
on: workflow_dispatch

# The release step at the end uploads assets using GITHUB_TOKEN.
permissions:
  contents: write

jobs:
  # Emits the build matrix as a JSON job output so that build_wheels can
  # consume it through fromJSON().
  define_matrix:
    name: Define Build Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    defaults:
      run:
        shell: pwsh

    steps:
      - name: Define Job Output
        id: set-matrix
        run: |
          $matrix = @{
              'os' = @('macos-11', 'macos-12', 'macos-13')
              'pyver' = @('3.10', '3.11', '3.12')
          }

          $matrixOut = ConvertTo-Json $matrix -Compress
          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT

  build_wheels:
    name: ${{ matrix.os }} Python ${{ matrix.pyver }}
    needs: define_matrix
    runs-on: ${{ matrix.os }}
    strategy:
      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
    env:
      # Runner label; the build script keys the deployment target off it.
      OSVER: ${{ matrix.os }}

    steps:
      - uses: actions/checkout@v4
        with:
          submodules: "recursive"

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.pyver }}

      - name: Install Dependencies
        run: |
          python -m pip install build wheel cmake

      # Builds one arm64 and one x86_64 wheel at the runner's own deployment
      # target. On macos-13 it additionally builds each architecture against
      # the Xcode 15 MacOSX14.0 SDK with a 14.0 deployment target.
      - name: Build Wheel
        run: |
          XCODE15PATH="/Applications/Xcode_15.0.app/Contents/Developer"
          XCODE15BINPATH="${XCODE15PATH}/Toolchains/XcodeDefault.xctoolchain/usr/bin"
          export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_METAL=on"
          [[ "$OSVER" == "macos-13" ]] && export CC="${XCODE15BINPATH}/cc" && export CXX="${XCODE15BINPATH}/c++" && export MACOSX_DEPLOYMENT_TARGET="13.0"
          [[ "$OSVER" == "macos-12" ]] && export MACOSX_DEPLOYMENT_TARGET="12.0"
          [[ "$OSVER" == "macos-11" ]] && export MACOSX_DEPLOYMENT_TARGET="11.0"

          export CMAKE_OSX_ARCHITECTURES="arm64" && export ARCHFLAGS="-arch arm64"
          VERBOSE=1 python -m build --wheel

          if [[ "$OSVER" == "macos-13" ]]; then
            # Subshell: keep the 14.0 SDK/target overrides from leaking into
            # the builds that follow.
            (
              export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
              export MACOSX_DEPLOYMENT_TARGET="14.0"
              VERBOSE=1 python -m build --wheel
            )
          fi

          # Publish every arm64 wheel under an aarch64 file name as well.
          for file in ./dist/*.whl; do cp "$file" "${file/arm64.whl/aarch64.whl}"; done

          export CMAKE_OSX_ARCHITECTURES="x86_64" && export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_METAL=on" && export ARCHFLAGS="-arch x86_64"
          VERBOSE=1 python -m build --wheel

          if [[ "$OSVER" == "macos-13" ]]; then
            # Same scoped 14.0 build for x86_64.
            (
              export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
              export MACOSX_DEPLOYMENT_TARGET="14.0"
              VERBOSE=1 python -m build --wheel
            )
          fi

      - uses: softprops/action-gh-release@v1
        with:
          files: dist/*
          # Set tag_name to <ref>-metal
          tag_name: ${{ github.ref_name }}-metal
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
48
.github/workflows/generate-index-from-release.yaml
vendored
Normal file
48
.github/workflows/generate-index-from-release.yaml
vendored
Normal file
|
@ -0,0 +1,48 @@
|
|||
# Generates one package index directory per wheel flavour under index/whl/
# from the repository's releases and deploys the result to GitHub Pages.
name: Wheels Index

on:
  # Trigger on any new release
  release:
    types: [published]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
permissions:
  contents: read
  pages: write
  id-token: write

# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
concurrency:
  group: "pages"
  cancel-in-progress: false

jobs:
  # Single deploy job since we're just deploying
  deploy:
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup Pages
        uses: actions/configure-pages@v4
      # One index per flavour; the second argument is the regex passed to the
      # script for matching release tags (plain version for cpu, version plus
      # a -cuXYZ / -metal suffix for the accelerated wheels).
      - name: Build
        run: |
          ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$'
          ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$'
          ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$'
          ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$'
          ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$'
      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          # Upload only the generated index directory
          path: 'index'
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4
|
18
CHANGELOG.md
18
CHANGELOG.md
|
@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
## [0.2.59]
|
||||
|
||||
- feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c
|
||||
- feat: Binary wheels for CPU, CUDA (12.1 - 12.3), Metal by @abetlen, @jllllll, and @oobabooga in #1247
|
||||
- fix: segfault when logits_all=False by @abetlen in 8649d7671bd1a7c0d9cc6a5ad91c6ca286512ab3
|
||||
- fix: last tokens passing to sample_repetition_penalties function by @ymikhailov in #1295
|
||||
|
||||
## [0.2.58]
|
||||
|
||||
- feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c
|
||||
- feat: add support for KV cache quantization options by @Limour-dev in #1307
|
||||
- feat: Add logprobs support to chat completions by @windspirit95 in #1311
|
||||
- fix: set LLAMA_METAL_EMBED_LIBRARY=on on MacOS arm64 by @bretello in #1289
|
||||
- feat: Add tools/functions variables to Jinja2ChatFormatter, add function response formatting for all simple chat formats by @CISC in #1273
|
||||
- fix: Changed local API doc references to hosted by @lawfordp2017 in #1317
|
||||
|
||||
## [0.2.57]
|
||||
|
||||
- feat: Update llama.cpp to ggerganov/llama.cpp@ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1
|
||||
|
@ -24,7 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
## [0.2.55]
|
||||
|
||||
- feat: Update llama.cpp to ggerganov/9731134296af3a6839cd682e51d9c2109a871de5
|
||||
- feat: Update llama.cpp to ggerganov/llama.cpp@9731134296af3a6839cd682e51d9c2109a871de5
|
||||
- docs: fix small typo in README: 'model know how' -> 'model knows how' by @boegel in #1244
|
||||
|
||||
## [0.2.54]
|
||||
|
|
58
README.md
58
README.md
|
@ -6,6 +6,7 @@
|
|||
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
|
||||
[![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
|
||||
[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
|
||||
[![Github All Releases](https://img.shields.io/github/downloads/abetlen/llama-cpp-python/total.svg?label=Github%20Downloads)]()
|
||||
|
||||
Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library.
|
||||
This package provides:
|
||||
|
@ -43,6 +44,15 @@ This will also build `llama.cpp` from source and install it alongside this pytho
|
|||
|
||||
If this fails, add `--verbose` to the `pip install` command to see the full cmake build log.
|
||||
|
||||
**Pre-built Wheel (New)**
|
||||
|
||||
It is also possible to install a pre-built wheel with basic CPU support.
|
||||
|
||||
```bash
|
||||
pip install llama-cpp-python \
|
||||
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
|
||||
```
|
||||
|
||||
### Installation Configuration
|
||||
|
||||
`llama.cpp` supports a number of hardware acceleration backends to speed up inference as well as backend specific options. See the [llama.cpp README](https://github.com/ggerganov/llama.cpp#build) for a full list.
|
||||
|
@ -99,12 +109,36 @@ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-
|
|||
</details>
|
||||
|
||||
<details>
|
||||
<summary>cuBLAS (CUDA)</summary>
|
||||
<summary>CUDA</summary>
|
||||
|
||||
To install with cuBLAS, set the `LLAMA_CUBLAS=on` environment variable before installing:
|
||||
To install with CUDA support, set the `LLAMA_CUDA=on` environment variable before installing:
|
||||
|
||||
```bash
|
||||
CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
|
||||
CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python
|
||||
```
|
||||
|
||||
**Pre-built Wheel (New)**
|
||||
|
||||
It is also possible to install a pre-built wheel with CUDA support, as long as your system meets the following requirements:
|
||||
|
||||
- CUDA Version is 12.1, 12.2 or 12.3
|
||||
- Python Version is 3.10, 3.11 or 3.12
|
||||
|
||||
```bash
|
||||
pip install llama-cpp-python \
|
||||
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/<cuda-version>
|
||||
```
|
||||
|
||||
Where `<cuda-version>` is one of the following:
|
||||
- `cu121`: CUDA 12.1
|
||||
- `cu122`: CUDA 12.2
|
||||
- `cu123`: CUDA 12.3
|
||||
|
||||
For example, to install the CUDA 12.1 wheel:
|
||||
|
||||
```bash
|
||||
pip install llama-cpp-python \
|
||||
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
|
||||
```
|
||||
|
||||
</details>
|
||||
|
@ -118,6 +152,18 @@ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable befor
|
|||
CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python
|
||||
```
|
||||
|
||||
**Pre-built Wheel (New)**
|
||||
|
||||
It is also possible to install a pre-built wheel with Metal support, as long as your system meets the following requirements:
|
||||
|
||||
- MacOS Version is 11.0 or later
|
||||
- Python Version is 3.10, 3.11 or 3.12
|
||||
|
||||
```bash
|
||||
pip install llama-cpp-python \
|
||||
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
|
||||
```
|
||||
|
||||
</details>
|
||||
<details>
|
||||
|
||||
|
@ -321,7 +367,7 @@ For OpenAI API v1 compatibility, you use the [`create_chat_completion_openai_v1`
|
|||
|
||||
### JSON and JSON Schema Mode
|
||||
|
||||
To constrain chat responses to only valid JSON or a specific JSON Schema use the `response_format` argument in [`create_chat_completion`](http://localhost:8000/api-reference/#llama_cpp.Llama.create_chat_completion).
|
||||
To constrain chat responses to only valid JSON or a specific JSON Schema use the `response_format` argument in [`create_chat_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
|
||||
|
||||
#### JSON Mode
|
||||
|
||||
|
@ -529,7 +575,7 @@ llama = Llama(
|
|||
|
||||
### Embeddings
|
||||
|
||||
To generate text embeddings use [`create_embedding`](http://localhost:8000/api-reference/#llama_cpp.Llama.create_embedding).
|
||||
To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding).
|
||||
|
||||
```python
|
||||
import llama_cpp
|
||||
|
@ -568,7 +614,7 @@ python3 -m llama_cpp.server --model models/7B/llama-model.gguf
|
|||
Similar to the Hardware Acceleration section above, you can also install with GPU (CUDA) support like this:
|
||||
|
||||
```bash
|
||||
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]'
|
||||
CMAKE_ARGS="-DLLAMA_CUDA=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]'
|
||||
python3 -m llama_cpp.server --model models/7B/llama-model.gguf --n_gpu_layers 35
|
||||
```
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from .llama_cpp import *
|
||||
from .llama import *
|
||||
|
||||
__version__ = "0.2.57"
|
||||
__version__ = "0.2.59"
|
|
@ -730,12 +730,14 @@ class _LlamaSamplingContext:
|
|||
if len(self.prev) > 0:
|
||||
nl_token = ctx_main.model.token_nl()
|
||||
nl_logit = logits_array[nl_token]
|
||||
if self.params.penalty_last_n > 0:
|
||||
last_tokens = self.prev[-self.params.penalty_last_n:]
|
||||
last_tokens_size = min(len(last_tokens), self.params.penalty_last_n)
|
||||
if last_tokens_size > 0:
|
||||
last_tokens_p = (llama_cpp.llama_token * len(last_tokens))(*last_tokens)
|
||||
ctx_main.sample_repetition_penalties(
|
||||
token_data_array,
|
||||
# TODO: Only create this once
|
||||
(llama_cpp.llama_token * len(self.prev))(*self.prev),
|
||||
self.params.penalty_last_n,
|
||||
last_tokens_p,
|
||||
last_tokens_size,
|
||||
self.params.penalty_repeat,
|
||||
self.params.penalty_freq,
|
||||
self.params.penalty_present,
|
||||
|
|
|
@ -105,6 +105,9 @@ class Llama:
|
|||
draft_model: Optional[LlamaDraftModel] = None,
|
||||
# Tokenizer Override
|
||||
tokenizer: Optional[BaseLlamaTokenizer] = None,
|
||||
# KV cache quantization
|
||||
type_k: Optional[int] = None,
|
||||
type_v: Optional[int] = None,
|
||||
# Misc
|
||||
verbose: bool = True,
|
||||
# Extra Params
|
||||
|
@ -172,6 +175,8 @@ class Llama:
|
|||
draft_model: Optional draft model to use for speculative decoding.
|
||||
tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp.
|
||||
verbose: Print verbose output to stderr.
|
||||
type_k: KV cache data type for K (default: f16)
|
||||
type_v: KV cache data type for V (default: f16)
|
||||
|
||||
Raises:
|
||||
ValueError: If the model path does not exist.
|
||||
|
@ -298,7 +303,11 @@ class Llama:
|
|||
) # Must be set to True for speculative decoding
|
||||
self.context_params.embeddings = embedding # TODO: Rename to embeddings
|
||||
self.context_params.offload_kqv = offload_kqv
|
||||
|
||||
# KV cache quantization
|
||||
if type_k is not None:
|
||||
self.context_params.type_k = type_k
|
||||
if type_v is not None:
|
||||
self.context_params.type_v = type_v
|
||||
# Sampling Params
|
||||
self.last_n_tokens_size = last_n_tokens_size
|
||||
|
||||
|
@ -526,14 +535,16 @@ class Llama:
|
|||
# Save tokens
|
||||
self.input_ids[n_past : n_past + n_tokens] = batch
|
||||
# Save logits
|
||||
rows = n_tokens
|
||||
cols = self._n_vocab
|
||||
offset = (
|
||||
0 if self.context_params.logits_all else n_tokens - 1
|
||||
) # NOTE: Only save the last token logits if logits_all is False
|
||||
self.scores[n_past + offset : n_past + n_tokens, :].reshape(-1)[
|
||||
:
|
||||
] = self._ctx.get_logits()[offset * cols : rows * cols]
|
||||
if self.context_params.logits_all:
|
||||
rows = n_tokens
|
||||
cols = self._n_vocab
|
||||
logits = self._ctx.get_logits()[: rows * cols]
|
||||
self.scores[n_past : n_past + n_tokens, :].reshape(-1)[: :] = logits
|
||||
else:
|
||||
rows = 1
|
||||
cols = self._n_vocab
|
||||
logits = self._ctx.get_logits()[: rows * cols]
|
||||
self.scores[n_past + n_tokens - 1, :].reshape(-1)[: :] = logits
|
||||
# Update n_tokens
|
||||
self.n_tokens += n_tokens
|
||||
|
||||
|
@ -1653,6 +1664,7 @@ class Llama:
|
|||
top_k=top_k,
|
||||
min_p=min_p,
|
||||
typical_p=typical_p,
|
||||
logprobs=top_logprobs if logprobs else None,
|
||||
stream=stream,
|
||||
stop=stop,
|
||||
seed=seed,
|
||||
|
@ -1723,6 +1735,7 @@ class Llama:
|
|||
n_threads=self.context_params.n_threads,
|
||||
n_threads_batch=self.context_params.n_threads_batch,
|
||||
rope_scaling_type=self.context_params.rope_scaling_type,
|
||||
pooling_type=self.context_params.pooling_type,
|
||||
rope_freq_base=self.context_params.rope_freq_base,
|
||||
rope_freq_scale=self.context_params.rope_freq_scale,
|
||||
yarn_ext_factor=self.context_params.yarn_ext_factor,
|
||||
|
@ -1732,6 +1745,7 @@ class Llama:
|
|||
yarn_orig_ctx=self.context_params.yarn_orig_ctx,
|
||||
logits_all=self.context_params.logits_all,
|
||||
embedding=self.context_params.embeddings,
|
||||
offload_kqv=self.context_params.offload_kqv,
|
||||
# Sampling Params
|
||||
last_n_tokens_size=self.last_n_tokens_size,
|
||||
# LoRA Params
|
||||
|
@ -1743,51 +1757,17 @@ class Llama:
|
|||
# Chat Format Params
|
||||
chat_format=self.chat_format,
|
||||
chat_handler=self.chat_handler,
|
||||
# Speculative Decoding
|
||||
draft_model=self.draft_model,
|
||||
# KV cache quantization
|
||||
type_k=self.context_params.type_k,
|
||||
type_v=self.context_params.type_v,
|
||||
# Misc
|
||||
verbose=self.verbose,
|
||||
)
|
||||
|
||||
def __setstate__(self, state):
|
||||
self.__init__(
|
||||
model_path=state["model_path"],
|
||||
# Model Params
|
||||
n_gpu_layers=state["n_gpu_layers"],
|
||||
split_mode=state["split_mode"],
|
||||
main_gpu=state["main_gpu"],
|
||||
tensor_split=state["tensor_split"],
|
||||
vocab_only=state["vocab_only"],
|
||||
use_mmap=state["use_mmap"],
|
||||
use_mlock=state["use_mlock"],
|
||||
kv_overrides=state["kv_overrides"],
|
||||
# Context Params
|
||||
seed=state["seed"],
|
||||
n_ctx=state["n_ctx"],
|
||||
n_batch=state["n_batch"],
|
||||
n_threads=state["n_threads"],
|
||||
n_threads_batch=state["n_threads_batch"],
|
||||
rope_freq_base=state["rope_freq_base"],
|
||||
rope_freq_scale=state["rope_freq_scale"],
|
||||
rope_scaling_type=state["rope_scaling_type"],
|
||||
yarn_ext_factor=state["yarn_ext_factor"],
|
||||
yarn_attn_factor=state["yarn_attn_factor"],
|
||||
yarn_beta_fast=state["yarn_beta_fast"],
|
||||
yarn_beta_slow=state["yarn_beta_slow"],
|
||||
yarn_orig_ctx=state["yarn_orig_ctx"],
|
||||
logits_all=state["logits_all"],
|
||||
embedding=state["embedding"],
|
||||
# Sampling Params
|
||||
last_n_tokens_size=state["last_n_tokens_size"],
|
||||
# LoRA Params
|
||||
lora_base=state["lora_base"],
|
||||
lora_path=state["lora_path"],
|
||||
# Backend Params
|
||||
numa=state["numa"],
|
||||
# Chat Format Params
|
||||
chat_format=state["chat_format"],
|
||||
chat_handler=state["chat_handler"],
|
||||
# Misc
|
||||
verbose=state["verbose"],
|
||||
)
|
||||
self.__init__(**state)
|
||||
|
||||
def save_state(self) -> LlamaState:
|
||||
assert self._ctx.ctx is not None
|
||||
|
|
|
@ -231,6 +231,7 @@ def _convert_text_completion_to_chat(
|
|||
"role": "assistant",
|
||||
"content": completion["choices"][0]["text"],
|
||||
},
|
||||
"logprobs": completion["choices"][0]["logprobs"],
|
||||
"finish_reason": completion["choices"][0]["finish_reason"],
|
||||
}
|
||||
],
|
||||
|
@ -254,6 +255,7 @@ def _convert_text_completion_chunks_to_chat(
|
|||
"delta": {
|
||||
"role": "assistant",
|
||||
},
|
||||
"logprobs": None,
|
||||
"finish_reason": None,
|
||||
}
|
||||
],
|
||||
|
@ -273,6 +275,7 @@ def _convert_text_completion_chunks_to_chat(
|
|||
if chunk["choices"][0]["finish_reason"] is None
|
||||
else {}
|
||||
),
|
||||
"logprobs": chunk["choices"][0]["logprobs"],
|
||||
"finish_reason": chunk["choices"][0]["finish_reason"],
|
||||
}
|
||||
],
|
||||
|
@ -487,6 +490,7 @@ def chat_formatter_to_chat_completion_handler(
|
|||
temperature: float = 0.2,
|
||||
top_p: float = 0.95,
|
||||
top_k: int = 40,
|
||||
logprobs: int = 0,
|
||||
min_p: float = 0.05,
|
||||
typical_p: float = 1.0,
|
||||
stream: bool = False,
|
||||
|
@ -576,6 +580,7 @@ def chat_formatter_to_chat_completion_handler(
|
|||
top_k=top_k,
|
||||
min_p=min_p,
|
||||
typical_p=typical_p,
|
||||
logprobs=logprobs,
|
||||
stream=stream,
|
||||
stop=stop,
|
||||
seed=seed,
|
||||
|
|
|
@ -141,6 +141,70 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
|
|||
|
||||
byref = ctypes.byref # type: ignore
|
||||
|
||||
# from ggml.h
|
||||
# // NOTE: always add types at the end of the enum to keep backward compatibility
|
||||
# enum ggml_type {
|
||||
# GGML_TYPE_F32 = 0,
|
||||
# GGML_TYPE_F16 = 1,
|
||||
# GGML_TYPE_Q4_0 = 2,
|
||||
# GGML_TYPE_Q4_1 = 3,
|
||||
# // GGML_TYPE_Q4_2 = 4, support has been removed
|
||||
# // GGML_TYPE_Q4_3 = 5, support has been removed
|
||||
# GGML_TYPE_Q5_0 = 6,
|
||||
# GGML_TYPE_Q5_1 = 7,
|
||||
# GGML_TYPE_Q8_0 = 8,
|
||||
# GGML_TYPE_Q8_1 = 9,
|
||||
# GGML_TYPE_Q2_K = 10,
|
||||
# GGML_TYPE_Q3_K = 11,
|
||||
# GGML_TYPE_Q4_K = 12,
|
||||
# GGML_TYPE_Q5_K = 13,
|
||||
# GGML_TYPE_Q6_K = 14,
|
||||
# GGML_TYPE_Q8_K = 15,
|
||||
# GGML_TYPE_IQ2_XXS = 16,
|
||||
# GGML_TYPE_IQ2_XS = 17,
|
||||
# GGML_TYPE_IQ3_XXS = 18,
|
||||
# GGML_TYPE_IQ1_S = 19,
|
||||
# GGML_TYPE_IQ4_NL = 20,
|
||||
# GGML_TYPE_IQ3_S = 21,
|
||||
# GGML_TYPE_IQ2_S = 22,
|
||||
# GGML_TYPE_IQ4_XS = 23,
|
||||
# GGML_TYPE_I8 = 24,
|
||||
# GGML_TYPE_I16 = 25,
|
||||
# GGML_TYPE_I32 = 26,
|
||||
# GGML_TYPE_I64 = 27,
|
||||
# GGML_TYPE_F64 = 28,
|
||||
# GGML_TYPE_IQ1_M = 29,
|
||||
# GGML_TYPE_COUNT,
|
||||
# };
|
||||
GGML_TYPE_F32 = 0
|
||||
GGML_TYPE_F16 = 1
|
||||
GGML_TYPE_Q4_0 = 2
|
||||
GGML_TYPE_Q4_1 = 3
|
||||
GGML_TYPE_Q5_0 = 6
|
||||
GGML_TYPE_Q5_1 = 7
|
||||
GGML_TYPE_Q8_0 = 8
|
||||
GGML_TYPE_Q8_1 = 9
|
||||
GGML_TYPE_Q2_K = 10
|
||||
GGML_TYPE_Q3_K = 11
|
||||
GGML_TYPE_Q4_K = 12
|
||||
GGML_TYPE_Q5_K = 13
|
||||
GGML_TYPE_Q6_K = 14
|
||||
GGML_TYPE_Q8_K = 15
|
||||
GGML_TYPE_IQ2_XXS = 16
|
||||
GGML_TYPE_IQ2_XS = 17
|
||||
GGML_TYPE_IQ3_XXS = 18
|
||||
GGML_TYPE_IQ1_S = 19
|
||||
GGML_TYPE_IQ4_NL = 20
|
||||
GGML_TYPE_IQ3_S = 21
|
||||
GGML_TYPE_IQ2_S = 22
|
||||
GGML_TYPE_IQ4_XS = 23
|
||||
GGML_TYPE_I8 = 24
|
||||
GGML_TYPE_I16 = 25
|
||||
GGML_TYPE_I32 = 26
|
||||
GGML_TYPE_I64 = 27
|
||||
GGML_TYPE_F64 = 28
|
||||
GGML_TYPE_IQ1_M = 29
|
||||
GGML_TYPE_COUNT = 30
|
||||
|
||||
# from ggml-backend.h
|
||||
# typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
|
||||
|
@ -175,8 +239,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E
|
|||
|
||||
# define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
|
||||
# define LLAMA_SESSION_VERSION 4
|
||||
LLAMA_SESSION_VERSION = 4
|
||||
# define LLAMA_SESSION_VERSION 5
|
||||
LLAMA_SESSION_VERSION = 5
|
||||
|
||||
|
||||
# struct llama_model;
|
||||
|
@ -199,14 +263,18 @@ llama_seq_id = ctypes.c_int32
|
|||
|
||||
# enum llama_vocab_type {
|
||||
# LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
||||
# LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece
|
||||
# LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding
|
||||
# LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece
|
||||
# LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
|
||||
# LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
||||
# LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
||||
# };
|
||||
LLAMA_VOCAB_TYPE_NONE = 0
|
||||
"""For models without vocab"""
|
||||
LLAMA_VOCAB_TYPE_SPM = 1
|
||||
"""LLaMA tokenizer based on byte-level BPE with byte fallback"""
|
||||
LLAMA_VOCAB_TYPE_BPE = 2
|
||||
"""GPT-2 tokenizer based on byte-level BPE"""
|
||||
LLAMA_VOCAB_TYPE_WPM = 3
|
||||
"""BERT tokenizer based on WordPiece"""
|
||||
|
||||
|
||||
# // note: these values should be synchronized with ggml_rope
|
||||
|
@ -274,6 +342,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
|
|||
# LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
|
||||
# LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
|
||||
# LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
|
||||
# LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
|
||||
|
||||
# LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
# };
|
||||
|
@ -677,6 +746,7 @@ It might not exist for progress report where '.' is output repeatedly."""
|
|||
# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||
# bool pure; // quantize all tensors to the default type
|
||||
# void * imatrix; // pointer to importance matrix data
|
||||
# void * kv_overrides; // pointer to vector containing overrides
|
||||
# } llama_model_quantize_params;
|
||||
class llama_model_quantize_params(ctypes.Structure):
|
||||
"""Parameters for llama_model_quantize
|
||||
|
@ -691,6 +761,7 @@ class llama_model_quantize_params(ctypes.Structure):
|
|||
only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||
pure (bool): quantize all tensors to the default type
|
||||
imatrix (ctypes.c_void_p): pointer to importance matrix data
|
||||
kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
|
||||
"""
|
||||
|
||||
_fields_ = [
|
||||
|
@ -703,6 +774,7 @@ class llama_model_quantize_params(ctypes.Structure):
|
|||
("only_copy", ctypes.c_bool),
|
||||
("pure", ctypes.c_bool),
|
||||
("imatrix", ctypes.c_void_p),
|
||||
("kv_overrides", ctypes.c_void_p),
|
||||
]
|
||||
|
||||
|
||||
|
@ -1838,9 +1910,9 @@ def llama_synchronize(ctx: llama_context_p, /):
|
|||
|
||||
|
||||
# // Token logits obtained from the last call to llama_decode()
|
||||
# // The logits for the last token are stored in the last row
|
||||
# // Logits for which llama_batch.logits[i] == 0 are undefined
|
||||
# // Rows: n_tokens provided with llama_batch
|
||||
# // The logits for which llama_batch.logits[i] != 0 are stored contiguously
|
||||
# // in the order they have appeared in the batch.
|
||||
# // Rows: number of tokens for which llama_batch.logits[i] != 0
|
||||
# // Cols: n_vocab
|
||||
# LLAMA_API float * llama_get_logits(struct llama_context * ctx);
|
||||
@ctypes_function(
|
||||
|
@ -1859,7 +1931,8 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]:
|
|||
|
||||
|
||||
# // Logits for the ith token. Equivalent to:
|
||||
# // llama_get_logits(ctx) + i*n_vocab
|
||||
# // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
|
||||
# // returns NULL for invalid ids.
|
||||
# LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
|
||||
@ctypes_function(
|
||||
"llama_get_logits_ith",
|
||||
|
@ -1874,8 +1947,12 @@ def llama_get_logits_ith(
|
|||
...
|
||||
|
||||
|
||||
# // Get all output token embeddings
|
||||
# // shape: [n_tokens*n_embd] (1-dimensional)
|
||||
# // Get all output token embeddings.
|
||||
# // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
|
||||
# // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
|
||||
# // in the order they have appeared in the batch.
|
||||
# // shape: [n_outputs*n_embd]
|
||||
# // Otherwise, returns NULL.
|
||||
# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
@ctypes_function(
|
||||
"llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
|
||||
|
@ -1886,9 +1963,10 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]
|
|||
...
|
||||
|
||||
|
||||
# // Get the embeddings for the ith token
|
||||
# // llama_get_embeddings(ctx) + i*n_embd
|
||||
# // Get the embeddings for the ith token. Equivalent to:
|
||||
# // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
|
||||
# // shape: [n_embd] (1-dimensional)
|
||||
# // returns NULL for invalid ids.
|
||||
# LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
|
||||
@ctypes_function(
|
||||
"llama_get_embeddings_ith",
|
||||
|
|
|
@ -84,6 +84,7 @@ class ChatCompletionFunction(TypedDict):
|
|||
class ChatCompletionResponseChoice(TypedDict):
|
||||
index: int
|
||||
message: "ChatCompletionResponseMessage"
|
||||
logprobs: Optional[CompletionLogprobs]
|
||||
finish_reason: Optional[str]
|
||||
|
||||
|
||||
|
|
|
@ -405,6 +405,18 @@ async def create_chat_completion(
|
|||
}
|
||||
},
|
||||
},
|
||||
"logprobs": {
|
||||
"summary": "Logprobs",
|
||||
"value": {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "What is the capital of France?"},
|
||||
],
|
||||
"logprobs": True,
|
||||
"top_logprobs": 10
|
||||
},
|
||||
},
|
||||
}
|
||||
),
|
||||
llama_proxy: LlamaProxy = Depends(get_llama_proxy),
|
||||
|
@ -493,7 +505,7 @@ async def tokenize(
|
|||
) -> TokenizeInputResponse:
|
||||
tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
|
||||
|
||||
return {"tokens": tokens}
|
||||
return TokenizeInputResponse(tokens=tokens)
|
||||
|
||||
|
||||
@router.post(
|
||||
|
@ -508,7 +520,7 @@ async def count_query_tokens(
|
|||
) -> TokenizeInputCountResponse:
|
||||
tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
|
||||
|
||||
return {"count": len(tokens)}
|
||||
return TokenizeInputCountResponse(count=len(tokens))
|
||||
|
||||
|
||||
@router.post(
|
||||
|
@ -523,4 +535,4 @@ async def detokenize(
|
|||
) -> DetokenizeInputResponse:
|
||||
text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8")
|
||||
|
||||
return {"text": text}
|
||||
return DetokenizeInputResponse(text=text)
|
||||
|
|
|
@ -175,6 +175,9 @@ class LlamaProxy:
|
|||
chat_handler=chat_handler,
|
||||
# Speculative Decoding
|
||||
draft_model=draft_model,
|
||||
# KV Cache Quantization
|
||||
type_k=settings.type_k,
|
||||
type_v=settings.type_v,
|
||||
# Tokenizer
|
||||
tokenizer=tokenizer,
|
||||
# Misc
|
||||
|
|
|
@ -159,6 +159,15 @@ class ModelSettings(BaseSettings):
|
|||
default=10,
|
||||
description="Number of tokens to predict using the draft model.",
|
||||
)
|
||||
# KV Cache Quantization
|
||||
type_k: Optional[int] = Field(
|
||||
default=None,
|
||||
description="Type of the key cache quantization.",
|
||||
)
|
||||
type_v: Optional[int] = Field(
|
||||
default=None,
|
||||
description="Type of the value cache quantization.",
|
||||
)
|
||||
# Misc
|
||||
verbose: bool = Field(
|
||||
default=True, description="Whether to print debug information."
|
||||
|
|
|
@ -130,7 +130,6 @@ class CreateCompletionRequest(BaseModel):
|
|||
presence_penalty: Optional[float] = presence_penalty_field
|
||||
frequency_penalty: Optional[float] = frequency_penalty_field
|
||||
logit_bias: Optional[Dict[str, float]] = Field(None)
|
||||
logprobs: Optional[int] = Field(None)
|
||||
seed: Optional[int] = Field(None)
|
||||
|
||||
# ignored or currently unsupported
|
||||
|
@ -209,6 +208,15 @@ class CreateChatCompletionRequest(BaseModel):
|
|||
default=None,
|
||||
description="The maximum number of tokens to generate. Defaults to inf",
|
||||
)
|
||||
logprobs: Optional[bool] = Field(
|
||||
default=False,
|
||||
description="Whether to output the logprobs or not. Default is False"
|
||||
)
|
||||
top_logprobs: Optional[int] = Field(
|
||||
default=None,
|
||||
ge=0,
|
||||
description="The number of logprobs to generate. If None, no logprobs are generated. logprobs needs to be set to True.",
|
||||
)
|
||||
temperature: float = temperature_field
|
||||
top_p: float = top_p_field
|
||||
min_p: float = min_p_field
|
||||
|
@ -268,7 +276,7 @@ class ModelList(TypedDict):
|
|||
|
||||
class TokenizeInputRequest(BaseModel):
|
||||
model: Optional[str] = model_field
|
||||
input: Optional[str] = Field(description="The input to tokenize.")
|
||||
input: str = Field(description="The input to tokenize.")
|
||||
|
||||
model_config = {
|
||||
"json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]}
|
||||
|
|
58
scripts/releases-to-pep-503.sh
Executable file
58
scripts/releases-to-pep-503.sh
Executable file
|
@ -0,0 +1,58 @@
|
|||
#!/bin/bash
#
# Build a PEP 503 style "simple" package index for llama-cpp-python out of the
# wheel files attached to the project's GitHub releases.
#
# Usage: releases-to-pep-503.sh [output_dir] [tag_pattern]
#   output_dir   directory the index is written into (default: index/whl/cpu)
#   tag_pattern  extended regex a release tag must match to be listed
#                (default: plain version tags such as 1.2.3 or v1.2.3)
#
# Layout produced:
#   <output_dir>/index.html                    root page linking to the project
#   <output_dir>/llama-cpp-python/index.html   one link per .whl release asset
#
# Requires: curl, jq.

repo_api="https://api.github.com/repos/abetlen/llama-cpp-python"

# Get output directory or default to index/whl/cpu
output_dir=${1:-"index/whl/cpu"}

# Get pattern from second arg or default to valid python package version pattern
pattern=${2:-"^[v]?[0-9]+\.[0-9]+\.[0-9]+$"}

# Create the output directory and enter it. Abort if either step fails so the
# index files are never written into whatever directory we happened to be in.
mkdir -p "$output_dir" || exit 1
pushd "$output_dir" || exit 1

# Create the root index html file
cat > index.html <<'EOF'
<!DOCTYPE html>
<html>
 <head></head>
 <body>
 <a href="llama-cpp-python/">llama-cpp-python</a>
 <br>
 </body>
</html>

EOF

# Create the llama-cpp-python directory and enter it
mkdir -p llama-cpp-python || exit 1
pushd llama-cpp-python || exit 1

# Start the project index html file; the release links are appended below
cat > index.html <<'EOF'
<!DOCTYPE html>
<html>
 <body>
 <h1>Links for llama-cpp-python</h1>
EOF

# Get all release tags, one per line.
# NOTE(review): the GitHub releases listing is paginated and only the first
# page is requested here; confirm older releases do not need to be indexed.
releases=$(curl -s "$repo_api/releases" | jq -r '.[].tag_name')

# Filter releases by pattern. The pattern is quoted so the shell does not
# treat its brackets as a filename glob before grep sees it.
releases=$(printf '%s\n' "$releases" | grep -E -- "$pattern")

# For each release, list every wheel asset.
# Word splitting of $releases is intentional: one tag per word.
for release in $releases; do
    assets=$(curl -s "$repo_api/releases/tags/$release" | jq -r '.assets')
    echo " <h2>$release</h2>" >> index.html

    # Read the download URLs line by line instead of re-splitting the JSON
    # through an unquoted echo, which would glob-expand its contents.
    while IFS= read -r asset; do
        if [[ $asset == *".whl" ]]; then
            echo " <a href=\"$asset\">$asset</a>" >> index.html
            echo " <br>" >> index.html
        fi
    done < <(jq -r '.[].browser_download_url' <<< "$assets")
done

# Close the project index html file
cat >> index.html <<'EOF'
 </body>
</html>

EOF
|
2
vendor/llama.cpp
vendored
2
vendor/llama.cpp
vendored
|
@ -1 +1 @@
|
|||
Subproject commit 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652
|
||||
Subproject commit 60cdf40cc32f0ad4cb11e0ca8fd38f3b93d8d640
|
Loading…
Reference in a new issue