Compare commits
7 commits
64058abaa0
...
5f5ea0a49c
Author | SHA1 | Date | |
---|---|---|---|
5f5ea0a49c | |||
|
8401c6f2d1 | ||
|
9e396b3ebd | ||
|
5af81634cb | ||
|
320a5d7ea5 | ||
|
dbcf64cf07 | ||
|
e342161371 |
12 changed files with 240 additions and 173 deletions
4
.github/workflows/build-and-release.yaml
vendored
4
.github/workflows/build-and-release.yaml
vendored
|
@@ -29,7 +29,7 @@ jobs:
|
||||||
python -m pip install -e .[all]
|
python -m pip install -e .[all]
|
||||||
|
|
||||||
- name: Build wheels
|
- name: Build wheels
|
||||||
uses: pypa/cibuildwheel@v2.18.1
|
uses: pypa/cibuildwheel@v2.19.0
|
||||||
env:
|
env:
|
||||||
# disable repair
|
# disable repair
|
||||||
CIBW_REPAIR_WHEEL_COMMAND: ""
|
CIBW_REPAIR_WHEEL_COMMAND: ""
|
||||||
|
@@ -56,7 +56,7 @@ jobs:
|
||||||
platforms: linux/arm64
|
platforms: linux/arm64
|
||||||
|
|
||||||
- name: Build wheels
|
- name: Build wheels
|
||||||
uses: pypa/cibuildwheel@v2.18.1
|
uses: pypa/cibuildwheel@v2.19.0
|
||||||
env:
|
env:
|
||||||
CIBW_SKIP: "*musllinux* pp*"
|
CIBW_SKIP: "*musllinux* pp*"
|
||||||
CIBW_REPAIR_WHEEL_COMMAND: ""
|
CIBW_REPAIR_WHEEL_COMMAND: ""
|
||||||
|
|
23
.github/workflows/build-wheels-cuda.yaml
vendored
23
.github/workflows/build-wheels-cuda.yaml
vendored
|
@@ -20,8 +20,8 @@ jobs:
|
||||||
id: set-matrix
|
id: set-matrix
|
||||||
run: |
|
run: |
|
||||||
$matrix = @{
|
$matrix = @{
|
||||||
'os' = @('ubuntu-20.04', 'windows-latest')
|
'os' = @('ubuntu-latest', 'windows-latest')
|
||||||
'pyver' = @("3.10", "3.11", "3.12")
|
'pyver' = @("3.9", "3.10", "3.11", "3.12")
|
||||||
'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1")
|
'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1")
|
||||||
'releasetag' = @("basic")
|
'releasetag' = @("basic")
|
||||||
}
|
}
|
||||||
|
@@ -50,6 +50,7 @@ jobs:
|
||||||
- uses: actions/setup-python@v5
|
- uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.pyver }}
|
python-version: ${{ matrix.pyver }}
|
||||||
|
cache: 'pip'
|
||||||
|
|
||||||
- name: Setup Mamba
|
- name: Setup Mamba
|
||||||
uses: conda-incubator/setup-miniconda@v3.0.4
|
uses: conda-incubator/setup-miniconda@v3.0.4
|
||||||
|
@@ -109,15 +110,15 @@ jobs:
|
||||||
$env:VERBOSE = '1'
|
$env:VERBOSE = '1'
|
||||||
$env:CMAKE_ARGS = '-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all'
|
$env:CMAKE_ARGS = '-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all'
|
||||||
$env:CMAKE_ARGS = "-DLLAMA_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS"
|
$env:CMAKE_ARGS = "-DLLAMA_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS"
|
||||||
if ($env:AVXVER -eq 'AVX') {
|
# if ($env:AVXVER -eq 'AVX') {
|
||||||
$env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
|
$env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
|
||||||
}
|
# }
|
||||||
if ($env:AVXVER -eq 'AVX512') {
|
# if ($env:AVXVER -eq 'AVX512') {
|
||||||
$env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX512=on'
|
# $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX512=on'
|
||||||
}
|
# }
|
||||||
if ($env:AVXVER -eq 'basic') {
|
# if ($env:AVXVER -eq 'basic') {
|
||||||
$env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
|
# $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
|
||||||
}
|
# }
|
||||||
python -m build --wheel
|
python -m build --wheel
|
||||||
# write the build tag to the output
|
# write the build tag to the output
|
||||||
Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV
|
Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV
|
||||||
|
|
89
.github/workflows/build-wheels-metal.yaml
vendored
89
.github/workflows/build-wheels-metal.yaml
vendored
|
@@ -6,81 +6,60 @@ permissions:
|
||||||
contents: write
|
contents: write
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
define_matrix:
|
|
||||||
name: Define Build Matrix
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
outputs:
|
|
||||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
|
||||||
defaults:
|
|
||||||
run:
|
|
||||||
shell: pwsh
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: Define Job Output
|
|
||||||
id: set-matrix
|
|
||||||
run: |
|
|
||||||
$matrix = @{
|
|
||||||
'os' = @('macos-11', 'macos-12', 'macos-13')
|
|
||||||
'pyver' = @('3.10', '3.11', '3.12')
|
|
||||||
}
|
|
||||||
|
|
||||||
$matrixOut = ConvertTo-Json $matrix -Compress
|
|
||||||
Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
|
|
||||||
|
|
||||||
build_wheels:
|
build_wheels:
|
||||||
name: ${{ matrix.os }} Python ${{ matrix.pyver }}
|
name: Build wheels on ${{ matrix.os }}
|
||||||
needs: define_matrix
|
|
||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
strategy:
|
strategy:
|
||||||
matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
|
matrix:
|
||||||
env:
|
os: [macos-12, macos-13, macos-14]
|
||||||
OSVER: ${{ matrix.os }}
|
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: "recursive"
|
submodules: "recursive"
|
||||||
|
|
||||||
|
# Used to host cibuildwheel
|
||||||
- uses: actions/setup-python@v5
|
- uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.pyver }}
|
python-version: "3.12"
|
||||||
|
cache: 'pip'
|
||||||
|
|
||||||
- name: Install Dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python -m pip install build wheel cmake
|
python -m pip install --upgrade pip
|
||||||
|
python -m pip install -e .[all]
|
||||||
|
|
||||||
- name: Build Wheel
|
- name: Build wheels
|
||||||
run: |
|
uses: pypa/cibuildwheel@v2.18.1
|
||||||
XCODE15PATH="/Applications/Xcode_15.0.app/Contents/Developer"
|
env:
|
||||||
XCODE15BINPATH="${XCODE15PATH}/Toolchains/XcodeDefault.xctoolchain/usr/bin"
|
# disable repair
|
||||||
export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_METAL=on"
|
CIBW_REPAIR_WHEEL_COMMAND: ""
|
||||||
[[ "$OSVER" == "macos-13" ]] && export CC="${XCODE15BINPATH}/cc" && export CXX="${XCODE15BINPATH}/c++" && export MACOSX_DEPLOYMENT_TARGET="13.0"
|
CIBW_ARCHS: "arm64"
|
||||||
[[ "$OSVER" == "macos-12" ]] && export MACOSX_DEPLOYMENT_TARGET="12.0"
|
CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DLLAMA_METAL=on"
|
||||||
[[ "$OSVER" == "macos-11" ]] && export MACOSX_DEPLOYMENT_TARGET="11.0"
|
CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*"
|
||||||
|
with:
|
||||||
|
package-dir: .
|
||||||
|
output-dir: wheelhouse2
|
||||||
|
|
||||||
export CMAKE_OSX_ARCHITECTURES="arm64" && export ARCHFLAGS="-arch arm64"
|
- uses: actions/upload-artifact@v4
|
||||||
VERBOSE=1 python -m build --wheel
|
with:
|
||||||
|
name: wheels-mac_${{ matrix.os }}
|
||||||
|
path: ./wheelhouse2/*.whl
|
||||||
|
|
||||||
if [[ "$OSVER" == "macos-13" ]]; then
|
release:
|
||||||
export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
|
name: Release
|
||||||
export MACOSX_DEPLOYMENT_TARGET="14.0"
|
needs: [build_wheels]
|
||||||
VERBOSE=1 python -m build --wheel
|
runs-on: ubuntu-latest
|
||||||
fi
|
|
||||||
|
|
||||||
for file in ./dist/*.whl; do cp "$file" "${file/arm64.whl/aarch64.whl}"; done
|
steps:
|
||||||
|
- uses: actions/download-artifact@v4
|
||||||
export CMAKE_OSX_ARCHITECTURES="x86_64" && export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_METAL=on" && export ARCHFLAGS="-arch x86_64"
|
with:
|
||||||
VERBOSE=1 python -m build --wheel
|
merge-multiple: true
|
||||||
|
path: dist2
|
||||||
if [[ "$OSVER" == "macos-13" ]]; then
|
|
||||||
export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
|
|
||||||
export MACOSX_DEPLOYMENT_TARGET="14.0"
|
|
||||||
VERBOSE=1 python -m build --wheel
|
|
||||||
fi
|
|
||||||
|
|
||||||
- uses: softprops/action-gh-release@v2
|
- uses: softprops/action-gh-release@v2
|
||||||
with:
|
with:
|
||||||
files: dist/*
|
files: dist2/*
|
||||||
# set release name to <tag>-metal
|
# set release name to <tag>-metal
|
||||||
tag_name: ${{ github.ref_name }}-metal
|
tag_name: ${{ github.ref_name }}-metal
|
||||||
env:
|
env:
|
||||||
|
|
9
.github/workflows/publish-to-test.yaml
vendored
9
.github/workflows/publish-to-test.yaml
vendored
|
@@ -22,7 +22,8 @@ jobs:
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: "3.8"
|
python-version: "3.11"
|
||||||
|
cache: 'pip'
|
||||||
- name: Append Dev Version to __version__
|
- name: Append Dev Version to __version__
|
||||||
run: |
|
run: |
|
||||||
DEV_VERSION=${{ github.event.inputs.dev_version }}
|
DEV_VERSION=${{ github.event.inputs.dev_version }}
|
||||||
|
@@ -31,11 +32,11 @@ jobs:
|
||||||
sed -i 's/__version__ = \".*\"/__version__ = \"'"${NEW_VERSION}"'\"/' llama_cpp/__init__.py
|
sed -i 's/__version__ = \".*\"/__version__ = \"'"${NEW_VERSION}"'\"/' llama_cpp/__init__.py
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python3 -m pip install --upgrade pip build
|
python -m pip install --upgrade pip build
|
||||||
python3 -m pip install -e .[all]
|
python -m pip install -e .[all]
|
||||||
- name: Build source distribution
|
- name: Build source distribution
|
||||||
run: |
|
run: |
|
||||||
python3 -m build --sdist
|
python -m build --sdist
|
||||||
- name: Publish to Test PyPI
|
- name: Publish to Test PyPI
|
||||||
uses: pypa/gh-action-pypi-publish@release/v1
|
uses: pypa/gh-action-pypi-publish@release/v1
|
||||||
with:
|
with:
|
||||||
|
|
8
.github/workflows/publish.yaml
vendored
8
.github/workflows/publish.yaml
vendored
|
@@ -16,14 +16,14 @@ jobs:
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: "3.8"
|
python-version: "3.9"
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python3 -m pip install --upgrade pip build
|
python -m pip install --upgrade pip build
|
||||||
python3 -m pip install -e .[all]
|
python -m pip install -e .[all]
|
||||||
- name: Build source distribution
|
- name: Build source distribution
|
||||||
run: |
|
run: |
|
||||||
python3 -m build --sdist
|
python -m build --sdist
|
||||||
- name: Publish distribution to PyPI
|
- name: Publish distribution to PyPI
|
||||||
# TODO: move to tag based releases
|
# TODO: move to tag based releases
|
||||||
# if: startsWith(github.ref, 'refs/tags')
|
# if: startsWith(github.ref, 'refs/tags')
|
||||||
|
|
27
.github/workflows/test-pypi.yaml
vendored
27
.github/workflows/test-pypi.yaml
vendored
|
@@ -8,57 +8,60 @@ jobs:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
|
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
|
cache: 'pip'
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python3 -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
python3 -m pip install --verbose llama-cpp-python[all]
|
python -m pip install --verbose llama-cpp-python[all]
|
||||||
- name: Test with pytest
|
- name: Test with pytest
|
||||||
run: |
|
run: |
|
||||||
python3 -c "import llama_cpp"
|
python -c "import llama_cpp"
|
||||||
|
|
||||||
build-windows:
|
build-windows:
|
||||||
|
|
||||||
runs-on: windows-latest
|
runs-on: windows-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
|
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
|
cache: 'pip'
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python3 -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
python3 -m pip install --verbose llama-cpp-python[all]
|
python -m pip install --verbose llama-cpp-python[all]
|
||||||
- name: Test with pytest
|
- name: Test with pytest
|
||||||
run: |
|
run: |
|
||||||
python3 -c "import llama_cpp"
|
python -c "import llama_cpp"
|
||||||
|
|
||||||
build-macos:
|
build-macos:
|
||||||
|
|
||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
|
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
|
cache: 'pip'
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python3 -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
python3 -m pip install --verbose llama-cpp-python[all]
|
python -m pip install --verbose llama-cpp-python[all]
|
||||||
- name: Test with pytest
|
- name: Test with pytest
|
||||||
run: |
|
run: |
|
||||||
python3 -c "import llama_cpp"
|
python -c "import llama_cpp"
|
||||||
|
|
47
.github/workflows/test.yaml
vendored
47
.github/workflows/test.yaml
vendored
|
@@ -14,7 +14,7 @@ jobs:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
|
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
@@ -24,20 +24,21 @@ jobs:
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
|
cache: 'pip'
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python3 -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
python3 -m pip install .[all] -v
|
python -m pip install .[all] -v
|
||||||
- name: Test with pytest
|
- name: Test with pytest
|
||||||
run: |
|
run: |
|
||||||
python3 -m pytest
|
python -m pytest
|
||||||
|
|
||||||
build-windows:
|
build-windows:
|
||||||
|
|
||||||
runs-on: windows-latest
|
runs-on: windows-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
|
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
@@ -47,20 +48,21 @@ jobs:
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
|
cache: 'pip'
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python3 -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
python3 -m pip install .[all] -v
|
python -m pip install .[all] -v
|
||||||
- name: Test with pytest
|
- name: Test with pytest
|
||||||
run: |
|
run: |
|
||||||
python3 -m pytest
|
python -m pytest
|
||||||
|
|
||||||
build-macos:
|
build-macos:
|
||||||
|
|
||||||
runs-on: macos-13
|
runs-on: macos-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
|
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
@@ -70,13 +72,14 @@ jobs:
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
|
cache: 'pip'
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python3 -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
python3 -m pip install .[all] --verbose
|
python -m pip install .[all] --verbose
|
||||||
- name: Test with pytest
|
- name: Test with pytest
|
||||||
run: |
|
run: |
|
||||||
python3 -m pytest
|
python -m pytest
|
||||||
|
|
||||||
# build-linux-opencl:
|
# build-linux-opencl:
|
||||||
|
|
||||||
|
@@ -98,29 +101,29 @@ jobs:
|
||||||
# sudo apt-get install -y --no-install-recommends llvm intel-oneapi-runtime-opencl intel-oneapi-runtime-compilers libclblast-dev
|
# sudo apt-get install -y --no-install-recommends llvm intel-oneapi-runtime-opencl intel-oneapi-runtime-compilers libclblast-dev
|
||||||
# - name: Install dependencies
|
# - name: Install dependencies
|
||||||
# run: |
|
# run: |
|
||||||
# python3 -m pip install --upgrade pip
|
# python -m pip install --upgrade pip
|
||||||
# CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install .[all] --verbose
|
# CMAKE_ARGS="-DLLAMA_CLBLAST=on" python -m pip install .[all] --verbose
|
||||||
# - name: Test with pytest
|
# - name: Test with pytest
|
||||||
# run: |
|
# run: |
|
||||||
# python3 -m pytest
|
# python -m pytest
|
||||||
|
|
||||||
|
|
||||||
build-macos-metal:
|
build-macos-metal:
|
||||||
|
|
||||||
runs-on: macos-13
|
runs-on: macos-latest
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: "recursive"
|
submodules: "recursive"
|
||||||
- name: Set up Python 3.8
|
- name: Set up Python 3.9
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: "3.8"
|
python-version: "3.9"
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python3 -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install .[all] --verbose
|
CMAKE_ARGS="-DLLAMA_METAL=on" python -m pip install .[all] --verbose
|
||||||
- name: Test with pytest
|
- name: Test with pytest
|
||||||
run: |
|
run: |
|
||||||
python3 -m pytest
|
python -m pytest
|
||||||
|
|
33
examples/high_level_api/high_level_api_infill.py
Normal file
33
examples/high_level_api/high_level_api_infill.py
Normal file
|
@@ -0,0 +1,33 @@
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from llama_cpp import Llama
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin")
|
||||||
|
parser.add_argument("-p", "--prompt", type=str, default="def add(")
|
||||||
|
parser.add_argument("-s", "--suffix", type=str, default="\n return sum\n\n")
|
||||||
|
parser.add_argument("-i", "--spm-infill", action='store_true')
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
llm = Llama(model_path=args.model, n_gpu_layers=-1, spm_infill=args.spm_infill)
|
||||||
|
|
||||||
|
output = llm.create_completion(
|
||||||
|
temperature = 0.0,
|
||||||
|
repeat_penalty = 1.0,
|
||||||
|
prompt = args.prompt,
|
||||||
|
suffix = args.suffix,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Models sometimes repeat suffix in response, attempt to filter that
|
||||||
|
response = output["choices"][0]["text"]
|
||||||
|
response_stripped = response.rstrip()
|
||||||
|
unwanted_response_suffix = args.suffix.rstrip()
|
||||||
|
unwanted_response_length = len(unwanted_response_suffix)
|
||||||
|
|
||||||
|
filtered = False
|
||||||
|
if unwanted_response_suffix and response_stripped[-unwanted_response_length:] == unwanted_response_suffix:
|
||||||
|
response = response_stripped[:-unwanted_response_length]
|
||||||
|
filtered = True
|
||||||
|
|
||||||
|
print(f"Fill-in-Middle completion{' (filtered)' if filtered else ''}:\n\n{args.prompt}\033[32m{response}\033[{'33' if filtered else '0'}m{args.suffix}\033[0m")
|
||||||
|
|
|
@@ -9,6 +9,7 @@ from typing import (
|
||||||
Sequence,
|
Sequence,
|
||||||
)
|
)
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
from contextlib import ExitStack
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import numpy.typing as npt
|
import numpy.typing as npt
|
||||||
|
@@ -27,9 +28,6 @@ class _LlamaModel:
|
||||||
"""Intermediate Python wrapper for a llama.cpp llama_model.
|
"""Intermediate Python wrapper for a llama.cpp llama_model.
|
||||||
NOTE: For stability it's recommended you use the Llama class instead."""
|
NOTE: For stability it's recommended you use the Llama class instead."""
|
||||||
|
|
||||||
_llama_free_model = None
|
|
||||||
# NOTE: this must be "saved" here to avoid exceptions when calling __del__
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
|
@@ -40,8 +38,7 @@ class _LlamaModel:
|
||||||
self.path_model = path_model
|
self.path_model = path_model
|
||||||
self.params = params
|
self.params = params
|
||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
|
self._exit_stack = ExitStack()
|
||||||
self._llama_free_model = llama_cpp._lib.llama_free_model # type: ignore
|
|
||||||
|
|
||||||
self.model = None
|
self.model = None
|
||||||
|
|
||||||
|
@@ -56,11 +53,17 @@ class _LlamaModel:
|
||||||
if self.model is None:
|
if self.model is None:
|
||||||
raise ValueError(f"Failed to load model from file: {path_model}")
|
raise ValueError(f"Failed to load model from file: {path_model}")
|
||||||
|
|
||||||
def __del__(self):
|
def free_model():
|
||||||
if self.model is not None and self._llama_free_model is not None:
|
if self.model is None:
|
||||||
self._llama_free_model(self.model)
|
return
|
||||||
|
llama_cpp.llama_free_model(self.model)
|
||||||
self.model = None
|
self.model = None
|
||||||
|
|
||||||
|
self._exit_stack.callback(free_model)
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
self._exit_stack.close()
|
||||||
|
|
||||||
def vocab_type(self) -> int:
|
def vocab_type(self) -> int:
|
||||||
assert self.model is not None
|
assert self.model is not None
|
||||||
return llama_cpp.llama_vocab_type(self.model)
|
return llama_cpp.llama_vocab_type(self.model)
|
||||||
|
@@ -170,6 +173,14 @@ class _LlamaModel:
|
||||||
assert self.model is not None
|
assert self.model is not None
|
||||||
return llama_cpp.llama_token_eot(self.model)
|
return llama_cpp.llama_token_eot(self.model)
|
||||||
|
|
||||||
|
def add_bos_token(self) -> int:
|
||||||
|
assert self.model is not None
|
||||||
|
return llama_cpp.llama_add_bos_token(self.model)
|
||||||
|
|
||||||
|
def add_eos_token(self) -> int:
|
||||||
|
assert self.model is not None
|
||||||
|
return llama_cpp.llama_add_eos_token(self.model)
|
||||||
|
|
||||||
# Tokenization
|
# Tokenization
|
||||||
|
|
||||||
def tokenize(self, text: bytes, add_bos: bool, special: bool):
|
def tokenize(self, text: bytes, add_bos: bool, special: bool):
|
||||||
|
@@ -249,8 +260,6 @@ class _LlamaContext:
|
||||||
"""Intermediate Python wrapper for a llama.cpp llama_context.
|
"""Intermediate Python wrapper for a llama.cpp llama_context.
|
||||||
NOTE: For stability it's recommended you use the Llama class instead."""
|
NOTE: For stability it's recommended you use the Llama class instead."""
|
||||||
|
|
||||||
_llama_free = None
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
|
@@ -261,24 +270,28 @@ class _LlamaContext:
|
||||||
self.model = model
|
self.model = model
|
||||||
self.params = params
|
self.params = params
|
||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
|
self._exit_stack = ExitStack()
|
||||||
|
|
||||||
self._llama_free = llama_cpp._lib.llama_free # type: ignore
|
|
||||||
self.ctx = None
|
self.ctx = None
|
||||||
|
|
||||||
assert self.model.model is not None
|
assert self.model.model is not None
|
||||||
|
|
||||||
self.ctx = llama_cpp.llama_new_context_with_model(
|
self.ctx = llama_cpp.llama_new_context_with_model(self.model.model, self.params)
|
||||||
self.model.model, self.params
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.ctx is None:
|
if self.ctx is None:
|
||||||
raise ValueError("Failed to create llama_context")
|
raise ValueError("Failed to create llama_context")
|
||||||
|
|
||||||
def __del__(self):
|
def free_ctx():
|
||||||
if self.ctx is not None and self._llama_free is not None:
|
if self.ctx is None:
|
||||||
self._llama_free(self.ctx)
|
return
|
||||||
|
llama_cpp.llama_free(self.ctx)
|
||||||
self.ctx = None
|
self.ctx = None
|
||||||
|
|
||||||
|
self._exit_stack.callback(free_ctx)
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
self._exit_stack.close()
|
||||||
|
|
||||||
def n_ctx(self) -> int:
|
def n_ctx(self) -> int:
|
||||||
assert self.ctx is not None
|
assert self.ctx is not None
|
||||||
return llama_cpp.llama_n_ctx(self.ctx)
|
return llama_cpp.llama_n_ctx(self.ctx)
|
||||||
|
@@ -493,8 +506,6 @@ class _LlamaContext:
|
||||||
|
|
||||||
|
|
||||||
class _LlamaBatch:
|
class _LlamaBatch:
|
||||||
_llama_batch_free = None
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True
|
self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True
|
||||||
):
|
):
|
||||||
|
@@ -502,19 +513,24 @@ class _LlamaBatch:
|
||||||
self.embd = embd
|
self.embd = embd
|
||||||
self.n_seq_max = n_seq_max
|
self.n_seq_max = n_seq_max
|
||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
|
self._exit_stack = ExitStack()
|
||||||
self._llama_batch_free = llama_cpp._lib.llama_batch_free # type: ignore
|
|
||||||
|
|
||||||
self.batch = None
|
self.batch = None
|
||||||
self.batch = llama_cpp.llama_batch_init(
|
self.batch = llama_cpp.llama_batch_init(
|
||||||
self._n_tokens, self.embd, self.n_seq_max
|
self._n_tokens, self.embd, self.n_seq_max
|
||||||
)
|
)
|
||||||
|
|
||||||
def __del__(self):
|
def free_batch():
|
||||||
if self.batch is not None and self._llama_batch_free is not None:
|
if self.batch is None:
|
||||||
self._llama_batch_free(self.batch)
|
return
|
||||||
|
llama_cpp.llama_batch_free(self.batch)
|
||||||
self.batch = None
|
self.batch = None
|
||||||
|
|
||||||
|
self._exit_stack.callback(free_batch)
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
self._exit_stack.close()
|
||||||
|
|
||||||
def n_tokens(self) -> int:
|
def n_tokens(self) -> int:
|
||||||
assert self.batch is not None
|
assert self.batch is not None
|
||||||
return self.batch.n_tokens
|
return self.batch.n_tokens
|
||||||
|
|
|
@@ -9,7 +9,9 @@ import ctypes
|
||||||
import typing
|
import typing
|
||||||
import fnmatch
|
import fnmatch
|
||||||
import warnings
|
import warnings
|
||||||
|
import contextlib
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
|
from types import TracebackType
|
||||||
|
|
||||||
from typing import (
|
from typing import (
|
||||||
List,
|
List,
|
||||||
|
@@ -21,6 +23,7 @@ from typing import (
|
||||||
Deque,
|
Deque,
|
||||||
Callable,
|
Callable,
|
||||||
Dict,
|
Dict,
|
||||||
|
Type,
|
||||||
)
|
)
|
||||||
from collections import deque
|
from collections import deque
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@@ -115,6 +118,7 @@ class Llama:
|
||||||
type_k: Optional[int] = None,
|
type_k: Optional[int] = None,
|
||||||
type_v: Optional[int] = None,
|
type_v: Optional[int] = None,
|
||||||
# Misc
|
# Misc
|
||||||
|
spm_infill: bool = False,
|
||||||
verbose: bool = True,
|
verbose: bool = True,
|
||||||
# Extra Params
|
# Extra Params
|
||||||
**kwargs, # type: ignore
|
**kwargs, # type: ignore
|
||||||
|
@@ -185,6 +189,7 @@ class Llama:
|
||||||
verbose: Print verbose output to stderr.
|
verbose: Print verbose output to stderr.
|
||||||
type_k: KV cache data type for K (default: f16)
|
type_k: KV cache data type for K (default: f16)
|
||||||
type_v: KV cache data type for V (default: f16)
|
type_v: KV cache data type for V (default: f16)
|
||||||
|
spm_infill: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
ValueError: If the model path does not exist.
|
ValueError: If the model path does not exist.
|
||||||
|
@@ -343,12 +348,16 @@ class Llama:
|
||||||
self.lora_scale = lora_scale
|
self.lora_scale = lora_scale
|
||||||
self.lora_path = lora_path
|
self.lora_path = lora_path
|
||||||
|
|
||||||
|
self.spm_infill = spm_infill
|
||||||
|
|
||||||
if not os.path.exists(model_path):
|
if not os.path.exists(model_path):
|
||||||
raise ValueError(f"Model path does not exist: {model_path}")
|
raise ValueError(f"Model path does not exist: {model_path}")
|
||||||
|
|
||||||
self._model = _LlamaModel(
|
self._stack = contextlib.ExitStack()
|
||||||
|
|
||||||
|
self._model = self._stack.enter_context(contextlib.closing(_LlamaModel(
|
||||||
path_model=self.model_path, params=self.model_params, verbose=self.verbose
|
path_model=self.model_path, params=self.model_params, verbose=self.verbose
|
||||||
)
|
)))
|
||||||
|
|
||||||
# Override tokenizer
|
# Override tokenizer
|
||||||
self.tokenizer_ = tokenizer or LlamaTokenizer(self)
|
self.tokenizer_ = tokenizer or LlamaTokenizer(self)
|
||||||
|
@@ -360,18 +369,18 @@ class Llama:
|
||||||
self.context_params.n_ctx = self._model.n_ctx_train()
|
self.context_params.n_ctx = self._model.n_ctx_train()
|
||||||
self.context_params.n_batch = self.n_batch
|
self.context_params.n_batch = self.n_batch
|
||||||
|
|
||||||
self._ctx = _LlamaContext(
|
self._ctx = self._stack.enter_context(contextlib.closing(_LlamaContext(
|
||||||
model=self._model,
|
model=self._model,
|
||||||
params=self.context_params,
|
params=self.context_params,
|
||||||
verbose=self.verbose,
|
verbose=self.verbose,
|
||||||
)
|
)))
|
||||||
|
|
||||||
self._batch = _LlamaBatch(
|
self._batch = self._stack.enter_context(contextlib.closing(_LlamaBatch(
|
||||||
n_tokens=self.n_batch,
|
n_tokens=self.n_batch,
|
||||||
embd=0,
|
embd=0,
|
||||||
n_seq_max=self.context_params.n_ctx,
|
n_seq_max=self.context_params.n_ctx,
|
||||||
verbose=self.verbose,
|
verbose=self.verbose,
|
||||||
)
|
)))
|
||||||
|
|
||||||
if self.lora_path:
|
if self.lora_path:
|
||||||
if self._model.apply_lora_from_file(
|
if self._model.apply_lora_from_file(
|
||||||
|
@ -972,14 +981,33 @@ class Llama:
|
||||||
|
|
||||||
completion_id: str = f"cmpl-{str(uuid.uuid4())}"
|
completion_id: str = f"cmpl-{str(uuid.uuid4())}"
|
||||||
created: int = int(time.time())
|
created: int = int(time.time())
|
||||||
|
bos_token_id: int = self.token_bos()
|
||||||
|
cls_token_id: int = self._model.token_cls()
|
||||||
|
sep_token_id: int = self._model.token_sep()
|
||||||
prefix_token_id: int = self._model.token_prefix()
|
prefix_token_id: int = self._model.token_prefix()
|
||||||
middle_token_id: int = self._model.token_middle()
|
middle_token_id: int = self._model.token_middle()
|
||||||
suffix_token_id: int = self._model.token_suffix()
|
suffix_token_id: int = self._model.token_suffix()
|
||||||
|
add_space_prefix: bool = self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true"
|
||||||
|
bos_tokens: List[int] = [cls_token_id if cls_token_id != -1 else bos_token_id]
|
||||||
|
eos_tokens: List[int] = [sep_token_id if sep_token_id != -1 else self.token_eos()]
|
||||||
|
|
||||||
|
if (isinstance(prompt, list) and suffix is None) or self._model.add_bos_token() == 0 or bos_tokens[:1] == [-1]:
|
||||||
|
bos_tokens = []
|
||||||
|
|
||||||
|
if (isinstance(prompt, list) and suffix is None) or (self._model.add_eos_token() != 1 and sep_token_id == -1):
|
||||||
|
eos_tokens = []
|
||||||
|
|
||||||
|
suffix_space_prefix: int = 0
|
||||||
|
# Tokenizer hack to remove leading space
|
||||||
|
if add_space_prefix and suffix_token_id >= 0 and suffix:
|
||||||
|
suffix = "☺" + suffix
|
||||||
|
suffix_space_prefix = 2
|
||||||
|
|
||||||
# If prompt is empty, initialize completion with BOS token to avoid
|
# If prompt is empty, initialize completion with BOS token to avoid
|
||||||
# detokenization including a space at the beginning of the completion
|
# detokenization including a space at the beginning of the completion
|
||||||
completion_tokens: List[int] = [] if len(prompt) > 0 else [self.token_bos()]
|
completion_tokens: List[int] = [] if len(prompt) > 0 else [bos_token_id]
|
||||||
# Add blank space to start of prompt to match OG llama tokenizer
|
# Add blank space to start of prompt to match OG llama tokenizer
|
||||||
prompt_tokens: List[int] = (
|
prefix_tokens: List[int] = (
|
||||||
(
|
(
|
||||||
[prefix_token_id]
|
[prefix_token_id]
|
||||||
if prefix_token_id >= 0 and suffix is not None
|
if prefix_token_id >= 0 and suffix is not None
|
||||||
|
@ -988,38 +1016,33 @@ class Llama:
|
||||||
+
|
+
|
||||||
(
|
(
|
||||||
(
|
(
|
||||||
self.tokenize(prompt.encode("utf-8"), add_bos=(prefix_token_id < 0 or suffix is None), special=(prefix_token_id < 0 or suffix is None))
|
self.tokenize(prompt.encode("utf-8"), add_bos=False, special=(prefix_token_id < 0 or suffix is None))
|
||||||
if prompt != ""
|
if prompt != ""
|
||||||
else (
|
else []
|
||||||
[]
|
|
||||||
if prefix_token_id >= 0 and suffix is not None
|
|
||||||
else [self.token_bos()]
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
if isinstance(prompt, str)
|
if isinstance(prompt, str)
|
||||||
else prompt
|
else prompt
|
||||||
)
|
)
|
||||||
+
|
|
||||||
(
|
|
||||||
(
|
|
||||||
[suffix_token_id]
|
|
||||||
+
|
|
||||||
(
|
|
||||||
self.tokenize(suffix.encode("utf-8"), add_bos=False, special=False)
|
|
||||||
if suffix
|
|
||||||
else []
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if suffix_token_id >= 0 and suffix is not None
|
|
||||||
else []
|
|
||||||
)
|
|
||||||
+
|
|
||||||
(
|
|
||||||
[middle_token_id]
|
|
||||||
if middle_token_id >= 0 and suffix is not None
|
|
||||||
else []
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
suffix_tokens: List[int] = (
|
||||||
|
(
|
||||||
|
[suffix_token_id]
|
||||||
|
+
|
||||||
|
(
|
||||||
|
self.tokenize(suffix.encode("utf-8"), add_bos=False, special=False)[suffix_space_prefix:]
|
||||||
|
if suffix
|
||||||
|
else []
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if suffix_token_id >= 0 and suffix is not None
|
||||||
|
else []
|
||||||
|
)
|
||||||
|
middle_tokens: List[int] = (
|
||||||
|
[middle_token_id]
|
||||||
|
if middle_token_id >= 0 and suffix is not None
|
||||||
|
else []
|
||||||
|
)
|
||||||
|
prompt_tokens: List[int] = bos_tokens + ((suffix_tokens + prefix_tokens + middle_tokens) if self.spm_infill else (prefix_tokens + suffix_tokens + middle_tokens)) + eos_tokens
|
||||||
text: bytes = b""
|
text: bytes = b""
|
||||||
returned_tokens: int = 0
|
returned_tokens: int = 0
|
||||||
stop = (
|
stop = (
|
||||||
|
@ -1176,7 +1199,7 @@ class Llama:
|
||||||
# not sure how to handle this branch when dealing
|
# not sure how to handle this branch when dealing
|
||||||
# with CJK output, so keep it unchanged
|
# with CJK output, so keep it unchanged
|
||||||
for token in remaining_tokens:
|
for token in remaining_tokens:
|
||||||
if token == self.token_bos():
|
if token == bos_token_id:
|
||||||
continue
|
continue
|
||||||
token_end_position += len(self.detokenize([token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens]))
|
token_end_position += len(self.detokenize([token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens]))
|
||||||
# Check if stop sequence is in the token
|
# Check if stop sequence is in the token
|
||||||
|
@ -1303,7 +1326,7 @@ class Llama:
|
||||||
|
|
||||||
logprobs_or_none: Optional[CompletionLogprobs] = None
|
logprobs_or_none: Optional[CompletionLogprobs] = None
|
||||||
if logprobs is not None:
|
if logprobs is not None:
|
||||||
if token == self.token_bos():
|
if token == bos_token_id:
|
||||||
continue
|
continue
|
||||||
token_str = self.detokenize([token]).decode(
|
token_str = self.detokenize([token]).decode(
|
||||||
"utf-8", errors="ignore"
|
"utf-8", errors="ignore"
|
||||||
|
@ -1431,7 +1454,7 @@ class Llama:
|
||||||
for idx, (token, token_str, logprobs_token) in enumerate(
|
for idx, (token, token_str, logprobs_token) in enumerate(
|
||||||
zip(all_tokens, all_token_strs, all_logprobs)
|
zip(all_tokens, all_token_strs, all_logprobs)
|
||||||
):
|
):
|
||||||
if token == self.token_bos():
|
if token == bos_token_id:
|
||||||
continue
|
continue
|
||||||
text_offsets.append(
|
text_offsets.append(
|
||||||
text_offset
|
text_offset
|
||||||
|
@ -1858,6 +1881,7 @@ class Llama:
|
||||||
type_k=self.context_params.type_k,
|
type_k=self.context_params.type_k,
|
||||||
type_v=self.context_params.type_v,
|
type_v=self.context_params.type_v,
|
||||||
# Misc
|
# Misc
|
||||||
|
spm_infill=self.spm_infill,
|
||||||
verbose=self.verbose,
|
verbose=self.verbose,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -1940,6 +1964,10 @@ class Llama:
|
||||||
"""Return the pooling type."""
|
"""Return the pooling type."""
|
||||||
return self._ctx.pooling_type()
|
return self._ctx.pooling_type()
|
||||||
|
|
||||||
|
def close(self) -> None:
|
||||||
|
"""Explicitly free the model from memory."""
|
||||||
|
self._stack.close()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def logits_to_logprobs(
|
def logits_to_logprobs(
|
||||||
logits: Union[npt.NDArray[np.single], List], axis: int = -1
|
logits: Union[npt.NDArray[np.single], List], axis: int = -1
|
||||||
|
|
|
@ -44,6 +44,8 @@ class LlamaProxy:
|
||||||
if self._current_model is not None:
|
if self._current_model is not None:
|
||||||
return self._current_model
|
return self._current_model
|
||||||
|
|
||||||
|
if self._current_model:
|
||||||
|
self._current_model.close()
|
||||||
self._current_model = None
|
self._current_model = None
|
||||||
|
|
||||||
settings = self._model_settings_dict[model]
|
settings = self._model_settings_dict[model]
|
||||||
|
@ -65,6 +67,7 @@ class LlamaProxy:
|
||||||
|
|
||||||
def free(self):
|
def free(self):
|
||||||
if self._current_model:
|
if self._current_model:
|
||||||
|
self._current_model.close()
|
||||||
del self._current_model
|
del self._current_model
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
2
vendor/llama.cpp
vendored
2
vendor/llama.cpp
vendored
|
@ -1 +1 @@
|
||||||
Subproject commit fd5ea0f897ecb3659d6c269ef6f3d833e865ead7
|
Subproject commit 172c8256840ffd882ab9992ecedbb587d9b21f15
|
Loading…
Reference in a new issue