Compare commits

..

No commits in common. "5f5ea0a49c1d8f408748cb7ec3b22a86f73fa4fc" and "64058abaa08e6923531ddd137b94e11acbcfb7e2" have entirely different histories.

12 changed files with 169 additions and 236 deletions

View file

@ -29,7 +29,7 @@ jobs:
python -m pip install -e .[all] python -m pip install -e .[all]
- name: Build wheels - name: Build wheels
uses: pypa/cibuildwheel@v2.19.0 uses: pypa/cibuildwheel@v2.18.1
env: env:
# disable repair # disable repair
CIBW_REPAIR_WHEEL_COMMAND: "" CIBW_REPAIR_WHEEL_COMMAND: ""
@ -56,7 +56,7 @@ jobs:
platforms: linux/arm64 platforms: linux/arm64
- name: Build wheels - name: Build wheels
uses: pypa/cibuildwheel@v2.19.0 uses: pypa/cibuildwheel@v2.18.1
env: env:
CIBW_SKIP: "*musllinux* pp*" CIBW_SKIP: "*musllinux* pp*"
CIBW_REPAIR_WHEEL_COMMAND: "" CIBW_REPAIR_WHEEL_COMMAND: ""

View file

@ -20,8 +20,8 @@ jobs:
id: set-matrix id: set-matrix
run: | run: |
$matrix = @{ $matrix = @{
'os' = @('ubuntu-latest', 'windows-latest') 'os' = @('ubuntu-20.04', 'windows-latest')
'pyver' = @("3.9", "3.10", "3.11", "3.12") 'pyver' = @("3.10", "3.11", "3.12")
'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1") 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1")
'releasetag' = @("basic") 'releasetag' = @("basic")
} }
@ -50,7 +50,6 @@ jobs:
- uses: actions/setup-python@v5 - uses: actions/setup-python@v5
with: with:
python-version: ${{ matrix.pyver }} python-version: ${{ matrix.pyver }}
cache: 'pip'
- name: Setup Mamba - name: Setup Mamba
uses: conda-incubator/setup-miniconda@v3.0.4 uses: conda-incubator/setup-miniconda@v3.0.4
@ -110,15 +109,15 @@ jobs:
$env:VERBOSE = '1' $env:VERBOSE = '1'
$env:CMAKE_ARGS = '-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all' $env:CMAKE_ARGS = '-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all'
$env:CMAKE_ARGS = "-DLLAMA_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS" $env:CMAKE_ARGS = "-DLLAMA_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS"
# if ($env:AVXVER -eq 'AVX') { if ($env:AVXVER -eq 'AVX') {
$env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off' $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
# } }
# if ($env:AVXVER -eq 'AVX512') { if ($env:AVXVER -eq 'AVX512') {
# $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX512=on' $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX512=on'
# } }
# if ($env:AVXVER -eq 'basic') { if ($env:AVXVER -eq 'basic') {
# $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off' $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
# } }
python -m build --wheel python -m build --wheel
# write the build tag to the output # write the build tag to the output
Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV

View file

@ -6,60 +6,81 @@ permissions:
contents: write contents: write
jobs: jobs:
define_matrix:
name: Define Build Matrix
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
defaults:
run:
shell: pwsh
steps:
- name: Define Job Output
id: set-matrix
run: |
$matrix = @{
'os' = @('macos-11', 'macos-12', 'macos-13')
'pyver' = @('3.10', '3.11', '3.12')
}
$matrixOut = ConvertTo-Json $matrix -Compress
Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
build_wheels: build_wheels:
name: Build wheels on ${{ matrix.os }} name: ${{ matrix.os }} Python ${{ matrix.pyver }}
needs: define_matrix
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
strategy: strategy:
matrix: matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
os: [macos-12, macos-13, macos-14] env:
OSVER: ${{ matrix.os }}
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
with: with:
submodules: "recursive" submodules: "recursive"
# Used to host cibuildwheel
- uses: actions/setup-python@v5 - uses: actions/setup-python@v5
with: with:
python-version: "3.12" python-version: ${{ matrix.pyver }}
cache: 'pip'
- name: Install dependencies - name: Install Dependencies
run: | run: |
python -m pip install --upgrade pip python -m pip install build wheel cmake
python -m pip install -e .[all]
- name: Build wheels - name: Build Wheel
uses: pypa/cibuildwheel@v2.18.1 run: |
env: XCODE15PATH="/Applications/Xcode_15.0.app/Contents/Developer"
# disable repair XCODE15BINPATH="${XCODE15PATH}/Toolchains/XcodeDefault.xctoolchain/usr/bin"
CIBW_REPAIR_WHEEL_COMMAND: "" export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_METAL=on"
CIBW_ARCHS: "arm64" [[ "$OSVER" == "macos-13" ]] && export CC="${XCODE15BINPATH}/cc" && export CXX="${XCODE15BINPATH}/c++" && export MACOSX_DEPLOYMENT_TARGET="13.0"
CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DLLAMA_METAL=on" [[ "$OSVER" == "macos-12" ]] && export MACOSX_DEPLOYMENT_TARGET="12.0"
CIBW_BUILD: "cp39-* cp310-* cp311-* cp312-*" [[ "$OSVER" == "macos-11" ]] && export MACOSX_DEPLOYMENT_TARGET="11.0"
with:
package-dir: .
output-dir: wheelhouse2
- uses: actions/upload-artifact@v4 export CMAKE_OSX_ARCHITECTURES="arm64" && export ARCHFLAGS="-arch arm64"
with: VERBOSE=1 python -m build --wheel
name: wheels-mac_${{ matrix.os }}
path: ./wheelhouse2/*.whl
release: if [[ "$OSVER" == "macos-13" ]]; then
name: Release export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
needs: [build_wheels] export MACOSX_DEPLOYMENT_TARGET="14.0"
runs-on: ubuntu-latest VERBOSE=1 python -m build --wheel
fi
steps: for file in ./dist/*.whl; do cp "$file" "${file/arm64.whl/aarch64.whl}"; done
- uses: actions/download-artifact@v4
with: export CMAKE_OSX_ARCHITECTURES="x86_64" && export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_METAL=on" && export ARCHFLAGS="-arch x86_64"
merge-multiple: true VERBOSE=1 python -m build --wheel
path: dist2
if [[ "$OSVER" == "macos-13" ]]; then
export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
export MACOSX_DEPLOYMENT_TARGET="14.0"
VERBOSE=1 python -m build --wheel
fi
- uses: softprops/action-gh-release@v2 - uses: softprops/action-gh-release@v2
with: with:
files: dist2/* files: dist/*
# set release name to <tag>-metal # set release name to <tag>-metal
tag_name: ${{ github.ref_name }}-metal tag_name: ${{ github.ref_name }}-metal
env: env:

View file

@ -22,8 +22,7 @@ jobs:
- name: Set up Python - name: Set up Python
uses: actions/setup-python@v5 uses: actions/setup-python@v5
with: with:
python-version: "3.11" python-version: "3.8"
cache: 'pip'
- name: Append Dev Version to __version__ - name: Append Dev Version to __version__
run: | run: |
DEV_VERSION=${{ github.event.inputs.dev_version }} DEV_VERSION=${{ github.event.inputs.dev_version }}
@ -32,11 +31,11 @@ jobs:
sed -i 's/__version__ = \".*\"/__version__ = \"'"${NEW_VERSION}"'\"/' llama_cpp/__init__.py sed -i 's/__version__ = \".*\"/__version__ = \"'"${NEW_VERSION}"'\"/' llama_cpp/__init__.py
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip build python3 -m pip install --upgrade pip build
python -m pip install -e .[all] python3 -m pip install -e .[all]
- name: Build source distribution - name: Build source distribution
run: | run: |
python -m build --sdist python3 -m build --sdist
- name: Publish to Test PyPI - name: Publish to Test PyPI
uses: pypa/gh-action-pypi-publish@release/v1 uses: pypa/gh-action-pypi-publish@release/v1
with: with:

View file

@ -16,14 +16,14 @@ jobs:
- name: Set up Python - name: Set up Python
uses: actions/setup-python@v5 uses: actions/setup-python@v5
with: with:
python-version: "3.9" python-version: "3.8"
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip build python3 -m pip install --upgrade pip build
python -m pip install -e .[all] python3 -m pip install -e .[all]
- name: Build source distribution - name: Build source distribution
run: | run: |
python -m build --sdist python3 -m build --sdist
- name: Publish distribution to PyPI - name: Publish distribution to PyPI
# TODO: move to tag based releases # TODO: move to tag based releases
# if: startsWith(github.ref, 'refs/tags') # if: startsWith(github.ref, 'refs/tags')

View file

@ -8,60 +8,57 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"] python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
steps: steps:
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5 uses: actions/setup-python@v5
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
cache: 'pip'
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python3 -m pip install --upgrade pip
python -m pip install --verbose llama-cpp-python[all] python3 -m pip install --verbose llama-cpp-python[all]
- name: Test with pytest - name: Test with pytest
run: | run: |
python -c "import llama_cpp" python3 -c "import llama_cpp"
build-windows: build-windows:
runs-on: windows-latest runs-on: windows-latest
strategy: strategy:
matrix: matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"] python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
steps: steps:
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5 uses: actions/setup-python@v5
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
cache: 'pip'
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python3 -m pip install --upgrade pip
python -m pip install --verbose llama-cpp-python[all] python3 -m pip install --verbose llama-cpp-python[all]
- name: Test with pytest - name: Test with pytest
run: | run: |
python -c "import llama_cpp" python3 -c "import llama_cpp"
build-macos: build-macos:
runs-on: macos-latest runs-on: macos-latest
strategy: strategy:
matrix: matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"] python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
steps: steps:
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5 uses: actions/setup-python@v5
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
cache: 'pip'
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python3 -m pip install --upgrade pip
python -m pip install --verbose llama-cpp-python[all] python3 -m pip install --verbose llama-cpp-python[all]
- name: Test with pytest - name: Test with pytest
run: | run: |
python -c "import llama_cpp" python3 -c "import llama_cpp"

View file

@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"] python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
@ -24,21 +24,20 @@ jobs:
uses: actions/setup-python@v5 uses: actions/setup-python@v5
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
cache: 'pip'
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python3 -m pip install --upgrade pip
python -m pip install .[all] -v python3 -m pip install .[all] -v
- name: Test with pytest - name: Test with pytest
run: | run: |
python -m pytest python3 -m pytest
build-windows: build-windows:
runs-on: windows-latest runs-on: windows-latest
strategy: strategy:
matrix: matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"] python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
@ -48,21 +47,20 @@ jobs:
uses: actions/setup-python@v5 uses: actions/setup-python@v5
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
cache: 'pip'
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python3 -m pip install --upgrade pip
python -m pip install .[all] -v python3 -m pip install .[all] -v
- name: Test with pytest - name: Test with pytest
run: | run: |
python -m pytest python3 -m pytest
build-macos: build-macos:
runs-on: macos-latest runs-on: macos-13
strategy: strategy:
matrix: matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"] python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
@ -72,14 +70,13 @@ jobs:
uses: actions/setup-python@v5 uses: actions/setup-python@v5
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
cache: 'pip'
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python3 -m pip install --upgrade pip
python -m pip install .[all] --verbose python3 -m pip install .[all] --verbose
- name: Test with pytest - name: Test with pytest
run: | run: |
python -m pytest python3 -m pytest
# build-linux-opencl: # build-linux-opencl:
@ -101,29 +98,29 @@ jobs:
# sudo apt-get install -y --no-install-recommends llvm intel-oneapi-runtime-opencl intel-oneapi-runtime-compilers libclblast-dev # sudo apt-get install -y --no-install-recommends llvm intel-oneapi-runtime-opencl intel-oneapi-runtime-compilers libclblast-dev
# - name: Install dependencies # - name: Install dependencies
# run: | # run: |
# python -m pip install --upgrade pip # python3 -m pip install --upgrade pip
# CMAKE_ARGS="-DLLAMA_CLBLAST=on" python -m pip install .[all] --verbose # CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install .[all] --verbose
# - name: Test with pytest # - name: Test with pytest
# run: | # run: |
# python -m pytest # python3 -m pytest
build-macos-metal: build-macos-metal:
runs-on: macos-latest runs-on: macos-13
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
with: with:
submodules: "recursive" submodules: "recursive"
- name: Set up Python 3.9 - name: Set up Python 3.8
uses: actions/setup-python@v5 uses: actions/setup-python@v5
with: with:
python-version: "3.9" python-version: "3.8"
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python3 -m pip install --upgrade pip
CMAKE_ARGS="-DLLAMA_METAL=on" python -m pip install .[all] --verbose CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install .[all] --verbose
- name: Test with pytest - name: Test with pytest
run: | run: |
python -m pytest python3 -m pytest

View file

@ -1,33 +0,0 @@
# Example script: request a fill-in-middle completion, where the model
# generates the text that belongs between --prompt and --suffix.
import argparse

from llama_cpp import Llama

# Command-line options: model file, text before the gap, text after the gap,
# and a flag that is forwarded to Llama as `spm_infill`.
cli = argparse.ArgumentParser()
cli.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin")
cli.add_argument("-p", "--prompt", type=str, default="def add(")
cli.add_argument("-s", "--suffix", type=str, default="\n return sum\n\n")
cli.add_argument("-i", "--spm-infill", action='store_true')
args = cli.parse_args()

model = Llama(model_path=args.model, n_gpu_layers=-1, spm_infill=args.spm_infill)

completion = model.create_completion(
    temperature=0.0,
    repeat_penalty=1.0,
    prompt=args.prompt,
    suffix=args.suffix,
)

# Models sometimes echo the suffix at the end of the generated text; when the
# right-stripped response ends with the right-stripped suffix, cut it off so
# the suffix is not shown twice.
response = completion["choices"][0]["text"]
trimmed = response.rstrip()
echoed_suffix = args.suffix.rstrip()
filtered = bool(echoed_suffix) and trimmed.endswith(echoed_suffix)
if filtered:
    response = trimmed[: -len(echoed_suffix)]

# Prompt is printed plain, the completion in green, and the suffix in yellow
# when the echo was filtered (default colour otherwise).
print(f"Fill-in-Middle completion{' (filtered)' if filtered else ''}:\n\n{args.prompt}\033[32m{response}\033[{'33' if filtered else '0'}m{args.suffix}\033[0m")

View file

@ -9,7 +9,6 @@ from typing import (
Sequence, Sequence,
) )
from dataclasses import dataclass, field from dataclasses import dataclass, field
from contextlib import ExitStack
import numpy as np import numpy as np
import numpy.typing as npt import numpy.typing as npt
@ -28,6 +27,9 @@ class _LlamaModel:
"""Intermediate Python wrapper for a llama.cpp llama_model. """Intermediate Python wrapper for a llama.cpp llama_model.
NOTE: For stability it's recommended you use the Llama class instead.""" NOTE: For stability it's recommended you use the Llama class instead."""
_llama_free_model = None
# NOTE: this must be "saved" here to avoid exceptions when calling __del__
def __init__( def __init__(
self, self,
*, *,
@ -38,7 +40,8 @@ class _LlamaModel:
self.path_model = path_model self.path_model = path_model
self.params = params self.params = params
self.verbose = verbose self.verbose = verbose
self._exit_stack = ExitStack()
self._llama_free_model = llama_cpp._lib.llama_free_model # type: ignore
self.model = None self.model = None
@ -53,17 +56,11 @@ class _LlamaModel:
if self.model is None: if self.model is None:
raise ValueError(f"Failed to load model from file: {path_model}") raise ValueError(f"Failed to load model from file: {path_model}")
def free_model(): def __del__(self):
if self.model is None: if self.model is not None and self._llama_free_model is not None:
return self._llama_free_model(self.model)
llama_cpp.llama_free_model(self.model)
self.model = None self.model = None
self._exit_stack.callback(free_model)
def close(self):
self._exit_stack.close()
def vocab_type(self) -> int: def vocab_type(self) -> int:
assert self.model is not None assert self.model is not None
return llama_cpp.llama_vocab_type(self.model) return llama_cpp.llama_vocab_type(self.model)
@ -173,14 +170,6 @@ class _LlamaModel:
assert self.model is not None assert self.model is not None
return llama_cpp.llama_token_eot(self.model) return llama_cpp.llama_token_eot(self.model)
def add_bos_token(self) -> int:
assert self.model is not None
return llama_cpp.llama_add_bos_token(self.model)
def add_eos_token(self) -> int:
assert self.model is not None
return llama_cpp.llama_add_eos_token(self.model)
# Tokenization # Tokenization
def tokenize(self, text: bytes, add_bos: bool, special: bool): def tokenize(self, text: bytes, add_bos: bool, special: bool):
@ -260,6 +249,8 @@ class _LlamaContext:
"""Intermediate Python wrapper for a llama.cpp llama_context. """Intermediate Python wrapper for a llama.cpp llama_context.
NOTE: For stability it's recommended you use the Llama class instead.""" NOTE: For stability it's recommended you use the Llama class instead."""
_llama_free = None
def __init__( def __init__(
self, self,
*, *,
@ -270,28 +261,24 @@ class _LlamaContext:
self.model = model self.model = model
self.params = params self.params = params
self.verbose = verbose self.verbose = verbose
self._exit_stack = ExitStack()
self._llama_free = llama_cpp._lib.llama_free # type: ignore
self.ctx = None self.ctx = None
assert self.model.model is not None assert self.model.model is not None
self.ctx = llama_cpp.llama_new_context_with_model(self.model.model, self.params) self.ctx = llama_cpp.llama_new_context_with_model(
self.model.model, self.params
)
if self.ctx is None: if self.ctx is None:
raise ValueError("Failed to create llama_context") raise ValueError("Failed to create llama_context")
def free_ctx(): def __del__(self):
if self.ctx is None: if self.ctx is not None and self._llama_free is not None:
return self._llama_free(self.ctx)
llama_cpp.llama_free(self.ctx)
self.ctx = None self.ctx = None
self._exit_stack.callback(free_ctx)
def close(self):
self._exit_stack.close()
def n_ctx(self) -> int: def n_ctx(self) -> int:
assert self.ctx is not None assert self.ctx is not None
return llama_cpp.llama_n_ctx(self.ctx) return llama_cpp.llama_n_ctx(self.ctx)
@ -506,6 +493,8 @@ class _LlamaContext:
class _LlamaBatch: class _LlamaBatch:
_llama_batch_free = None
def __init__( def __init__(
self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True
): ):
@ -513,24 +502,19 @@ class _LlamaBatch:
self.embd = embd self.embd = embd
self.n_seq_max = n_seq_max self.n_seq_max = n_seq_max
self.verbose = verbose self.verbose = verbose
self._exit_stack = ExitStack()
self._llama_batch_free = llama_cpp._lib.llama_batch_free # type: ignore
self.batch = None self.batch = None
self.batch = llama_cpp.llama_batch_init( self.batch = llama_cpp.llama_batch_init(
self._n_tokens, self.embd, self.n_seq_max self._n_tokens, self.embd, self.n_seq_max
) )
def free_batch(): def __del__(self):
if self.batch is None: if self.batch is not None and self._llama_batch_free is not None:
return self._llama_batch_free(self.batch)
llama_cpp.llama_batch_free(self.batch)
self.batch = None self.batch = None
self._exit_stack.callback(free_batch)
def close(self):
self._exit_stack.close()
def n_tokens(self) -> int: def n_tokens(self) -> int:
assert self.batch is not None assert self.batch is not None
return self.batch.n_tokens return self.batch.n_tokens

View file

@ -9,9 +9,7 @@ import ctypes
import typing import typing
import fnmatch import fnmatch
import warnings import warnings
import contextlib
import multiprocessing import multiprocessing
from types import TracebackType
from typing import ( from typing import (
List, List,
@ -23,7 +21,6 @@ from typing import (
Deque, Deque,
Callable, Callable,
Dict, Dict,
Type,
) )
from collections import deque from collections import deque
from pathlib import Path from pathlib import Path
@ -118,7 +115,6 @@ class Llama:
type_k: Optional[int] = None, type_k: Optional[int] = None,
type_v: Optional[int] = None, type_v: Optional[int] = None,
# Misc # Misc
spm_infill: bool = False,
verbose: bool = True, verbose: bool = True,
# Extra Params # Extra Params
**kwargs, # type: ignore **kwargs, # type: ignore
@ -189,7 +185,6 @@ class Llama:
verbose: Print verbose output to stderr. verbose: Print verbose output to stderr.
type_k: KV cache data type for K (default: f16) type_k: KV cache data type for K (default: f16)
type_v: KV cache data type for V (default: f16) type_v: KV cache data type for V (default: f16)
spm_infill: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
Raises: Raises:
ValueError: If the model path does not exist. ValueError: If the model path does not exist.
@ -348,16 +343,12 @@ class Llama:
self.lora_scale = lora_scale self.lora_scale = lora_scale
self.lora_path = lora_path self.lora_path = lora_path
self.spm_infill = spm_infill
if not os.path.exists(model_path): if not os.path.exists(model_path):
raise ValueError(f"Model path does not exist: {model_path}") raise ValueError(f"Model path does not exist: {model_path}")
self._stack = contextlib.ExitStack() self._model = _LlamaModel(
self._model = self._stack.enter_context(contextlib.closing(_LlamaModel(
path_model=self.model_path, params=self.model_params, verbose=self.verbose path_model=self.model_path, params=self.model_params, verbose=self.verbose
))) )
# Override tokenizer # Override tokenizer
self.tokenizer_ = tokenizer or LlamaTokenizer(self) self.tokenizer_ = tokenizer or LlamaTokenizer(self)
@ -369,18 +360,18 @@ class Llama:
self.context_params.n_ctx = self._model.n_ctx_train() self.context_params.n_ctx = self._model.n_ctx_train()
self.context_params.n_batch = self.n_batch self.context_params.n_batch = self.n_batch
self._ctx = self._stack.enter_context(contextlib.closing(_LlamaContext( self._ctx = _LlamaContext(
model=self._model, model=self._model,
params=self.context_params, params=self.context_params,
verbose=self.verbose, verbose=self.verbose,
))) )
self._batch = self._stack.enter_context(contextlib.closing(_LlamaBatch( self._batch = _LlamaBatch(
n_tokens=self.n_batch, n_tokens=self.n_batch,
embd=0, embd=0,
n_seq_max=self.context_params.n_ctx, n_seq_max=self.context_params.n_ctx,
verbose=self.verbose, verbose=self.verbose,
))) )
if self.lora_path: if self.lora_path:
if self._model.apply_lora_from_file( if self._model.apply_lora_from_file(
@ -981,33 +972,14 @@ class Llama:
completion_id: str = f"cmpl-{str(uuid.uuid4())}" completion_id: str = f"cmpl-{str(uuid.uuid4())}"
created: int = int(time.time()) created: int = int(time.time())
bos_token_id: int = self.token_bos()
cls_token_id: int = self._model.token_cls()
sep_token_id: int = self._model.token_sep()
prefix_token_id: int = self._model.token_prefix() prefix_token_id: int = self._model.token_prefix()
middle_token_id: int = self._model.token_middle() middle_token_id: int = self._model.token_middle()
suffix_token_id: int = self._model.token_suffix() suffix_token_id: int = self._model.token_suffix()
add_space_prefix: bool = self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true"
bos_tokens: List[int] = [cls_token_id if cls_token_id != -1 else bos_token_id]
eos_tokens: List[int] = [sep_token_id if sep_token_id != -1 else self.token_eos()]
if (isinstance(prompt, list) and suffix is None) or self._model.add_bos_token() == 0 or bos_tokens[:1] == [-1]:
bos_tokens = []
if (isinstance(prompt, list) and suffix is None) or (self._model.add_eos_token() != 1 and sep_token_id == -1):
eos_tokens = []
suffix_space_prefix: int = 0
# Tokenizer hack to remove leading space
if add_space_prefix and suffix_token_id >= 0 and suffix:
suffix = "" + suffix
suffix_space_prefix = 2
# If prompt is empty, initialize completion with BOS token to avoid # If prompt is empty, initialize completion with BOS token to avoid
# detokenization including a space at the beginning of the completion # detokenization including a space at the beginning of the completion
completion_tokens: List[int] = [] if len(prompt) > 0 else [bos_token_id] completion_tokens: List[int] = [] if len(prompt) > 0 else [self.token_bos()]
# Add blank space to start of prompt to match OG llama tokenizer # Add blank space to start of prompt to match OG llama tokenizer
prefix_tokens: List[int] = ( prompt_tokens: List[int] = (
( (
[prefix_token_id] [prefix_token_id]
if prefix_token_id >= 0 and suffix is not None if prefix_token_id >= 0 and suffix is not None
@ -1016,33 +988,38 @@ class Llama:
+ +
( (
( (
self.tokenize(prompt.encode("utf-8"), add_bos=False, special=(prefix_token_id < 0 or suffix is None)) self.tokenize(prompt.encode("utf-8"), add_bos=(prefix_token_id < 0 or suffix is None), special=(prefix_token_id < 0 or suffix is None))
if prompt != "" if prompt != ""
else [] else (
[]
if prefix_token_id >= 0 and suffix is not None
else [self.token_bos()]
)
) )
if isinstance(prompt, str) if isinstance(prompt, str)
else prompt else prompt
) )
) +
suffix_tokens: List[int] = (
( (
[suffix_token_id]
+
( (
self.tokenize(suffix.encode("utf-8"), add_bos=False, special=False)[suffix_space_prefix:] [suffix_token_id]
if suffix +
else [] (
self.tokenize(suffix.encode("utf-8"), add_bos=False, special=False)
if suffix
else []
)
) )
if suffix_token_id >= 0 and suffix is not None
else []
)
+
(
[middle_token_id]
if middle_token_id >= 0 and suffix is not None
else []
) )
if suffix_token_id >= 0 and suffix is not None
else []
) )
middle_tokens: List[int] = (
[middle_token_id]
if middle_token_id >= 0 and suffix is not None
else []
)
prompt_tokens: List[int] = bos_tokens + ((suffix_tokens + prefix_tokens + middle_tokens) if self.spm_infill else (prefix_tokens + suffix_tokens + middle_tokens)) + eos_tokens
text: bytes = b"" text: bytes = b""
returned_tokens: int = 0 returned_tokens: int = 0
stop = ( stop = (
@ -1199,7 +1176,7 @@ class Llama:
# not sure how to handle this branch when dealing # not sure how to handle this branch when dealing
# with CJK output, so keep it unchanged # with CJK output, so keep it unchanged
for token in remaining_tokens: for token in remaining_tokens:
if token == bos_token_id: if token == self.token_bos():
continue continue
token_end_position += len(self.detokenize([token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens])) token_end_position += len(self.detokenize([token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens]))
# Check if stop sequence is in the token # Check if stop sequence is in the token
@ -1326,7 +1303,7 @@ class Llama:
logprobs_or_none: Optional[CompletionLogprobs] = None logprobs_or_none: Optional[CompletionLogprobs] = None
if logprobs is not None: if logprobs is not None:
if token == bos_token_id: if token == self.token_bos():
continue continue
token_str = self.detokenize([token]).decode( token_str = self.detokenize([token]).decode(
"utf-8", errors="ignore" "utf-8", errors="ignore"
@ -1454,7 +1431,7 @@ class Llama:
for idx, (token, token_str, logprobs_token) in enumerate( for idx, (token, token_str, logprobs_token) in enumerate(
zip(all_tokens, all_token_strs, all_logprobs) zip(all_tokens, all_token_strs, all_logprobs)
): ):
if token == bos_token_id: if token == self.token_bos():
continue continue
text_offsets.append( text_offsets.append(
text_offset text_offset
@ -1881,7 +1858,6 @@ class Llama:
type_k=self.context_params.type_k, type_k=self.context_params.type_k,
type_v=self.context_params.type_v, type_v=self.context_params.type_v,
# Misc # Misc
spm_infill=self.spm_infill,
verbose=self.verbose, verbose=self.verbose,
) )
@ -1964,10 +1940,6 @@ class Llama:
"""Return the pooling type.""" """Return the pooling type."""
return self._ctx.pooling_type() return self._ctx.pooling_type()
def close(self) -> None:
"""Explicitly free the model from memory."""
self._stack.close()
@staticmethod @staticmethod
def logits_to_logprobs( def logits_to_logprobs(
logits: Union[npt.NDArray[np.single], List], axis: int = -1 logits: Union[npt.NDArray[np.single], List], axis: int = -1

View file

@ -44,8 +44,6 @@ class LlamaProxy:
if self._current_model is not None: if self._current_model is not None:
return self._current_model return self._current_model
if self._current_model:
self._current_model.close()
self._current_model = None self._current_model = None
settings = self._model_settings_dict[model] settings = self._model_settings_dict[model]
@ -67,7 +65,6 @@ class LlamaProxy:
def free(self): def free(self):
if self._current_model: if self._current_model:
self._current_model.close()
del self._current_model del self._current_model
@staticmethod @staticmethod

2
vendor/llama.cpp vendored

@ -1 +1 @@
Subproject commit 172c8256840ffd882ab9992ecedbb587d9b21f15 Subproject commit fd5ea0f897ecb3659d6c269ef6f3d833e865ead7