commit 3feeafed1a
22 changed files with 240 additions and 2017 deletions
.github/workflows/build-and-release.yaml (vendored, 8 changes)

@@ -26,7 +26,8 @@ jobs:
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip pytest cmake scikit-build setuptools
+          python -m pip install --upgrade pip
+          python -m pip install -e .[all]
       - name: Build wheels
         run: python -m cibuildwheel --output-dir wheelhouse
@@ -46,10 +47,11 @@ jobs:
       - uses: actions/setup-python@v3
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip pytest cmake scikit-build setuptools
+          python -m pip install --upgrade pip build
+          python -m pip install -e .[all]
       - name: Build source distribution
         run: |
-          python setup.py sdist
+          python -m build --sdist
       - uses: actions/upload-artifact@v3
         with:
           path: ./dist/*.tar.gz
.github/workflows/publish-to-test.yaml (vendored, 5 changes)

@@ -19,10 +19,11 @@ jobs:
           python-version: "3.8"
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip pytest cmake scikit-build setuptools
+          python3 -m pip install --upgrade pip build
+          python3 -m pip install -e .[all]
       - name: Build source distribution
         run: |
-          python setup.py sdist
+          python3 -m build --sdist
       - name: Publish to Test PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
.github/workflows/publish.yaml (vendored, 5 changes)

@@ -19,10 +19,11 @@ jobs:
           python-version: "3.8"
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip pytest cmake scikit-build setuptools
+          python3 -m pip install --upgrade pip build
+          python3 -m pip install -e .[all]
       - name: Build source distribution
         run: |
-          python setup.py sdist
+          python3 -m build --sdist
       - name: Publish distribution to PyPI
         # TODO: move to tag based releases
         # if: startsWith(github.ref, 'refs/tags')
.github/workflows/test-pypi.yaml (vendored, 6 changes)

@@ -18,7 +18,7 @@ jobs:
       - name: Install dependencies
         run: |
           python3 -m pip install --upgrade pip
-          python3 -m pip install --verbose llama-cpp-python[server,test]
+          python3 -m pip install --verbose llama-cpp-python[all]
       - name: Test with pytest
         run: |
           python3 -c "import llama_cpp"
@@ -38,7 +38,7 @@ jobs:
       - name: Install dependencies
         run: |
           python3 -m pip install --upgrade pip
-          python3 -m pip install --verbose llama-cpp-python[server,test]
+          python3 -m pip install --verbose llama-cpp-python[all]
       - name: Test with pytest
         run: |
           python3 -c "import llama_cpp"
@@ -58,7 +58,7 @@ jobs:
       - name: Install dependencies
         run: |
           python3 -m pip install --upgrade pip
-          python3 -m pip install --verbose llama-cpp-python[server,test]
+          python3 -m pip install --verbose llama-cpp-python[all]
       - name: Test with pytest
         run: |
           python3 -c "import llama_cpp"
.github/workflows/test.yaml (vendored, 24 changes)

@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]

     steps:
       - uses: actions/checkout@v3
@@ -26,18 +26,18 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings
-          pip install . -v
+          python3 -m pip install --upgrade pip
+          python3 -m pip install .[all] -v
       - name: Test with pytest
         run: |
-          pytest
+          python3 -m pytest

   build-windows:

     runs-on: windows-latest
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]

     steps:
       - uses: actions/checkout@v3
@@ -49,18 +49,18 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings
-          pip install . -v
+          python3 -m pip install --upgrade pip
+          python3 -m pip install .[all] -v
       - name: Test with pytest
         run: |
-          pytest
+          python3 -m pytest

   build-macos:

     runs-on: macos-latest
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]

     steps:
       - uses: actions/checkout@v3
@@ -72,8 +72,8 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings
-          pip install . -v
+          python3 -m pip install --upgrade pip
+          python3 -m pip install .[all] --verbose
       - name: Test with pytest
         run: |
-          pytest
+          python3 -m pytest

CMakeLists.txt

@@ -2,33 +2,37 @@ cmake_minimum_required(VERSION 3.4...3.22)

 project(llama_cpp)

-option(FORCE_CMAKE "Force CMake build of Python bindings" OFF)
+option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON)

-set(FORCE_CMAKE $ENV{FORCE_CMAKE})
-
-if (UNIX AND NOT FORCE_CMAKE)
-    add_custom_command(
-        OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so
-        COMMAND make libllama.so
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp
-    )
-    add_custom_target(
-        run ALL
-        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so
-    )
-    install(
-        FILES ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so
-        DESTINATION llama_cpp
-    )
-else()
+if (LLAMA_BUILD)
     set(BUILD_SHARED_LIBS "On")
+    if (APPLE)
+        # Need to disable these llama.cpp flags on Apple
+        # otherwise users may encounter invalid instruction errors
+        set(LLAMA_AVX "Off" CACHE BOOL "llama: enable AVX" FORCE)
+        set(LLAMA_AVX2 "Off" CACHE BOOL "llama: enable AVX2" FORCE)
+        set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE)
+        set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mtune=native")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -mtune=native")
+    endif()
     add_subdirectory(vendor/llama.cpp)
     install(
         TARGETS llama
-        LIBRARY DESTINATION llama_cpp
-        RUNTIME DESTINATION llama_cpp
-        ARCHIVE DESTINATION llama_cpp
-        FRAMEWORK DESTINATION llama_cpp
-        RESOURCE DESTINATION llama_cpp
+        LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
+        RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
+        ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
+        FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
+        RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
     )
+    # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374
+    install(
+        TARGETS llama
+        LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+        RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+        ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+        FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+        RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+    )
 endif()
Makefile (18 changes)

@@ -5,26 +5,30 @@ update:
 update.vendor:
 	cd vendor/llama.cpp && git pull origin master

+deps:
+	python3 -m pip install pip
+	python3 -m pip install -e ".[all]"
+
 build:
-	python3 setup.py develop
+	python3 -m pip install -e .

 build.cuda:
-	CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 setup.py develop
+	CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3 -m pip install -e .

 build.opencl:
-	CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 setup.py develop
+	CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install -e .

 build.openblas:
-	CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 setup.py develop
+	CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install -e .

 build.blis:
-	CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop
+	CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" python3 -m pip install -e .

 build.metal:
-	CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 python3 setup.py develop
+	CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install -e .

 build.sdist:
-	python3 setup.py sdist
+	python3 -m build --sdist

 deploy.pypi:
 	python3 -m twine upload dist/*
README.md (17 changes)

@@ -1,4 +1,4 @@
-# 🦙 Python Bindings for `llama.cpp`
+# 🦙 Python Bindings for [`llama.cpp`](https://github.com/ggerganov/llama.cpp)

 [![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest)
 [![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml)
@@ -48,7 +48,6 @@ Otherwise, while installing it will build the llama.ccp x86 version which will b
 ### Installation with Hardware Acceleration

 `llama.cpp` supports multiple BLAS backends for faster processing.
-Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend.

 To install with OpenBLAS, set the `LLAMA_BLAS and LLAMA_BLAS_VENDOR` environment variables before installing:
@@ -208,24 +207,26 @@ If you find any issues with the documentation, please open an issue or submit a

 This package is under active development and I welcome any contributions.

-To get started, clone the repository and install the package in development mode:
+To get started, clone the repository and install the package in editable / development mode:

 ```bash
 git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git
 cd llama-cpp-python

+# Upgrade pip (required for editable mode)
+pip install --upgrade pip
+
 # Install with pip
 pip install -e .

 # if you want to use the fastapi / openapi server
 pip install -e .[server]

-# If you're a poetry user, installing will also include a virtual environment
-poetry install --all-extras
-. .venv/bin/activate
-
-# Will need to be re-run any time vendor/llama.cpp is updated
-python3 setup.py develop
+# to install all optional dependencies
+pip install -e .[all]
+
+# to clear the local build cache
+make clean
 ```

 # How does this compare to other Python bindings of `llama.cpp`?

@@ -21,7 +21,7 @@ ENV LLAMA_CUBLAS=1
 RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings

 # Install llama-cpp-python (build with cuda)
-RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python

 # Run the server
 CMD python3 -m llama_cpp.server

@@ -19,9 +19,9 @@ RUN mkdir /app
 WORKDIR /app
 COPY . /app

-RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
+RUN python3 -m pip install --upgrade pip

-RUN make build && make clean
+RUN make deps && make build && make clean

 # Set environment variable for the host
 ENV HOST=0.0.0.0

@@ -82,9 +82,12 @@ To get started, clone the repository and install the package in development mode

 ```bash
 git clone git@github.com:abetlen/llama-cpp-python.git
+cd llama-cpp-python
 git submodule update --init --recursive
 # Will need to be re-run any time vendor/llama.cpp is updated
-python3 setup.py develop
+pip install --upgrade pip
+pip install -e .[all]
 ```

 ## License

@@ -30,7 +30,7 @@ conda activate llama
 *(you needed xcode installed in order pip to build/compile the C++ code)*
 ```
 pip uninstall llama-cpp-python -y
-CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir
+CMAKE_ARGS="-DLLAMA_METAL=on" pip install -U llama-cpp-python --no-cache-dir
 pip install 'llama-cpp-python[server]'

 # you should now have llama-cpp-python v0.1.62 or higher installed
llama_cpp/__init__.py

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-from .version import __version__
+__version__ = "0.2.0"
llama_cpp/llama.py

@@ -182,21 +182,27 @@ class LlamaState:
         self.llama_state_size = llama_state_size


-LogitsProcessor = Callable[[List[int], List[float]], List[float]]
+LogitsProcessor = Callable[
+    [npt.NDArray[np.intc], npt.NDArray[np.single]], npt.NDArray[np.single]
+]


 class LogitsProcessorList(List[LogitsProcessor]):
-    def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]:
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
+    ) -> npt.NDArray[np.single]:
         for processor in self:
             scores = processor(input_ids, scores)
         return scores


-StoppingCriteria = Callable[[List[int], List[float]], bool]
+StoppingCriteria = Callable[[npt.NDArray[np.intc], npt.NDArray[np.single]], bool]


 class StoppingCriteriaList(List[StoppingCriteria]):
-    def __call__(self, input_ids: List[int], logits: List[float]) -> bool:
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single]
+    ) -> bool:
         return any([stopping_criteria(input_ids, logits) for stopping_criteria in self])
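A minimal sketch of user code targeting the new numpy-based callback signatures. The `BAN_TOKEN_ID` value and both callbacks are illustrative assumptions, not part of the commit; only the array types and the `LogitsProcessorList` / `StoppingCriteriaList` containers come from the diff above.

```python
import numpy as np
import numpy.typing as npt

import llama_cpp

BAN_TOKEN_ID = 123  # hypothetical token id to suppress


def ban_token(
    input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
) -> npt.NDArray[np.single]:
    # Operate directly on the scores array instead of Python lists.
    scores[BAN_TOKEN_ID] = -np.inf
    return scores


def stop_after_32_tokens(
    input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single]
) -> bool:
    # input_ids holds the prompt plus generated token ids so far.
    return input_ids.shape[0] >= 32


logits_processor = llama_cpp.LogitsProcessorList([ban_token])
stopping_criteria = llama_cpp.StoppingCriteriaList([stop_after_32_tokens])
# Both objects can then be handed to completion calls that accept the
# logits_processor and stopping_criteria keyword arguments.
```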
@@ -281,11 +287,12 @@ class Llama:
         self._p_tensor_split = None

         if self.tensor_split is not None:
-            FloatArray = (ctypes.c_float * len(self.tensor_split))(*self.tensor_split)
-            self._p_tensor_split = ctypes.POINTER(ctypes.c_float)(
-                FloatArray
+            # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
+            FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value
+            self._c_tensor_split = FloatArray(
+                *tensor_split
             )  # keep a reference to the array so it is not gc'd
-            self.params.tensor_split = self._p_tensor_split
+            self.params.tensor_split = self._c_tensor_split

         self.params.rope_freq_base = rope_freq_base
         self.params.rope_freq_scale = rope_freq_scale
@@ -535,11 +542,7 @@ class Llama:
         logits: npt.NDArray[np.single] = self._scores[-1, :]

         if logits_processor is not None:
-            logits = np.array(
-                logits_processor(self._input_ids.tolist(), logits.tolist()),
-                dtype=np.single,
-            )
-            self._scores[-1, :] = logits
+            logits[:] = logits_processor(self._input_ids, logits)

         nl_logit = logits[self._token_nl]
         candidates = self._candidates
@@ -778,7 +781,7 @@ class Llama:
             grammar=grammar,
         )
         if stopping_criteria is not None and stopping_criteria(
-            self._input_ids.tolist(), self._scores[-1, :].tolist()
+            self._input_ids, self._scores[-1, :]
         ):
             return
         tokens_or_none = yield token
@@ -1105,7 +1108,7 @@ class Llama:
                 break

             if stopping_criteria is not None and stopping_criteria(
-                self._input_ids.tolist(), self._scores[-1, :].tolist()
+                self._input_ids, self._scores[-1, :]
             ):
                 text = self.detokenize(completion_tokens)
                 finish_reason = "stop"
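The `logits[:] = ...` form in the second hunk matters because `logits` is a view of the last row of the scores matrix: assigning into the slice writes back into `self._scores`, whereas rebinding the name would not. A standalone illustration with plain numpy (the array shapes are arbitrary):

```python
import numpy as np

scores = np.zeros((2, 4), dtype=np.single)
row = scores[-1, :]       # a view of the last row, like self._scores[-1, :]

row[:] = np.arange(4)     # in-place slice assignment: the parent matrix is updated
assert scores[-1, 3] == 3.0

row = row + 1             # rebinding creates a new array; the parent is untouched
assert scores[-1, 3] == 3.0
```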
@@ -1525,6 +1528,8 @@ class Llama:
     def create_chat_completion(
         self,
         messages: List[ChatCompletionMessage],
+        functions: Optional[List[ChatCompletionFunction]] = None,
+        function_call: Optional[Union[str, ChatCompletionFunctionCall]] = None,
         temperature: float = 0.2,
         top_p: float = 0.95,
         top_k: int = 40,

llama_cpp/llama_cpp.py

@@ -87,8 +87,8 @@ c_size_t_p = POINTER(c_size_t)
 # llama.h bindings

 GGML_USE_CUBLAS = hasattr(_lib, "ggml_init_cublas")
-GGML_CUDA_MAX_DEVICES = ctypes.c_int(16)
-LLAMA_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else ctypes.c_int(1)
+GGML_CUDA_MAX_DEVICES = 16
+LLAMA_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else 1

 # define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 LLAMA_DEFAULT_SEED = ctypes.c_int(0xFFFFFFFF)
@@ -335,13 +335,13 @@ llama_grammar_p = c_void_p
 # // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
 # LLAMA_GRETYPE_CHAR_ALT = 6,
 # };
-LLAMA_GRETYPE_END = c_int(0)
-LLAMA_GRETYPE_ALT = c_int(1)
-LLAMA_GRETYPE_RULE_REF = c_int(2)
-LLAMA_GRETYPE_CHAR = c_int(3)
-LLAMA_GRETYPE_CHAR_NOT = c_int(4)
-LLAMA_GRETYPE_CHAR_RNG_UPPER = c_int(5)
-LLAMA_GRETYPE_CHAR_ALT = c_int(6)
+LLAMA_GRETYPE_END = 0
+LLAMA_GRETYPE_ALT = 1
+LLAMA_GRETYPE_RULE_REF = 2
+LLAMA_GRETYPE_CHAR = 3
+LLAMA_GRETYPE_CHAR_NOT = 4
+LLAMA_GRETYPE_CHAR_RNG_UPPER = 5
+LLAMA_GRETYPE_CHAR_ALT = 6


 # typedef struct llama_grammar_element {
@@ -407,7 +407,7 @@ _lib.llama_model_quantize_default_params.restype = llama_model_quantize_params
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
 # LLAMA_API void llama_backend_init(bool numa);
-def llama_backend_init(numa: c_bool):
+def llama_backend_init(numa: Union[c_bool, bool]):
     return _lib.llama_backend_init(numa)
@@ -651,9 +651,9 @@ _lib.llama_model_quantize.restype = c_int
 # int n_threads);
 def llama_apply_lora_from_file(
     ctx: llama_context_p,
-    path_lora: c_char_p,
-    path_base_model: c_char_p,
-    n_threads: c_int,
+    path_lora: Union[c_char_p, bytes],
+    path_base_model: Union[c_char_p, bytes],
+    n_threads: Union[c_int, int],
 ) -> int:
     return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads)
@@ -671,7 +671,7 @@ def llama_model_apply_lora_from_file(
     model: llama_model_p,
     path_lora: Union[c_char_p, bytes],
     path_base_model: Union[c_char_p, bytes],
-    n_threads: c_int,
+    n_threads: Union[c_int, int],
 ) -> int:
     return _lib.llama_model_apply_lora_from_file(
         model, path_lora, path_base_model, n_threads
@@ -751,7 +751,7 @@ def llama_load_session_file(
     ctx: llama_context_p,
     path_session: bytes,
     tokens_out,  # type: Array[llama_token]
-    n_token_capacity: c_size_t,
+    n_token_capacity: Union[c_size_t, int],
     n_token_count_out,  # type: _Pointer[c_size_t]
 ) -> int:
     return _lib.llama_load_session_file(
@@ -774,7 +774,7 @@ def llama_save_session_file(
     ctx: llama_context_p,
     path_session: bytes,
     tokens,  # type: Array[llama_token]
-    n_token_count: c_size_t,
+    n_token_count: Union[c_size_t, int],
 ) -> int:
     return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count)
@@ -801,9 +801,9 @@ _lib.llama_save_session_file.restype = c_size_t
 def llama_eval(
     ctx: llama_context_p,
     tokens,  # type: Array[llama_token]
-    n_tokens: c_int,
-    n_past: c_int,
-    n_threads: c_int,
+    n_tokens: Union[c_int, int],
+    n_past: Union[c_int, int],
+    n_threads: Union[c_int, int],
 ) -> int:
     return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads)
@@ -822,9 +822,9 @@ _lib.llama_eval.restype = c_int
 def llama_eval_embd(
     ctx: llama_context_p,
     embd,  # type: Array[c_float]
-    n_tokens: c_int,
-    n_past: c_int,
-    n_threads: c_int,
+    n_tokens: Union[c_int, int],
+    n_past: Union[c_int, int],
+    n_threads: Union[c_int, int],
 ) -> int:
     return _lib.llama_eval_embd(ctx, embd, n_tokens, n_past, n_threads)
@@ -1042,8 +1042,8 @@ _lib.llama_token_to_piece_with_model.restype = c_int
 # size_t start_rule_index);
 def llama_grammar_init(
     rules,  # type: Array[llama_grammar_element_p] # type: ignore
-    n_rules: c_size_t,
-    start_rule_index: c_size_t,
+    n_rules: Union[c_size_t, int],
+    start_rule_index: Union[c_size_t, int],
 ) -> llama_grammar_p:
     return _lib.llama_grammar_init(rules, n_rules, start_rule_index)
@@ -1084,8 +1084,8 @@ def llama_sample_repetition_penalty(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
     last_tokens_data,  # type: Array[llama_token]
-    last_tokens_size: c_int,
-    penalty: c_float,
+    last_tokens_size: Union[c_int, int],
+    penalty: Union[c_float, float],
 ):
     return _lib.llama_sample_repetition_penalty(
         ctx, candidates, last_tokens_data, last_tokens_size, penalty
@@ -1108,9 +1108,9 @@ def llama_sample_frequency_and_presence_penalties(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
     last_tokens_data,  # type: Array[llama_token]
-    last_tokens_size: c_int,
-    alpha_frequency: c_float,
-    alpha_presence: c_float,
+    last_tokens_size: Union[c_int, int],
+    alpha_frequency: Union[c_float, float],
+    alpha_presence: Union[c_float, float],
 ):
     return _lib.llama_sample_frequency_and_presence_penalties(
         ctx,
@@ -1146,7 +1146,7 @@ def llama_sample_classifier_free_guidance(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
     guidance_ctx: llama_context_p,
-    scale: c_float,
+    scale: Union[c_float, float],
 ):
     return _lib.llama_sample_classifier_free_guidance(
         ctx, candidates, guidance_ctx, scale
@@ -1182,8 +1182,8 @@ _lib.llama_sample_softmax.restype = None
 def llama_sample_top_k(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
-    k: c_int,
-    min_keep: c_size_t,
+    k: Union[c_int, int],
+    min_keep: Union[c_size_t, int],
 ):
     return _lib.llama_sample_top_k(ctx, candidates, k, min_keep)
@@ -1202,8 +1202,8 @@ _lib.llama_sample_top_k.restype = None
 def llama_sample_top_p(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
-    p: c_float,
-    min_keep: c_size_t,
+    p: Union[c_float, float],
+    min_keep: Union[c_size_t, int],
 ):
     return _lib.llama_sample_top_p(ctx, candidates, p, min_keep)
@@ -1222,8 +1222,8 @@ _lib.llama_sample_top_p.restype = None
 def llama_sample_tail_free(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
-    z: c_float,
-    min_keep: c_size_t,
+    z: Union[c_float, float],
+    min_keep: Union[c_size_t, int],
 ):
     return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep)
@@ -1242,8 +1242,8 @@ _lib.llama_sample_tail_free.restype = None
 def llama_sample_typical(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
-    p: c_float,
-    min_keep: c_size_t,
+    p: Union[c_float, float],
+    min_keep: Union[c_size_t, int],
 ):
     return _lib.llama_sample_typical(ctx, candidates, p, min_keep)
@@ -1261,7 +1261,7 @@ _lib.llama_sample_typical.restype = None
 def llama_sample_temperature(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
-    temp: c_float,
+    temp: Union[c_float, float],
 ):
     return _lib.llama_sample_temperature(ctx, candidates, temp)
@@ -1301,9 +1301,9 @@ _lib.llama_sample_grammar.restype = None
 def llama_sample_token_mirostat(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
-    tau: c_float,
-    eta: c_float,
-    m: c_int,
+    tau: Union[c_float, float],
+    eta: Union[c_float, float],
+    m: Union[c_int, int],
     mu,  # type: _Pointer[c_float]
 ) -> int:
     return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu)
@@ -1329,8 +1329,8 @@ _lib.llama_sample_token_mirostat.restype = llama_token
 def llama_sample_token_mirostat_v2(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
-    tau: c_float,
-    eta: c_float,
+    tau: Union[c_float, float],
+    eta: Union[c_float, float],
     mu,  # type: _Pointer[c_float]
 ) -> int:
     return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu)
@@ -1531,5 +1531,5 @@ _lib.llama_dump_timing_info_yaml.restype = None
 _llama_initialized = False

 if not _llama_initialized:
-    llama_backend_init(c_bool(False))
+    llama_backend_init(False)
     _llama_initialized = True
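The shift from bare ctypes annotations to `Union[c_*, ...]` hints reflects that ctypes already converts native Python values for foreign functions with declared argtypes, so callers no longer need to wrap every argument themselves. A small hedged sketch of the calling convention, not taken from the commit:

```python
import ctypes

import llama_cpp

# Both spellings type-check and behave the same after this change, because
# ctypes converts plain Python values when argtypes are declared.
numa: bool = False
llama_cpp.llama_backend_init(numa)                    # plain bool accepted
# llama_cpp.llama_backend_init(ctypes.c_bool(numa))   # explicit ctypes still works

# The same convenience applies to the numeric sampling/eval parameters,
# e.g. passing n_threads=4 instead of ctypes.c_int(4).
```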
llama_cpp/llama_types.py

@@ -63,6 +63,16 @@ class ChatCompletionMessage(TypedDict):
     user: NotRequired[str]


+class ChatCompletionFunction(TypedDict):
+    name: str
+    description: NotRequired[str]
+    parameters: Dict[str, Any]  # TODO: make this more specific
+
+
+class ChatCompletionFunctionCall(TypedDict):
+    name: str
+
+
 class ChatCompletionChoice(TypedDict):
     index: int
     message: ChatCompletionMessage
@@ -77,9 +87,11 @@ class ChatCompletion(TypedDict):
     choices: List[ChatCompletionChoice]
     usage: CompletionUsage


 class ChatCompletionChunkDeltaEmpty(TypedDict):
     pass


 class ChatCompletionChunkDelta(TypedDict):
     role: NotRequired[Literal["assistant"]]
     content: NotRequired[str]
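A brief sketch of how the two new TypedDicts might be populated. The `get_weather` name and its JSON-schema parameters are made-up illustrations; only the type names and their fields come from the diff above.

```python
from llama_cpp.llama_types import ChatCompletionFunction, ChatCompletionFunctionCall

# An OpenAI-style function definition; name, description and schema are placeholders.
weather_fn: ChatCompletionFunction = {
    "name": "get_weather",
    "description": "Look up the current weather for a city.",
    "parameters": {
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    },
}

# Forcing the model toward a specific function by name.
forced_call: ChatCompletionFunctionCall = {"name": "get_weather"}
```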
llama_cpp/server/app.py

@@ -19,6 +19,9 @@ from pydantic import BaseModel, Field
 from pydantic_settings import BaseSettings
 from sse_starlette.sse import EventSourceResponse

+import numpy as np
+import numpy.typing as npt
+

 class Settings(BaseSettings):
     model: str = Field(
@@ -38,11 +41,13 @@ class Settings(BaseSettings):
         default=None,
         description="Split layers across multiple GPUs in proportion.",
     )
-    rope_freq_base: float = Field(default=10000, ge=1, description="RoPE base frequency")
-    rope_freq_scale: float = Field(default=1.0, description="RoPE frequency scaling factor")
-    seed: int = Field(
-        default=1337, description="Random seed. -1 for random."
-    )
+    rope_freq_base: float = Field(
+        default=10000, ge=1, description="RoPE base frequency"
+    )
+    rope_freq_scale: float = Field(
+        default=1.0, description="RoPE frequency scaling factor"
+    )
+    seed: int = Field(default=1337, description="Random seed. -1 for random.")
     n_batch: int = Field(
         default=512, ge=1, description="The batch size to use per eval."
     )
@@ -559,9 +564,9 @@ def make_logit_bias_processor(
             to_bias[input_id] = score

     def logit_bias_processor(
-        input_ids: List[int],
-        scores: List[float],
-    ) -> List[float]:
+        input_ids: npt.NDArray[np.intc],
+        scores: npt.NDArray[np.single],
+    ) -> npt.NDArray[np.single]:
         new_scores = [None] * len(scores)
         for input_id, score in enumerate(scores):
             new_scores[input_id] = score + to_bias.get(input_id, 0.0)
@@ -594,9 +599,11 @@ async def create_completion(
     kwargs = body.model_dump(exclude=exclude)

     if body.logit_bias is not None:
-        kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([
-            make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type),
-        ])
+        kwargs["logits_processor"] = llama_cpp.LogitsProcessorList(
+            [
+                make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type),
+            ]
+        )

     iterator_or_completion: Union[llama_cpp.Completion, Iterator[
         llama_cpp.CompletionChunk
@@ -663,6 +670,14 @@ class CreateChatCompletionRequest(BaseModel):
     messages: List[ChatCompletionRequestMessage] = Field(
         default=[], description="A list of messages to generate completions for."
     )
+    functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field(
+        default=None,
+        description="A list of functions to apply to the generated completions.",
+    )
+    function_call: Optional[Union[str, llama_cpp.ChatCompletionFunctionCall]] = Field(
+        default=None,
+        description="A function to apply to the generated completions.",
+    )
     max_tokens: int = max_tokens_field
     temperature: float = temperature_field
     top_p: float = top_p_field
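For orientation, a sketch of a request body exercising the new `functions` and `function_call` fields on `CreateChatCompletionRequest`. The model prompt, function schema, host, port and endpoint path are assumptions for illustration, not values taken from the commit.

```python
import json
import urllib.request

payload = {
    "messages": [{"role": "user", "content": "What's the weather in Oslo?"}],
    "functions": [
        {
            "name": "get_weather",
            "description": "Look up the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        }
    ],
    "function_call": {"name": "get_weather"},
}

# Assumes a locally running llama_cpp.server instance on port 8000.
req = urllib.request.Request(
    "http://localhost:8000/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
# with urllib.request.urlopen(req) as resp:
#     print(resp.read().decode("utf-8"))
```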
@@ -721,9 +736,11 @@ async def create_chat_completion(
     kwargs = body.model_dump(exclude=exclude)

     if body.logit_bias is not None:
-        kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([
-            make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type),
-        ])
+        kwargs["logits_processor"] = llama_cpp.LogitsProcessorList(
+            [
+                make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type),
+            ]
+        )

     iterator_or_completion: Union[llama_cpp.ChatCompletion, Iterator[
         llama_cpp.ChatCompletionChunk

llama_cpp/version.py

@@ -1 +0,0 @@
-__version__ = "0.1.85"
poetry.lock (generated, 1803 changes)
File diff suppressed because it is too large

poetry.toml

@@ -1,3 +0,0 @@
-[virtualenvs]
-in-project = true
-prefer-active-python = true
pyproject.toml

@@ -1,48 +1,63 @@
-[tool.poetry]
+[build-system]
+requires = ["scikit-build-core>=0.5.0"]
+build-backend = "scikit_build_core.build"
+
+[project]
 name = "llama_cpp_python"
-version = "0.1.85"
+dynamic = ["version"]
 description = "Python bindings for the llama.cpp library"
-authors = ["Andrei Betlen <abetlen@gmail.com>"]
-license = "MIT"
 readme = "README.md"
-homepage = "https://github.com/abetlen/llama-cpp-python"
-repository = "https://github.com/abetlen/llama-cpp-python"
-packages = [{include = "llama_cpp"}]
-include = [
-    "LICENSE.md",
+license = { text = "MIT" }
+authors = [
+    { name = "Andrei Betlen", email = "abetlen@gmail.com" },
+]
+dependencies = [
+    "typing-extensions>=4.5.0",
+    "numpy>=1.20.0",
+    "diskcache>=5.6.1",
+]
+requires-python = ">=3.8"
+
+[project.optional-dependencies]
+server = [
+    "uvicorn>=0.22.0",
+    "fastapi>=0.100.0",
+    "pydantic-settings>=2.0.1",
+    "sse-starlette>=1.6.1",
+]
+test = [
+    "pytest>=7.4.0",
+    "httpx>=0.24.1",
+]
+dev = [
+    "black>=23.3.0",
+    "twine>=4.0.2",
+    "mkdocs>=1.4.3",
+    "mkdocstrings[python]>=0.22.0",
+    "mkdocs-material>=9.1.18",
+    "pytest>=7.4.0",
+    "httpx>=0.24.1",
+]
+all = [
+    "llama_cpp_python[server,test,dev]",
 ]

-[tool.poetry.dependencies]
-python = "^3.8.1"
-typing-extensions = "^4.7.1"
-numpy = "^1.24.4"
-diskcache = "^5.6.3"
-uvicorn = { version = "^0.23.2", optional = true }
-fastapi = { version = ">=0.100.0", optional = true }
-sse-starlette = { version = ">=1.6.1", optional = true }
-pydantic-settings = { version = ">=2.0.1", optional = true }
+[tool.scikit-build]
+wheel.packages = ["llama_cpp"]
+cmake.verbose = true
+cmake.minimum-version = "3.12"
+minimum-version = "0.5"
+ninja.make-fallback = false

-[tool.poetry.group.dev.dependencies]
-black = "^23.9.1"
-twine = "^4.0.2"
-mkdocs = "^1.5.2"
-mkdocstrings = {extras = ["python"], version = "^0.23.0"}
-mkdocs-material = "^9.3.1"
-pytest = "^7.4.2"
-httpx = "^0.25.0"
-scikit-build = "0.17.6"
+[tool.scikit-build.metadata.version]
+provider = "scikit_build_core.metadata.regex"
+input = "llama_cpp/__init__.py"

-[tool.poetry.extras]
-server = ["uvicorn", "fastapi", "pydantic-settings", "sse-starlette"]
+[project.urls]
+Homepage = "https://github.com/abetlen/llama-cpp-python"
+Issues = "https://github.com/abetlen/llama-cpp-python/issues"

 [tool.pytest.ini_options]
 addopts = "--ignore=vendor"
-
-[build-system]
-requires = [
-    "setuptools>=42",
-    "scikit-build>=0.13",
-    "cmake>=3.18",
-    "ninja",
-]
-build-backend = "setuptools.build_meta"
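With the version now resolved by scikit-build-core's regex metadata provider, the version baked into a built wheel should track `__version__` in `llama_cpp/__init__.py`. A small sanity check one might run after installing a built wheel; this is an illustrative sketch, assuming the distribution name matches the `name` declared above, and is not part of the commit.

```python
# Compare the installed distribution metadata with the in-package constant.
from importlib.metadata import version

import llama_cpp

assert version("llama_cpp_python") == llama_cpp.__version__
print("installed version:", llama_cpp.__version__)
```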
setup.py (35 changes)

@@ -1,35 +0,0 @@
-from skbuild import setup
-
-from pathlib import Path
-
-this_directory = Path(__file__).parent
-long_description = (this_directory / "README.md").read_text(encoding="utf-8")
-
-exec(open('llama_cpp/version.py').read())
-
-setup(
-    name="llama_cpp_python",
-    description="A Python wrapper for llama.cpp",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    version=__version__,
-    author="Andrei Betlen",
-    author_email="abetlen@gmail.com",
-    license="MIT",
-    package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"},
-    package_data={"llama_cpp": ["py.typed"]},
-    packages=["llama_cpp", "llama_cpp.server"],
-    install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"],
-    extras_require={
-        "server": ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"],
-    },
-    python_requires=">=3.7",
-    classifiers=[
-        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
-        "Programming Language :: Python :: 3.9",
-        "Programming Language :: Python :: 3.10",
-        "Programming Language :: Python :: 3.11",
-    ],
-)