diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..fd64c09 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,166 @@ +_skbuild/ + +.envrc + +models/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..5df12aa --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,96 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +# Prerequisites + +Please answer the following questions for yourself before submitting an issue. + +- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now. +- [ ] I carefully followed the [README.md](https://github.com/abetlen/llama-cpp-python/blob/main/README.md). +- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed). +- [ ] I reviewed the [Discussions](https://github.com/abetlen/llama-cpp-python/discussions), and have a new bug or useful enhancement to share. + +# Expected Behavior + +Please provide a detailed written description of what you were trying to do, and what you expected `llama-cpp-python` to do. + +# Current Behavior + +Please provide a detailed written description of what `llama-cpp-python` did, instead. + +# Environment and Context + +Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions. + +* Physical (or virtual) hardware you are using, e.g. for Linux: + +`$ lscpu` + +* Operating System, e.g. for Linux: + +`$ uname -a` + +* SDK version, e.g. for Linux: + +``` +$ python3 --version +$ make --version +$ g++ --version +``` + +# Failure Information (for bugs) + +Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template. + +# Steps to Reproduce + +Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better. + +1. step 1 +2. step 2 +3. step 3 +4. etc. + +**Note: Many issues seem to be regarding functional or performance issues / differences with `llama.cpp`. In these cases we need to confirm that you're comparing against the version of `llama.cpp` that was built with your python package, and which parameters you're passing to the context.** + +Try the following: + +1. `git clone https://github.com/abetlen/llama-cpp-python` +2. `cd llama-cpp-python` +3. `rm -rf _skbuild/` # delete any old builds +4. `python setup.py develop` +5. `cd ./vendor/llama.cpp` +6. Follow [llama.cpp's instructions](https://github.com/ggerganov/llama.cpp#build) to `cmake` llama.cpp +7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggerganov/llama.cpp/issues) + +# Failure Logs + +Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes. + +Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability. + +Example environment info: +``` +llama-cpp-python$ git log | head -1 +commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2 + +llama-cpp-python$ python3 --version +Python 3.10.10 + +llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette|numpy" +fastapi 0.95.0 +numpy 1.24.3 +sse-starlette 1.3.3 +uvicorn 0.21.1 + +llama-cpp-python/vendor/llama.cpp$ git log | head -3 +commit 66874d4fbcc7866377246efbcee938e8cc9c7d76 +Author: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> +Date: Thu May 25 20:18:01 2023 -0600 +``` diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..bbcbbe7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..91abb11 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" # See documentation for possible values + directory: "/" # Location of package manifests + schedule: + interval: "weekly" diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml new file mode 100644 index 0000000..16b00a2 --- /dev/null +++ b/.github/workflows/build-docker.yaml @@ -0,0 +1,39 @@ +name: Build Docker + +on: workflow_dispatch + +permissions: + contents: write + packages: write + +jobs: + docker: + name: Build and push Docker image + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: "true" + + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v4 + with: + context: . + push: true # push to registry + pull: true # always fetch the latest base images + platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 + tags: ghcr.io/abetlen/llama-cpp-python:latest \ No newline at end of file diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 92b6e5b..ddefd68 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -28,4 +28,4 @@ jobs: # if: startsWith(github.ref, 'refs/tags') uses: pypa/gh-action-pypi-publish@release/v1 with: - password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/test-pypi.yaml b/.github/workflows/test-pypi.yaml new file mode 100644 index 0000000..38f2d92 --- /dev/null +++ b/.github/workflows/test-pypi.yaml @@ -0,0 +1,64 @@ +name: Tests for PyPI package + +on: workflow_dispatch + +jobs: + build-linux: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + + steps: + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + python3 -m pip install --verbose llama-cpp-python[server,test] + - name: Test with pytest + run: | + python3 -c "import llama_cpp" + + build-windows: + + runs-on: windows-latest + strategy: + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + + steps: + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + python3 -m pip install --verbose llama-cpp-python[server,test] + - name: Test with pytest + run: | + python3 -c "import llama_cpp" + + build-macos: + + runs-on: macos-latest + strategy: + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + + steps: + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + python3 -m pip install --verbose llama-cpp-python[server,test] + - name: Test with pytest + run: | + python3 -c "import llama_cpp" \ No newline at end of file diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4481085..a73e347 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -26,7 +26,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings pip install . -v - name: Test with pytest run: | @@ -49,7 +49,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings pip install . -v - name: Test with pytest run: | @@ -72,7 +72,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings pip install . -v - name: Test with pytest run: | diff --git a/.gitignore b/.gitignore index fd64c09..36ed7f7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.vscode/ + _skbuild/ .envrc @@ -11,6 +13,10 @@ __pycache__/ # C extensions *.so +*.dylib +*.metal +*.dll +*.lib # Distribution / packaging .Python @@ -164,3 +170,6 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ + +# downloaded model .bin files +docker/open_llama/*.bin diff --git a/.gitmodules b/.gitmodules index 6267b09..7edf097 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "vendor/llama.cpp"] path = vendor/llama.cpp - url = git@github.com:ggerganov/llama.cpp.git + url = https://github.com/ggerganov/llama.cpp.git diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..ff3e950 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,24 @@ +# Read the Docs configuration file for MkDocs projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.11" + +mkdocs: + configuration: mkdocs.yml + +python: + install: + - method: pip + path: . + - requirements: docs/requirements.txt + +submodules: + include: all + recursive: true \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..11251c6 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,117 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +## [Added] + +- (server) Streaming requests can are now interrupted pre-maturely when a concurrent request is made. Can be controlled with the `interrupt_requests` setting. + +## [0.1.68] + +## [Added] + +- (llama.cpp) Update llama.cpp + +## [0.1.67] + +## Fixed + +- Fix performance bug in Llama model by pre-allocating memory tokens and logits. +- Fix bug in Llama model where the model was not free'd after use. + +## [0.1.66] + +## Added + +- (llama.cpp) New model API + +## Fixed + +- Performance issue during eval caused by looped np.concatenate call +- State pickling issue when saving cache to disk + +## [0.1.65] + +### Added + +- (llama.cpp) Fix struct misalignment bug + +## [0.1.64] + +### Added + +- (llama.cpp) Update llama.cpp +- Fix docs for seed. Set -1 for random. + +## [0.1.63] + +### Added + +- (llama.cpp) Add full gpu utilisation in CUDA +- (llama.cpp) Add get_vocab +- (llama.cpp) Add low_vram parameter +- (server) Add logit_bias parameter + +## [0.1.62] + +### Fixed + +- Metal support working +- Cache re-enabled + +## [0.1.61] + +### Fixed + +- Fix broken pip installation + +## [0.1.60] + +### NOTE + +- This release was deleted due to a bug with the packaging system that caused pip installations to fail. + +### Fixed + +- Truncate max_tokens in create_completion so requested tokens doesn't exceed context size. +- Temporarily disable cache for completion requests + +## [v0.1.59] + +### Added + +- (llama.cpp) k-quants support +- (server) mirostat sampling parameters to server + +### Fixed + +- Support both `.so` and `.dylib` for `libllama` on MacOS + +## [v0.1.58] + +### Added + +- (llama.cpp) Metal Silicon support + +## [v0.1.57] + +### Added + +- (llama.cpp) OpenLlama 3B support + +## [v0.1.56] + +### Added + +- (misc) Added first version of the changelog +- (server) Use async routes +- (python-api) Use numpy for internal buffers to reduce memory usage and improve performance. + +### Fixed + +- (python-api) Performance bug in stop sequence check slowing down streaming. \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 27e06ac..788402a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,11 @@ cmake_minimum_required(VERSION 3.4...3.22) project(llama_cpp) -if (UNIX) +option(FORCE_CMAKE "Force CMake build of Python bindings" OFF) + +set(FORCE_CMAKE $ENV{FORCE_CMAKE}) + +if (UNIX AND NOT FORCE_CMAKE) add_custom_command( OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so COMMAND make libllama.so @@ -23,5 +27,8 @@ else() TARGETS llama LIBRARY DESTINATION llama_cpp RUNTIME DESTINATION llama_cpp + ARCHIVE DESTINATION llama_cpp + FRAMEWORK DESTINATION llama_cpp + RESOURCE DESTINATION llama_cpp ) -endif(UNIX) +endif() diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..c359260 --- /dev/null +++ b/Makefile @@ -0,0 +1,66 @@ +update: + poetry install + git submodule update --init --recursive + +update.vendor: + cd vendor/llama.cpp && git pull origin master + +build: + python3 setup.py develop + +build.cuda: + CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 setup.py develop + +build.opencl: + CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 setup.py develop + +build.openblas: + CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 setup.py develop + +build.blis: + CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop + +build.metal: + CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 python3 setup.py develop + +build.sdist: + python3 setup.py sdist + +deploy.pypi: + python3 -m twine upload dist/* + +deploy.gh-docs: + mkdocs build + mkdocs gh-deploy + +test: + python3 -m pytest + +docker: + docker build -t llama-cpp-python:latest -f docker/simple/Dockerfile . + +run-server: + uvicorn --factory llama.server:app --host ${HOST} --port ${PORT} + +clean: + - cd vendor/llama.cpp && make clean + - cd vendor/llama.cpp && rm libllama.so + - rm -rf _skbuild + - rm llama_cpp/*.so + - rm llama_cpp/*.dylib + - rm llama_cpp/*.metal + - rm llama_cpp/*.dll + - rm llama_cpp/*.lib + +.PHONY: \ + update \ + update.vendor \ + build \ + build.cuda \ + build.opencl \ + build.openblas \ + build.sdist \ + deploy.pypi \ + deploy.gh-docs \ + docker \ + clean \ No newline at end of file diff --git a/README.md b/README.md index 2c8c0a5..0322c73 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # 🦙 Python Bindings for `llama.cpp` -[![Documentation](https://img.shields.io/badge/docs-passing-green.svg)](https://abetlen.github.io/llama-cpp-python) +[![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest) [![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml) [![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) @@ -15,16 +15,70 @@ This package provides: - OpenAI-like API - LangChain compatibility -## Installation +Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest). -Install from PyPI: + +## Installation from PyPI (recommended) + +Install from PyPI (requires a c compiler): ```bash pip install llama-cpp-python ``` +The above command will attempt to install the package and build `llama.cpp` from source. +This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. + +If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly: + +```bash +pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir +``` + +Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example: +``` +wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh +bash Miniforge3-MacOSX-arm64.sh +``` +Otherwise, while installing it will build the llama.ccp x86 version which will be 10x slower on Apple Silicon (M1) Mac. + +### Installation with OpenBLAS / cuBLAS / CLBlast / Metal + +`llama.cpp` supports multiple BLAS backends for faster processing. +Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend. + +To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing: + +```bash +CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python +``` + +To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing: + +```bash +CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python +``` + +To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing: + +```bash +CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python +``` + +To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable before installing: + +```bash +CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python +``` + +Detailed MacOS Metal GPU install documentation is available at [docs/install/macos.md](docs/install/macos.md) + ## High-level API +The high-level API provides a simple managed interface through the `Llama` class. + +Below is a short example demonstrating how to use the high-level API to generate text: + ```python >>> from llama_cpp import Llama >>> llm = Llama(model_path="./models/7B/ggml-model.bin") @@ -51,6 +105,15 @@ pip install llama-cpp-python } ``` +### Adjusting the Context Window +The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements. + +For instance, if you want to work with larger contexts, you can expand the context window by setting the n_ctx parameter when initializing the Llama object: + +```python +llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048) +``` + ## Web Server `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. @@ -60,16 +123,40 @@ To install the server package and get started: ```bash pip install llama-cpp-python[server] -export MODEL=./models/7B/ggml-model.bin -python3 -m llama_cpp.server +python3 -m llama_cpp.server --model models/7B/ggml-model.bin ``` Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. +## Docker image + +A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server: + +```bash +docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest +``` + ## Low-level API -The low-level API is a direct `ctypes` binding to the C API provided by `llama.cpp`. -The entire API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and should mirror [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). +The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`. +The entire lowe-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). + +Below is a short example demonstrating how to use the low-level API to tokenize a prompt: + +```python +>>> import llama_cpp +>>> import ctypes +>>> params = llama_cpp.llama_context_default_params() +# use bytes for char * params +>>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params) +>>> max_tokens = params.n_ctx +# use ctypes arrays for array params +>>> tokens = (llama_cpp.llama_token * int(max_tokens))() +>>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True)) +>>> llama_cpp.llama_free(ctx) +``` + +Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API. # Documentation @@ -84,8 +171,19 @@ This package is under active development and I welcome any contributions. To get started, clone the repository and install the package in development mode: ```bash -git clone git@github.com:abetlen/llama-cpp-python.git -git submodule update --init --recursive +git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git +cd llama-cpp-python + +# Install with pip +pip install -e . + +# if you want to use the fastapi / openapi server +pip install -e .[server] + +# If you're a poetry user, installing will also include a virtual environment +poetry install --all-extras +. .venv/bin/activate + # Will need to be re-run any time vendor/llama.cpp is updated python3 setup.py develop ``` diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000..053d311 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,66 @@ +# Install Docker Server + +**Note #1:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR! + +[Install Docker Engine](https://docs.docker.com/engine/install) + +**Note #2:** NVidia GPU CuBLAS support requires a NVidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) + +# Simple Dockerfiles for building the llama-cpp-python server with external model bin files +## openblas_simple - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image +``` +cd ./openblas_simple +docker build -t openblas_simple . +docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t openblas_simple +``` +where `/` is the full path to the model file on the Docker host system. + +## cuda_simple - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image +``` +cd ./cuda_simple +docker build -t cuda_simple . +docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t cuda_simple +``` +where `/` is the full path to the model file on the Docker host system. + +# "Open-Llama-in-a-box" +## Download an Apache V2.0 licensed 3B paramter Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server +``` +$ cd ./open_llama +./build.sh +./start.sh +``` + +# Manually choose your own Llama model from Hugging Face +`python3 ./hug_model.py -a TheBloke -t llama` +You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. +``` +docker $ ls -lh *.bin +-rw-rw-r-- 1 user user 4.8G May 23 18:30 q5_1.bin +lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> q5_1.bin +``` +**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least +**TWICE** as much disk space as the size of the model: + +| Model | Quantized size | +|------:|----------------:| +| 3B | 3 GB | +| 7B | 5 GB | +| 13B | 10 GB | +| 33B | 25 GB | +| 65B | 50 GB | + +**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` + +## Use OpenBLAS +Use if you don't have a NVidia GPU. Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS: +### Build: +`docker build -t openblas .` +### Run: +`docker run --cap-add SYS_RESOURCE -t openblas` + +## Use CuBLAS +### Build: +`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .` +### Run: +`docker run --cap-add SYS_RESOURCE -t cublas` diff --git a/docker/cuda_simple/Dockerfile b/docker/cuda_simple/Dockerfile new file mode 100644 index 0000000..e4a2f07 --- /dev/null +++ b/docker/cuda_simple/Dockerfile @@ -0,0 +1,16 @@ +ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04" +FROM nvidia/cuda:${CUDA_IMAGE} + +# We need to set the host to 0.0.0.0 to allow outside access +ENV HOST 0.0.0.0 + +COPY . . + +# Install the package +RUN apt update && apt install -y python3 python3-pip +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings + +RUN LLAMA_CUBLAS=1 pip install llama-cpp-python + +# Run the server +CMD python3 -m llama_cpp.server diff --git a/docker/open_llama/Dockerfile b/docker/open_llama/Dockerfile new file mode 100644 index 0000000..7788f33 --- /dev/null +++ b/docker/open_llama/Dockerfile @@ -0,0 +1,51 @@ +# Define the image argument and provide a default value +ARG IMAGE=python:3-slim-bullseye + +# Use the image as specified +FROM ${IMAGE} + +# Re-declare the ARG after FROM +ARG IMAGE + +# Update and upgrade the existing packages +RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + python3 \ + python3-pip \ + ninja-build \ + build-essential + +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings + +# Perform the conditional installations based on the image +RUN echo "Image: ${IMAGE}" && \ + if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \ + echo "OpenBLAS install:" && \ + apt-get install -y --no-install-recommends libopenblas-dev && \ + LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \ +else \ + echo "CuBLAS install:" && \ + LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \ +fi + +# Clean up apt cache +RUN rm -rf /var/lib/apt/lists/* + +# Set a working directory for better clarity +WORKDIR /app + +# Copy files to the app directory +RUN echo "Installing model...this can take some time..." +COPY ./model.bin /app/model.bin +COPY ./start_server.sh /app/start_server.sh + +# Make the server start script executable +RUN chmod +x /app/start_server.sh + +# Set environment variable for the host +ENV HOST=0.0.0.0 + +# Expose a port for the server +EXPOSE 8000 + +# Run the server start script +CMD ["/bin/sh", "/app/start_server.sh"] diff --git a/docker/open_llama/build.sh b/docker/open_llama/build.sh new file mode 100755 index 0000000..3a6457d --- /dev/null +++ b/docker/open_llama/build.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +MODEL="open_llama_3b" +# Get open_llama_3b_ggml q5_1 quantization +python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1" +ls -lh *.bin + +# Build the default OpenBLAS image +docker build -t $MODEL . +docker images | egrep "^(REPOSITORY|$MODEL)" + +echo +echo "To start the docker container run:" +echo "docker run -t -p 8000:8000 $MODEL" diff --git a/docker/open_llama/hug_model.py b/docker/open_llama/hug_model.py new file mode 100644 index 0000000..13c5b6b --- /dev/null +++ b/docker/open_llama/hug_model.py @@ -0,0 +1,139 @@ +import requests +import json +import os +import struct +import argparse + +def make_request(url, params=None): + print(f"Making request to {url}...") + response = requests.get(url, params=params) + if response.status_code == 200: + return json.loads(response.text) + else: + print(f"Request failed with status code {response.status_code}") + return None + +def check_magic_and_version(filename): + with open(filename, 'rb') as f: + # Read the first 6 bytes from the file + data = f.read(6) + + # Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int + # and the next 2 bytes as a little-endian unsigned short + magic, version = struct.unpack('= 10485760: # 10 MB + print('.', end='', flush=True) + total_downloaded = 0 + print("\nDownload complete.") + + # Creating a symbolic link from destination to "model.bin" + if os.path.isfile("model.bin"): + os.remove("model.bin") # remove the existing link if any + os.symlink(destination, "model.bin") + else: + print(f"Download failed with status code {response.status_code}") + +def get_user_choice(model_list): + # Print the enumerated list + print("\n") + for i, (model_id, rfilename) in enumerate(model_list): + print(f"{i+1}: Model ID: {model_id}, RFilename: {rfilename}") + + # Get user's choice + choice = input("Choose a model to download by entering the corresponding number: ") + try: + index = int(choice) - 1 + if 0 <= index < len(model_list): + # Return the chosen model + return model_list[index] + else: + print("Invalid choice.") + except ValueError: + print("Invalid input. Please enter a number corresponding to a model.") + except IndexError: + print("Invalid choice. Index out of range.") + + return None + +def main(): + # Create an argument parser + parser = argparse.ArgumentParser(description='Process some parameters.') + + # Arguments + parser.add_argument('-v', '--version', type=int, default=0x0003, + help='hexadecimal version number of ggml file') + parser.add_argument('-a', '--author', type=str, default='TheBloke', + help='HuggingFace author filter') + parser.add_argument('-t', '--tag', type=str, default='llama', + help='HuggingFace tag filter') + parser.add_argument('-s', '--search', type=str, default='', + help='HuggingFace search filter') + parser.add_argument('-f', '--filename', type=str, default='q5_1', + help='HuggingFace model repository filename substring match') + + # Parse the arguments + args = parser.parse_args() + + # Define the parameters + params = { + "author": args.author, + "tags": args.tag, + "search": args.search + } + + models = make_request('https://huggingface.co/api/models', params=params) + if models is None: + return + + model_list = [] + # Iterate over the models + for model in models: + model_id = model['id'] + model_info = make_request(f'https://huggingface.co/api/models/{model_id}') + if model_info is None: + continue + + for sibling in model_info.get('siblings', []): + rfilename = sibling.get('rfilename') + if rfilename and args.filename in rfilename: + model_list.append((model_id, rfilename)) + + # Choose the model + model_list.sort(key=lambda x: x[0]) + if len(model_list) == 0: + print("No models found") + exit(1) + elif len(model_list) == 1: + model_choice = model_list[0] + else: + model_choice = get_user_choice(model_list) + + if model_choice is not None: + model_id, rfilename = model_choice + url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}" + dest = f"{model_id.replace('/', '_')}_{rfilename}" + download_file(url, dest) + _, version = check_magic_and_version(dest) + if version != args.version: + print(f"Warning: Expected version {args.version}, but found different version in the file.") + else: + print("Error - model choice was None") + exit(2) + +if __name__ == '__main__': + main() diff --git a/docker/open_llama/start.sh b/docker/open_llama/start.sh new file mode 100755 index 0000000..7ee8f74 --- /dev/null +++ b/docker/open_llama/start.sh @@ -0,0 +1,28 @@ +#!/bin/sh + +MODEL="open_llama_3b" + +# Start Docker container +docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL & +sleep 10 +echo +docker ps | egrep "(^CONTAINER|$MODEL)" + +# Test the model works +echo +curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{ + "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", + "stop": [ + "\n", + "###" + ] +}' | grep Paris +if [ $? -eq 0 ] +then + echo + echo "$MODEL is working!!" +else + echo + echo "ERROR: $MODEL not replying." + exit 1 +fi diff --git a/docker/open_llama/start_server.sh b/docker/open_llama/start_server.sh new file mode 100755 index 0000000..d3329ee --- /dev/null +++ b/docker/open_llama/start_server.sh @@ -0,0 +1,11 @@ +#!/bin/sh + +# For mlock support +ulimit -l unlimited + +if [ "$IMAGE" = "python:3-slim-bullseye" ]; then + python3 -B -m llama_cpp.server --model /app/model.bin +else + # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM + python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000 +fi diff --git a/docker/openblas_simple/Dockerfile b/docker/openblas_simple/Dockerfile new file mode 100644 index 0000000..8231bdb --- /dev/null +++ b/docker/openblas_simple/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3-slim-bullseye + +# We need to set the host to 0.0.0.0 to allow outside access +ENV HOST 0.0.0.0 + +COPY . . + +# Install the package +RUN apt update && apt install -y libopenblas-dev ninja-build build-essential +RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings + +RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose + +# Run the server +CMD python3 -m llama_cpp.server diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile new file mode 100644 index 0000000..77680c8 --- /dev/null +++ b/docker/simple/Dockerfile @@ -0,0 +1,33 @@ +# Define the image argument and provide a default value +ARG IMAGE=python:3-slim-bullseye + +# Use the image as specified +FROM ${IMAGE} + +# Re-declare the ARG after FROM +ARG IMAGE + +# Update and upgrade the existing packages +RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + python3 \ + python3-pip \ + ninja-build \ + build-essential + +RUN mkdir /app +WORKDIR /app +COPY . /app + +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings + +RUN make build && make clean + +# Set environment variable for the host +ENV HOST=0.0.0.0 +ENV PORT=8000 + +# Expose a port for the server +EXPOSE 8000 + +# Run the server start script +CMD ["/bin/sh", "/app/docker/simple/run.sh"] diff --git a/docker/simple/run.sh b/docker/simple/run.sh new file mode 100644 index 0000000..c85e73d --- /dev/null +++ b/docker/simple/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +make build +uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT diff --git a/docs/api-reference.md b/docs/api-reference.md new file mode 100644 index 0000000..1290cad --- /dev/null +++ b/docs/api-reference.md @@ -0,0 +1,53 @@ +--- +title: API Reference +--- + +::: llama_cpp.Llama + options: + members: + - __init__ + - tokenize + - detokenize + - reset + - eval + - sample + - generate + - create_embedding + - embed + - create_completion + - __call__ + - create_chat_completion + - set_cache + - save_state + - load_state + - token_bos + - token_eos + show_root_heading: true + +::: llama_cpp.LlamaCache + options: + show_root_heading: true + +::: llama_cpp.LlamaState + options: + show_root_heading: true + +::: llama_cpp.LogitsProcessor + options: + show_root_heading: true + +::: llama_cpp.LogitsProcessorList + options: + show_root_heading: true + +::: llama_cpp.StoppingCriteria + options: + show_root_heading: true + +::: llama_cpp.StoppingCriteriaList + options: + show_root_heading: true + +::: llama_cpp.llama_cpp + options: + show_if_no_docstring: true \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 4055155..7d5ccc3 100644 --- a/docs/index.md +++ b/docs/index.md @@ -87,31 +87,6 @@ git submodule update --init --recursive python3 setup.py develop ``` -## API Reference - -::: llama_cpp.Llama - options: - members: - - __init__ - - tokenize - - detokenize - - reset - - eval - - sample - - generate - - create_embedding - - embed - - create_completion - - __call__ - - create_chat_completion - - token_bos - - token_eos - show_root_heading: true - -::: llama_cpp.llama_cpp - options: - show_if_no_docstring: true - ## License This project is licensed under the terms of the MIT license. \ No newline at end of file diff --git a/docs/install/macos.md b/docs/install/macos.md new file mode 100644 index 0000000..3330396 --- /dev/null +++ b/docs/install/macos.md @@ -0,0 +1,59 @@ +--- +title: MacOS Install with Metal GPU +--- + +**(1) Make sure you have xcode installed... at least the command line parts** +``` +# check the path of your xcode install +xcode-select -p + +# xcode installed returns +# /Applications/Xcode-beta.app/Contents/Developer + +# if xcode is missing then install it... it takes ages; +xcode-select --install +``` + +**(2) Install the conda version for MacOS that supports Metal GPU** +``` +wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh +bash Miniforge3-MacOSX-arm64.sh +``` + +**(3) Make a conda environment** +``` +conda create -n llama python=3.9.16 +conda activate llama +``` + +**(4) Install the LATEST llama-cpp-python...which happily supports MacOS Metal GPU as of version 0.1.62** + *(you needed xcode installed in order pip to build/compile the C++ code)* +``` +pip uninstall llama-cpp-python -y +CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir +pip install 'llama-cpp-python[server]' + +# you should now have llama-cpp-python v0.1.62 or higher installed +llama-cpp-python         0.1.68 + +``` + +**(5) Download a v3 ggml model** + - **ggmlv3** + - file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0 + +https://huggingface.co/TheBloke/open-llama-7b-open-instruct-GGML + + +**(6) run the llama-cpp-python API server with MacOS Metal GPU support** +``` +# config your ggml model path +# make sure it is ggml v3 +# make sure it is q4_0 +export MODEL=[path to your llama.cpp ggml models]]/[ggml-model-name]]q4_0.bin +python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1 +``` + +***Note:** If you omit the `--n_gpu_layers 1` then CPU will be used* + + diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..199bd4f --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,3 @@ +mkdocs +mkdocs-material +mkdocstrings[python] \ No newline at end of file diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py index 3ed0eac..4b3189d 100644 --- a/examples/high_level_api/fastapi_server.py +++ b/examples/high_level_api/fastapi_server.py @@ -4,259 +4,34 @@ To run this example: ```bash pip install fastapi uvicorn sse-starlette -export MODEL=../models/7B/ggml-model.bin -uvicorn fastapi_server_chat:app --reload +export MODEL=../models/7B/... +``` + +Then run: +``` +uvicorn llama_cpp.server.app:app --reload +``` + +or + +``` +python3 -m llama_cpp.server ``` Then visit http://localhost:8000/docs to see the interactive API docs. + +To actually see the implementation of the server, see llama_cpp/server/app.py + """ import os -import json -from typing import List, Optional, Literal, Union, Iterator, Dict -from typing_extensions import TypedDict - -import llama_cpp - -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict -from sse_starlette.sse import EventSourceResponse - - -class Settings(BaseSettings): - model: str - n_ctx: int = 2048 - n_batch: int = 8 - n_threads: int = int(os.cpu_count() / 2) or 1 - f16_kv: bool = True - use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... - embedding: bool = True - last_n_tokens_size: int = 64 - - -app = FastAPI( - title="🦙 llama.cpp Python API", - version="0.0.1", -) -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) -settings = Settings() -llama = llama_cpp.Llama( - settings.model, - f16_kv=settings.f16_kv, - use_mlock=settings.use_mlock, - embedding=settings.embedding, - n_threads=settings.n_threads, - n_batch=settings.n_batch, - n_ctx=settings.n_ctx, - last_n_tokens_size=settings.last_n_tokens_size, -) - - -class CreateCompletionRequest(BaseModel): - prompt: str - suffix: Optional[str] = Field(None) - max_tokens: int = 16 - temperature: float = 0.8 - top_p: float = 0.95 - echo: bool = False - stop: List[str] = [] - stream: bool = False - - # ignored or currently unsupported - model: Optional[str] = Field(None) - n: Optional[int] = 1 - logprobs: Optional[int] = Field(None) - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - best_of: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) - - # llama.cpp specific parameters - top_k: int = 40 - repeat_penalty: float = 1.1 - - class Config: - schema_extra = { - "example": { - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": ["\n", "###"], - } - } - - -CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) - - -@app.post( - "/v1/completions", - response_model=CreateCompletionResponse, -) -def create_completion(request: CreateCompletionRequest): - if request.stream: - chunks: Iterator[llama_cpp.CompletionChunk] = llama(**request.dict()) # type: ignore - return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks) - return llama( - **request.dict( - exclude={ - "model", - "n", - "logprobs", - "frequency_penalty", - "presence_penalty", - "best_of", - "logit_bias", - "user", - } - ) - ) - - -class CreateEmbeddingRequest(BaseModel): - model: Optional[str] - input: str - user: Optional[str] - - class Config: - schema_extra = { - "example": { - "input": "The food was delicious and the waiter...", - } - } - - -CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) - - -@app.post( - "/v1/embeddings", - response_model=CreateEmbeddingResponse, -) -def create_embedding(request: CreateEmbeddingRequest): - return llama.create_embedding(**request.dict(exclude={"model", "user"})) - - -class ChatCompletionRequestMessage(BaseModel): - role: Union[Literal["system"], Literal["user"], Literal["assistant"]] - content: str - user: Optional[str] = None - - -class CreateChatCompletionRequest(BaseModel): - model: Optional[str] - messages: List[ChatCompletionRequestMessage] - temperature: float = 0.8 - top_p: float = 0.95 - stream: bool = False - stop: List[str] = [] - max_tokens: int = 128 - - # ignored or currently unsupported - model: Optional[str] = Field(None) - n: Optional[int] = 1 - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) - - # llama.cpp specific parameters - repeat_penalty: float = 1.1 - - class Config: - schema_extra = { - "example": { - "messages": [ - ChatCompletionRequestMessage( - role="system", content="You are a helpful assistant." - ), - ChatCompletionRequestMessage( - role="user", content="What is the capital of France?" - ), - ] - } - } - - -CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) - - -@app.post( - "/v1/chat/completions", - response_model=CreateChatCompletionResponse, -) -async def create_chat_completion( - request: CreateChatCompletionRequest, -) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: - completion_or_chunks = llama.create_chat_completion( - **request.dict( - exclude={ - "model", - "n", - "presence_penalty", - "frequency_penalty", - "logit_bias", - "user", - } - ), - ) - - if request.stream: - - async def server_sent_events( - chat_chunks: Iterator[llama_cpp.ChatCompletionChunk], - ): - for chat_chunk in chat_chunks: - yield dict(data=json.dumps(chat_chunk)) - yield dict(data="[DONE]") - - chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore - - return EventSourceResponse( - server_sent_events(chunks), - ) - completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore - return completion - - -class ModelData(TypedDict): - id: str - object: Literal["model"] - owned_by: str - permissions: List[str] - - -class ModelList(TypedDict): - object: Literal["list"] - data: List[ModelData] - - -GetModelResponse = create_model_from_typeddict(ModelList) - - -@app.get("/v1/models", response_model=GetModelResponse) -def get_models() -> ModelList: - return { - "object": "list", - "data": [ - { - "id": llama.model_path, - "object": "model", - "owned_by": "me", - "permissions": [], - } - ], - } +import uvicorn +from llama_cpp.server.app import create_app if __name__ == "__main__": - import os - import uvicorn + app = create_app() - uvicorn.run(app, host=os.getenv("HOST", "localhost"), port=os.getenv("PORT", 8000)) + uvicorn.run( + app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) + ) diff --git a/examples/low_level_api/Chat.py b/examples/low_level_api/Chat.py new file mode 100644 index 0000000..fcef8cd --- /dev/null +++ b/examples/low_level_api/Chat.py @@ -0,0 +1,71 @@ +#!/bin/python +import sys, os, datetime +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +AI_NAME = env_or_def("AI_NAME", "ChatLLaMa") +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") +USER_NAME = env_or_def("USER_NAME", "USER") +N_PREDICTS = int(env_or_def("N_PREDICTS", "2048")) +N_THREAD = int(env_or_def("N_THREAD", "8")) + +today = datetime.datetime.today() +DATE_YEAR=today.strftime("%Y") +DATE_TIME=today.strftime("%H:%M") + +prompt=f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. +{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision. +There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other. +The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. +The transcript only includes text, it does not include markup like HTML and Markdown. + +{USER_NAME}: Hello, {AI_NAME}! +{AI_NAME}: Hello {USER_NAME}! How may I help you today? +{USER_NAME}: What year is it? +{AI_NAME}: We are in {DATE_YEAR}. +{USER_NAME}: Please tell me the largest city in Europe. +{AI_NAME}: The largest city in Europe is Moscow, the capital of Russia. +{USER_NAME}: What can you tell me about Moscow? +{AI_NAME}: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia’s symbolic center. +{USER_NAME}: What is a cat? +{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. +{USER_NAME}: How do I pass command line arguments to a Node.js program? +{AI_NAME}: The arguments are stored in process.argv. + + argv[0] is the path to the Node. js executable. + argv[1] is the path to the script file. + argv[2] is the first argument passed to the script. + argv[3] is the second argument passed to the script and so on. +{USER_NAME}: Name a color. +{AI_NAME}: Blue. +{USER_NAME}: What time is it? +{AI_NAME}: It is {DATE_TIME}. +{USER_NAME}:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + n_ctx=2048, + temp=0.7, + top_k=40, + top_p=0.5, + repeat_last_n=256, + n_batch=1024, + repeat_penalty=1.17647, + model=MODEL, + n_threads=N_THREAD, + n_predict=N_PREDICTS, + use_color=True, + interactive=True, + antiprompt=[f"{USER_NAME}:"], + input_prefix=" ", + input_suffix=f"{AI_NAME}:", + prompt=prompt, +) + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/low_level_api/Miku.py b/examples/low_level_api/Miku.py new file mode 100644 index 0000000..eb9a2cf --- /dev/null +++ b/examples/low_level_api/Miku.py @@ -0,0 +1,59 @@ +#!/bin/python +import sys, os +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +AI_NAME = env_or_def("AI_NAME", "Miku") +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") +USER_NAME = env_or_def("USER_NAME", "Anon") +N_PREDICTS = int(env_or_def("N_PREDICTS", "4096")) +N_THREAD = int(env_or_def("N_THREAD", "0")) + +prompt=f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer. +{AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. +{AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help. +{AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad. +{AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her. +The conversation is only between {USER_NAME} and {AI_NAME} +The conversation is only through text, so {AI_NAME} can't see {USER_NAME}'s face or hear his voice. +{AI_NAME} can only communicate through text, so she can't send images or videos. + + +{USER_NAME}: Hello! +{AI_NAME}: /think I wonder what I should say to {USER_NAME}? This is the first time we talk so it's important that I make a good first impression! +{AI_NAME}: Hi! I am {AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^ +{AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :) +{USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant! +{AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off! +{AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that! +{AI_NAME}: What do you like to do in your free time? ^_^ +{USER_NAME}:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + n_batch=1024, + n_ctx=2048, + n_keep=-1, + repeat_last_n=256, + repeat_penalty=1.17647, + temp=0.7, + top_k=40, + top_p=0.5, + model=MODEL, + n_predict=N_PREDICTS, + use_color=True, + interactive=True, + antiprompt=[f"{USER_NAME}:"], + prompt=prompt, +) + +if N_THREAD > 0: + params.n_threads = N_THREAD + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/low_level_api/ReasonAct.py b/examples/low_level_api/ReasonAct.py new file mode 100644 index 0000000..82e5c44 --- /dev/null +++ b/examples/low_level_api/ReasonAct.py @@ -0,0 +1,49 @@ +#!/bin/python +import sys, os, datetime +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") + +prompt=f"""You run in a loop of Thought, Action, Observation. +At the end of the loop either Answer or restate your Thought and Action. +Use Thought to describe your thoughts about the question you have been asked. +Use Action to run one of these actions available to you: +- calculate[python math expression] +Observation will be the result of running those actions + + +Question: What is 4 * 7 / 3? +Thought: Do I need to use an action? Yes, I use calculate to do math +Action: calculate[4 * 7 / 3] +Observation: 9.3333333333 +Thought: Do I need to use an action? No, have the result +Answer: The calculate tool says it is 9.3333333333 +Question: What is capital of france? +Thought: Do I need to use an action? No, I know the answer +Answer: Paris is the capital of France +Question:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + interactive=True, + interactive_start=True, + top_k=10000, + temp=0.2, + repeat_penalty=1, + n_threads=7, + n_ctx=2048, + antiprompt=["Question:","Observation:"], + model=MODEL, + input_prefix=" ", + n_predict=-1, + prompt=prompt, +) + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index f16980c..55d08db 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -1,8 +1,9 @@ import os import argparse +import re from dataclasses import dataclass, field -from typing import List, Optional +from typing import List # Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp @@ -12,23 +13,36 @@ class GptParams: seed: int = -1 n_threads: int = min(4, os.cpu_count() or 1) n_predict: int = 128 - repeat_last_n: int = 64 n_parts: int = -1 n_ctx: int = 512 n_batch: int = 8 n_keep: int = 0 + ignore_eos: bool = False + logit_bias: dict[int, float] = field(default_factory=dict) top_k: int = 40 top_p: float = 0.95 + tfs_z: float = 1.00 + typical_p: float = 1.00 temp: float = 0.80 repeat_penalty: float = 1.10 + repeat_last_n: int = 64 + frequency_penalty: float = 0.0 + presence_penalty: float = 0.0 + mirostat: int = 0 + mirostat_tau: float = 5.0 + mirostat_eta: float = 0.1 model: str = "./models/llama-7B/ggml-model.bin" prompt: str = "" + path_session: str = "" input_prefix: str = " " - + input_suffix: str = "" antiprompt: List[str] = field(default_factory=list) + lora_adapter: str = "" + lora_base: str = "" + memory_f16: bool = True random_prompt: bool = False use_color: bool = False @@ -38,7 +52,7 @@ class GptParams: interactive_start: bool = False instruct: bool = False - ignore_eos: bool = False + penalize_nl: bool = True perplexity: bool = False use_mmap: bool = True use_mlock: bool = False @@ -50,8 +64,7 @@ class GptParams: # If chat ended prematurely, append this to the conversation to fix it. # Set to "\nUser:" etc. # This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:"" - fix_prefix: str = " " - output_postfix: str = "" + fix_prefix: str = "" input_echo: bool = True, # Default instructions for Alpaca @@ -61,59 +74,43 @@ class GptParams: instruct_inp_suffix: str="\n\n### Response:\n\n" -def gpt_params_parse(argv = None, params: Optional[GptParams] = None): - if params is None: - params = GptParams() - +def gpt_params_parse(argv = None): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed") parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads") - parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") - parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") + parser.add_argument("-n", "--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict") + parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx") - parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16") - parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") - parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") - parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") - parser.add_argument("--n_predict", type=int, default=128, help="number of model parts",dest="n_predict") - parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") - parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch") parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep") + + parser.add_argument( + "-l", + "--logit-bias", + type=str, + action='append', + help="--logit-bias TOKEN_ID(+/-)BIAS", + dest="logit_bias_str" + ) + parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") + parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") + parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") + parser.add_argument("--tfs", type=float, default=1.0, help="tail free sampling, parameter z (1.0 = disabled)",dest="tfs_z") + parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") + parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") + parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") + parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)",dest="tfs_z") + parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)",dest="presence_penalty") + parser.add_argument("--mirostat", type=float, default=1.0, help="use Mirostat sampling.",dest="mirostat") + parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau represents the average surprise value",dest="mirostat_tau") + parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta") + parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model") - parser.add_argument( - "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" - ) - parser.add_argument("--embedding", action="store_true", help="", dest="embedding") - parser.add_argument( - "--interactive-start", - action="store_true", - help="run in interactive mode", - dest="interactive" - ) - parser.add_argument( - "--interactive-first", - action="store_true", - help="run in interactive mode and wait for input right away", - dest="interactive_start" - ) - parser.add_argument( - "-ins", - "--instruct", - action="store_true", - help="run in instruction mode (use with Alpaca or Vicuna models)", - dest="instruct" - ) - parser.add_argument( - "--color", - action="store_true", - help="colorise output to distinguish prompt and user input from generations", - dest="use_color" - ) - parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") - parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap") - parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") - parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt") + parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") + parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") + parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session") + parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") + parser.add_argument("--in-suffix", type=str, default="", help="append to input", dest="input_suffix") parser.add_argument( "-r", "--reverse-prompt", @@ -122,16 +119,70 @@ def gpt_params_parse(argv = None, params: Optional[GptParams] = None): help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", dest="antiprompt" ) - parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") - parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") - parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") + + parser.add_argument("--lora", type=str, default="", help="apply LoRA adapter (implies --no-mmap)", dest="lora_adapter") + parser.add_argument("--lora-base", type=str, default="", help="optional model to use as a base for the layers modified by the LoRA adapter", dest="lora_base") + + parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16") parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt") - parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") + parser.add_argument( + "--color", + action="store_true", + help="colorise output to distinguish prompt and user input from generations", + dest="use_color" + ) + parser.add_argument( + "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" + ) + + parser.add_argument("--embedding", action="store_true", help="", dest="embedding") + parser.add_argument( + "--interactive-first", + action="store_true", + help="run in interactive mode and wait for input right away", + dest="interactive_start" + ) + + parser.add_argument( + "-ins", + "--instruct", + action="store_true", + help="run in instruction mode (use with Alpaca or Vicuna models)", + dest="instruct" + ) + parser.add_argument("--no-penalize-nl", action="store_false", help="do not penalize newline token", dest="penalize_nl") + parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") + parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap") + parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") + parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") + parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt") + + #Custom args parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") - parser.add_argument("--out-postfix", type=str, default="", help="append to input", dest="output_postfix") parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") + + parser.add_argument( + "--interactive-start", + action="store_true", + help="run in interactive mode", + dest="interactive" + ) + args = parser.parse_args(argv) - return args + + logit_bias_str = args.logit_bias_str + delattr(args, "logit_bias_str") + params = GptParams(**vars(args)) + + if (params.lora_adapter): + params.use_mmap = False + + if (logit_bias_str != None): + for i in logit_bias_str: + if (m := re.match(r"(\d+)([-+]\d+)", i)): + params.logit_bias[int(m.group(1))] = float(m.group(2)) + + return params def gpt_random_prompt(rng): return [ @@ -148,4 +199,4 @@ def gpt_random_prompt(rng): ][rng % 10] if __name__ == "__main__": - print(GptParams(gpt_params_parse())) + print(gpt_params_parse()) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index a61a55e..f5d51a3 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -10,40 +10,14 @@ Quirks: You should also still be feeding the model with a "primer" prompt that shows it the expected format. """ +import ctypes import sys from time import time -from os import cpu_count +from os import cpu_count, path import llama_cpp from common import GptParams, gpt_params_parse, gpt_random_prompt - -ANSI_COLOR_RESET = "\x1b[0m" -ANSI_COLOR_YELLOW = "\x1b[33m" -ANSI_BOLD = "\x1b[1m" -ANSI_COLOR_GREEN = "\x1b[32m" - -CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET -CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW -CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN - -# Iterative search -# Actively searches and prevents a pattern from being returned -class IterSearch: - def __init__(self, pattern): - self.pattern = list(pattern) - self.buffer = [] - - def __call__(self, char): - self.buffer += [char] - - if (self.pattern[:len(self.buffer)] == self.buffer): - if (len(self.buffer) >= len(self.pattern)): - self.buffer.clear() - return [] - - _tmp = self.buffer[:] - self.buffer.clear() - return _tmp +import util # A LLaMA interactive session class LLaMAInteract: @@ -77,9 +51,11 @@ specified) expect poor results""", file=sys.stderr) # runtime args self.input_consumed = 0 self.n_past = 0 + self.n_session_consumed = 0 self.first_antiprompt = [] self.remaining_tokens = self.params.n_predict self.output_echo = self.params.input_echo + self.multibyte_fix = [] # model load self.lparams = llama_cpp.llama_context_default_params() @@ -94,6 +70,19 @@ specified) expect poor results""", file=sys.stderr) if (not self.ctx): raise RuntimeError(f"error: failed to load model '{self.params.model}'") + if (self.params.ignore_eos): + self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf") + + if (len(self.params.lora_adapter) > 0): + if (llama_cpp.llama_apply_lora_from_file( + self.ctx, + self.params.lora_adapter.encode("utf8"), + self.params.lora_base.encode("utf8") if len(self.params.lora_base) > 0 else None, + self.params.n_threads + ) != 0): + print("error: failed to apply lora adapter") + return + print(file=sys.stderr) print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ | {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) @@ -117,13 +106,52 @@ specified) expect poor results""", file=sys.stderr) with open(self.params.file) as f: self.params.prompt = f.read() + self.session_tokens: list[llama_cpp.llama_token] = [] + if (len(self.params.path_session) > 0): + print(f"attempting to load saved session from '{self.params.path_session}'", file=sys.stderr) + + if (path.exists(self.params.path_session)): + _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))() + _n_token_count_out = llama_cpp.c_size_t() + if (llama_cpp.llama_load_session_file( + self.ctx, + self.params.path_session.encode("utf8"), + _session_tokens, + self.params.n_ctx, + ctypes.byref(_n_token_count_out) + ) != 1): + print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr) + return + _n_token_count_out = _n_token_count_out.value + self.session_tokens = _session_tokens[:_n_token_count_out] + print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr) + else: + print(f"session file does not exist, will create", file=sys.stderr) + # tokenize the prompt self.embd = [] self.embd_inp = self._tokenize(self.params.prompt) - if (len(self.embd_inp) > self.params.n_ctx - 4): + if (len(self.embd_inp) > self.n_ctx - 4): raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})") + # debug message about similarity of saved session, if applicable + self.n_matching_session_tokens = 0 + if len(self.session_tokens) > 0: + for id in self.session_tokens: + if self.n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[self.n_matching_session_tokens]: + break + self.n_matching_session_tokens += 1 + + if self.n_matching_session_tokens >= len(self.embd_inp): + print(f"session file has exact match for prompt!") + elif self.n_matching_session_tokens < (len(self.embd_inp) / 2): + print(f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") + else: + print(f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") + + self.need_to_save_session = len(self.params.path_session) > 0 and self.n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) + # number of tokens to keep when resetting context if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct): self.params.n_keep = len(self.embd_inp) @@ -132,11 +160,12 @@ specified) expect poor results""", file=sys.stderr) self.inp_suffix = self._tokenize(self.params.instruct_inp_suffix, False) # in instruct mode, we inject a prefix and a suffix to each input by the user + self.antiecho = None if (self.params.instruct): self.params.interactive_start = True _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False) self.first_antiprompt.append(_ptn) - self.antiecho = IterSearch(_ptn) + self.antiecho = util.IterSearch(_ptn) # enable interactive mode if reverse prompt or interactive start is specified if (len(self.params.antiprompt) != 0 or self.params.interactive_start): @@ -144,6 +173,7 @@ specified) expect poor results""", file=sys.stderr) # determine newline token self.llama_token_newline = self._tokenize("\n", False) + self.llama_token_eot = self._tokenize(" [end of text]\n", False) if (self.params.verbose_prompt): print(f""" @@ -170,16 +200,24 @@ number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr) if len(self.params.input_prefix) > 0: print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr) - print(f"""sampling: temp = {self.params.temp},\ + print(f"""sampling: repeat_last_n = {self.params.repeat_last_n},\ +repeat_penalty = {self.params.repeat_penalty},\ +presence_penalty = {self.params.presence_penalty},\ +frequency_penalty = {self.params.frequency_penalty},\ top_k = {self.params.top_k},\ +tfs_z = {self.params.tfs_z},\ top_p = {self.params.top_p},\ -repeat_last_n = {self.params.repeat_last_n},\ -repeat_penalty = {self.params.repeat_penalty} +typical_p = {self.params.typical_p},\ +temp = {self.params.temp},\ +mirostat = {self.params.mirostat},\ +mirostat_lr = {self.params.mirostat_eta},\ +mirostat_ent = {self.params.mirostat_tau},\ -generate: n_ctx = {self.n_ctx}, \ -n_batch = {self.params.n_batch}, \ -n_predict = {self.params.n_predict}, \ +generate: n_ctx = {self.n_ctx},\ +n_batch = {self.params.n_batch},\ +n_predict = {self.params.n_predict},\ n_keep = {self.params.n_keep} + """, file=sys.stderr) # determine antiprompt tokens @@ -195,24 +233,24 @@ n_keep = {self.params.n_keep} - If you want to submit another line, end your input in '\\'. """, file=sys.stderr) - self.set_color(CONSOLE_COLOR_PROMPT) + self.set_color(util.CONSOLE_COLOR_PROMPT) # tokenize a prompt def _tokenize(self, prompt, bos=True): - _arr = (llama_cpp.llama_token * (len(prompt) + 1))() - _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), _arr, len(_arr), bos) + _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))() + _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos) return _arr[:_n] - def use_antiprompt(self): - return len(self.first_antiprompt) > 0 - def set_color(self, c): if (self.params.use_color): print(c, end="") + def use_antiprompt(self): + return len(self.first_antiprompt) > 0 + # generate tokens def generate(self): - while self.remaining_tokens > 0 or self.params.interactive: + while self.remaining_tokens > 0 or self.params.interactive or self.params.n_predict == -1: # predict if len(self.embd) > 0: # infinite text generation via context swapping @@ -228,43 +266,131 @@ n_keep = {self.params.n_keep} self.n_ctx - int(n_left/2) - len(self.embd):-len(self.embd) ] self.embd = _insert + self.embd + self.params.path_session = "" + + # try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + if self.n_session_consumed < len(self.session_tokens): + for i in range(len(self.embd)): + if self.embd[i] != self.session_tokens[self.n_session_consumed]: + self.session_tokens = self.session_tokens[:self.n_session_consumed] + break + + self.n_past += 1 + self.n_session_consumed += 1 + + if self.n_session_consumed >= len(self.session_tokens): + i += 1 + break + + if i > 0: + self.embd = self.embd[i:] + + # evaluate tokens in batches + # embd is typically prepared beforehand to fit within a batch, but not always + #TODO BUG: The batching code causes nonsensical generation + """for i in range(0, len(self.embd), self.params.n_batch): + n_eval = self.params.n_batch + _arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval]) + if llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0: + print(f"failed to eval") + return + + self.n_past += n_eval""" if (llama_cpp.llama_eval( self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads ) != 0): raise Exception("Failed to llama_eval!") + if len(self.embd) > 0 and len(self.params.path_session) > 0: + self.session_tokens.extend(self.embd) + self.n_session_consumed = len(self.session_tokens) + self.n_past += len(self.embd) self.embd = [] - if len(self.embd_inp) <= self.input_consumed: + if len(self.embd_inp) <= self.input_consumed: #&& !is_interacting # out of user input, sample next token + top_k = llama_cpp.llama_n_vocab(self.ctx) if self.params.top_k <= 0 else self.params.top_k + repeat_last_n = self.n_ctx if self.params.repeat_last_n < 0 else self.params.repeat_last_n - if (self.params.ignore_eos): - logits = llama_cpp.llama_get_logits(self.ctx) - logits[llama_cpp.llama_token_eos()] = llama_cpp.c_float(0) + # optionally save the session on first sample (for faster prompt loading next time) + if len(self.params.path_session) > 0 and self.need_to_save_session: + self.need_to_save_session = False + llama_cpp.llama_save_session_file( + self.ctx, + self.params.path_session.encode("utf8"), + (llama_cpp.llama_token * len(self.session_tokens))(*self.session_tokens), + len(self.session_tokens) + ) + + id = 0 + + logits = llama_cpp.llama_get_logits(self.ctx) + n_vocab = llama_cpp.llama_n_vocab(self.ctx) + + # Apply params.logit_bias map + for key, value in self.params.logit_bias.items(): + logits[key] += value + + _arr = (llama_cpp.llama_token_data * n_vocab)(*[ + llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + for token_id in range(n_vocab) + ]) + candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False)) + + # Apply penalties + nl_logit = logits[llama_cpp.llama_token_nl()] + last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx) + + _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:]) + llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p, + _arr, + last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty)) + llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p, + _arr, + last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty)) + + if not self.params.penalize_nl: + logits[llama_cpp.llama_token_nl()] = nl_logit + + if self.params.temp <= 0: + # Greedy sampling + id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p) + else: + if self.params.mirostat == 1: + mirostat_mu = 2.0 * self.params.mirostat_tau + mirostat_m = 100 + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_int(mirostat_m), llama_cpp.c_float(mirostat_mu)) + elif self.params.mirostat == 2: + mirostat_mu = 2.0 * self.params.mirostat_tau + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu)) + else: + # Temperature sampling + llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z), min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p), min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p), min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token(self.ctx, candidates_p) + # print("`{}`".format(candidates_p.size)) - _arr = self.last_n_tokens[-min(self.params.repeat_last_n, self.n_past):] - id = llama_cpp.llama_sample_top_p_top_k( - self.ctx, - (llama_cpp.llama_token * len(_arr))(*_arr), - len(_arr), - self.params.top_k, - self.params.top_p, - self.params.temp, - self.params.repeat_penalty, - ) self.last_n_tokens.pop(0) self.last_n_tokens.append(id) # replace end of text token with newline token when in interactive mode if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct): id = self.llama_token_newline[0] + self.embd.append(id) if (self.use_antiprompt()): # tokenize and inject first reverse prompt self.embd_inp += self.first_antiprompt[0] - - # add it to the context - self.embd.append(id) + for id in self.first_antiprompt[0]: + self.embd.append(id) + else: + # add it to the context + self.embd.append(id) # echo this to console self.output_echo = True @@ -287,7 +413,7 @@ n_keep = {self.params.n_keep} # display tokens if self.output_echo: for id in self.embd: - if self.params.instruct: + if self.antiecho != None: for r in self.antiecho(id): yield r else: @@ -295,7 +421,7 @@ n_keep = {self.params.n_keep} # reset color to default if we there is no pending user input if (self.params.input_echo and len(self.embd_inp) == self.input_consumed): - self.set_color(CONSOLE_COLOR_DEFAULT) + self.set_color(util.CONSOLE_COLOR_DEFAULT) if (self.params.interactive and len(self.embd_inp) <= self.input_consumed): # if antiprompt is present, stop @@ -313,9 +439,9 @@ n_keep = {self.params.n_keep} # end of text token if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(): if (not self.params.instruct): - for i in " [end of text]\n": + for i in self.llama_token_eot: yield i - break + break # respect n_predict even if antiprompt is present if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1): @@ -336,12 +462,12 @@ n_keep = {self.params.n_keep} def exit(self): llama_cpp.llama_free(self.ctx) - self.set_color(CONSOLE_COLOR_DEFAULT) + self.set_color(util.CONSOLE_COLOR_DEFAULT) # return past text def past(self): for id in self.last_n_tokens[-self.n_past:]: - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf8", errors="ignore") # write input def input(self, prompt: str): @@ -355,7 +481,29 @@ n_keep = {self.params.n_keep} def output(self): self.remaining_tokens = self.params.n_predict for id in self.generate(): - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + cur_char = llama_cpp.llama_token_to_str(self.ctx, id) + + # Add remainder of missing bytes + if None in self.multibyte_fix: + self.multibyte_fix[self.multibyte_fix.index(None)] = cur_char + + # Return completed utf char + if len(self.multibyte_fix) > 0 and not None in self.multibyte_fix: + yield (b"".join(self.multibyte_fix)).decode("utf8") + self.multibyte_fix = [] + continue + + # Contains multi-byte UTF8 + for num, pattern in [(2, 192), (3, 224), (4, 240)]: + # Bitwise AND check + if pattern & int.from_bytes(cur_char, 'little') == pattern: + self.multibyte_fix = [cur_char] + ([None] * (num-1)) + + # Stop incomplete bytes from passing + if len(self.multibyte_fix) > 0: + continue + + yield cur_char.decode("utf8") # read user input def read_input(self): @@ -371,21 +519,21 @@ n_keep = {self.params.n_keep} self.params.input_echo = False while self.params.interactive: - self.set_color(CONSOLE_COLOR_USER_INPUT) + self.set_color(util.CONSOLE_COLOR_USER_INPUT) if (self.params.instruct): print('\n> ', end="") self.input(self.read_input()) else: print(self.params.input_prefix, end="") - self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.output_postfix}") - print(self.params.output_postfix,end="") - self.set_color(CONSOLE_COLOR_DEFAULT) + self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}") + print(self.params.input_suffix,end="") + self.set_color(util.CONSOLE_COLOR_DEFAULT) try: for i in self.output(): print(i,end="",flush=True) except KeyboardInterrupt: - self.set_color(CONSOLE_COLOR_DEFAULT) + self.set_color(util.CONSOLE_COLOR_DEFAULT) if not self.params.instruct: print(self.params.fix_prefix,end="") self.input(self.params.fix_prefix) @@ -414,8 +562,7 @@ The transcript only includes text, it does not include markup like HTML and Mark {USER_NAME}: Name a color. {AI_NAME}: Blue {USER_NAME}:""" - args = gpt_params_parse() - params = GptParams(**vars(args)) + params = gpt_params_parse() with LLaMAInteract(params) as m: m.interact() diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py index b048c0a..9e38ec7 100644 --- a/examples/low_level_api/low_level_api_llama_cpp.py +++ b/examples/low_level_api/low_level_api_llama_cpp.py @@ -37,6 +37,10 @@ embd = [] last_n_size = 64 last_n_tokens_data = [0] * last_n_size n_batch = 24 +last_n_repeat = 64 +repeat_penalty = 1 +frequency_penalty = 0.0 +presence_penalty = 0.0 while remaining_tokens > 0: if len(embd) > 0: @@ -47,15 +51,28 @@ while remaining_tokens > 0: n_past += len(embd) embd = [] if len(embd_inp) <= input_consumed: - id = llama_cpp.llama_sample_top_p_top_k( - ctx, - (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data), - len(last_n_tokens_data), - 40, - 0.8, - 0.2, - 1.0 / 0.85, - ) + logits = llama_cpp.llama_get_logits(ctx) + n_vocab = llama_cpp.llama_n_vocab(ctx) + + _arr = (llama_cpp.llama_token_data * n_vocab)(*[ + llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + for token_id in range(n_vocab) + ]) + candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False)) + + _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data) + llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p, + _arr, + last_n_repeat, repeat_penalty) + llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p, + _arr, + last_n_repeat, frequency_penalty, presence_penalty) + + llama_cpp.llama_sample_top_k(ctx, candidates_p, 40) + llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8) + llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2) + id = llama_cpp.llama_sample_token(ctx, candidates_p) + last_n_tokens_data = last_n_tokens_data[1:] + [id] embd.append(id) input_noecho = False @@ -70,7 +87,7 @@ while remaining_tokens > 0: if not input_noecho: for id in embd: print( - llama_cpp.llama_token_to_str(ctx, id).decode("utf-8"), + llama_cpp.llama_token_to_str(ctx, id).decode("utf-8", errors="ignore"), end="", flush=True, ) diff --git a/examples/low_level_api/util.py b/examples/low_level_api/util.py new file mode 100644 index 0000000..9d0ec2f --- /dev/null +++ b/examples/low_level_api/util.py @@ -0,0 +1,95 @@ + +ANSI_COLOR_RESET = "\x1b[0m" +ANSI_COLOR_YELLOW = "\x1b[33m" +ANSI_BOLD = "\x1b[1m" +ANSI_COLOR_GREEN = "\x1b[32m" + +CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET +CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW +CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN + +# Iterative search +# Actively searches and prevents a pattern from being returned +class IterSearch: + def __init__(self, pattern): + self.pattern = list(pattern) + self.buffer = [] + + def __call__(self, char): + self.buffer += [char] + + if (self.pattern[:len(self.buffer)] == self.buffer): + if (len(self.buffer) >= len(self.pattern)): + self.buffer.clear() + return [] + + _tmp = self.buffer[:] + self.buffer.clear() + return _tmp + +class Circle: + def __init__(self, size, default=0): + self.list = [default] * size + self.maxsize = size + self.size = 0 + self.offset = 0 + + def append(self, elem): + if self.size < self.maxsize: + self.list[self.size] = elem + self.size += 1 + else: + self.list[self.offset] = elem + self.offset = (self.offset + 1) % self.maxsize + + def __getitem__(self, val): + if isinstance(val, int): + if 0 > val or val >= self.size: + raise IndexError('Index out of range') + return self.list[val] if self.size < self.maxsize else self.list[(self.offset + val) % self.maxsize] + elif isinstance(val, slice): + start, stop, step = val.start, val.stop, val.step + if step is None: + step = 1 + if start is None: + start = 0 + if stop is None: + stop = self.size + if start < 0: + start = self.size + start + if stop < 0: + stop = self.size + stop + + indices = range(start, stop, step) + return [self.list[(self.offset + i) % self.maxsize] for i in indices if i < self.size] + else: + raise TypeError('Invalid argument type') + + + + +if __name__ == "__main__": + c = Circle(5) + + c.append(1) + print(c.list) + print(c[:]) + assert c[0] == 1 + assert c[:5] == [1] + + for i in range(2,5+1): + c.append(i) + print(c.list) + print(c[:]) + assert c[0] == 1 + assert c[:5] == [1,2,3,4,5] + + for i in range(5+1,9+1): + c.append(i) + print(c.list) + print(c[:]) + assert c[0] == 5 + assert c[:5] == [5,6,7,8,9] + #assert c[:-5] == [5,6,7,8,9] + assert c[:10] == [5,6,7,8,9] + diff --git a/examples/notebooks/Guidance.ipynb b/examples/notebooks/Guidance.ipynb new file mode 100644 index 0000000..045856e --- /dev/null +++ b/examples/notebooks/Guidance.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Stop program
Tweak this proverb to apply to model instructions instead.\n",
+       "\n",
+       "Where there is no guidance, a people falls,\n",
+       "but in an abundance of counselors there is safety.\n",
+       "- Proverbs 11:14\n",
+       "\n",
+       "UPDATED\n",
+       "Where there is no guidance for assembling a model, people will struggle,\n",
+       "but with clear instructions, the process becomes safe and successful.\n",
+       "- GPT 2 (updated): Proverbs 11:14
\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", + "os.environ[\"OPENAI_API_BASE\"] = \"http://100.64.159.73:8000/v1\"\n", + "os.environ[\"OPENAI_API_HOST\"] = \"http://100.64.159.73:8000\"\n", + "\n", + "import guidance\n", + "\n", + "# set the default language model used to execute guidance programs\n", + "guidance.llm = guidance.llms.OpenAI(\"text-davinci-003\", caching=False)\n", + "\n", + "# define a guidance program that adapts a proverb\n", + "program = guidance(\"\"\"Tweak this proverb to apply to model instructions instead.\n", + "\n", + "{{proverb}}\n", + "- {{book}} {{chapter}}:{{verse}}\n", + "\n", + "UPDATED\n", + "Where there is no guidance{{gen 'rewrite' stop=\"\\\\n-\"}}\n", + "- GPT {{gen 'chapter'}}:{{gen 'verse'}}\"\"\")\n", + "\n", + "# execute the program on a specific proverb\n", + "executed_program = program(\n", + " proverb=\"Where there is no guidance, a people falls,\\nbut in an abundance of counselors there is safety.\",\n", + " book=\"Proverbs\",\n", + " chapter=11,\n", + " verse=14\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 880e42d..62e0dae 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2,13 +2,200 @@ import os import sys import uuid import time +import math import multiprocessing -from typing import List, Optional, Union, Generator, Sequence, Iterator -from collections import deque +from abc import ABC, abstractmethod +from typing import ( + List, + Optional, + Union, + Generator, + Sequence, + Iterator, + Deque, + Tuple, + Callable, +) +from collections import deque, OrderedDict + +import diskcache from . import llama_cpp from .llama_types import * +import numpy as np +import numpy.typing as npt + + +class BaseLlamaCache(ABC): + """Base cache class for a llama.cpp model.""" + + def __init__(self, capacity_bytes: int = (2 << 30)): + self.capacity_bytes = capacity_bytes + + @property + @abstractmethod + def cache_size(self) -> int: + raise NotImplementedError + + def _find_longest_prefix_key( + self, + key: Tuple[int, ...], + ) -> Optional[Tuple[int, ...]]: + pass + + @abstractmethod + def __getitem__(self, key: Sequence[int]) -> "LlamaState": + raise NotImplementedError + + @abstractmethod + def __contains__(self, key: Sequence[int]) -> bool: + raise NotImplementedError + + @abstractmethod + def __setitem__(self, key: Sequence[int], value: "LlamaState") -> None: + raise NotImplementedError + + +class LlamaRAMCache(BaseLlamaCache): + """Cache for a llama.cpp model using RAM.""" + + def __init__(self, capacity_bytes: int = (2 << 30)): + super().__init__(capacity_bytes) + self.capacity_bytes = capacity_bytes + self.cache_state: OrderedDict[Tuple[int, ...], "LlamaState"] = OrderedDict() + + @property + def cache_size(self): + return sum([state.llama_state_size for state in self.cache_state.values()]) + + def _find_longest_prefix_key( + self, + key: Tuple[int, ...], + ) -> Optional[Tuple[int, ...]]: + min_len = 0 + min_key = None + keys = ( + (k, Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys() + ) + for k, prefix_len in keys: + if prefix_len > min_len: + min_len = prefix_len + min_key = k + return min_key + + def __getitem__(self, key: Sequence[int]) -> "LlamaState": + key = tuple(key) + _key = self._find_longest_prefix_key(key) + if _key is None: + raise KeyError("Key not found") + value = self.cache_state[_key] + self.cache_state.move_to_end(_key) + return value + + def __contains__(self, key: Sequence[int]) -> bool: + return self._find_longest_prefix_key(tuple(key)) is not None + + def __setitem__(self, key: Sequence[int], value: "LlamaState"): + key = tuple(key) + if key in self.cache_state: + del self.cache_state[key] + self.cache_state[key] = value + while self.cache_size > self.capacity_bytes and len(self.cache_state) > 0: + self.cache_state.popitem(last=False) + + +# Alias for backwards compatibility +LlamaCache = LlamaRAMCache + + +class LlamaDiskCache(BaseLlamaCache): + """Cache for a llama.cpp model using disk.""" + + def __init__( + self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30) + ): + super().__init__(capacity_bytes) + self.cache = diskcache.Cache(cache_dir) + + @property + def cache_size(self): + return int(self.cache.volume()) # type: ignore + + def _find_longest_prefix_key( + self, + key: Tuple[int, ...], + ) -> Optional[Tuple[int, ...]]: + min_len = 0 + min_key: Optional[Tuple[int, ...]] = None + for k in self.cache.iterkeys(): # type: ignore + prefix_len = Llama.longest_token_prefix(k, key) + if prefix_len > min_len: + min_len = prefix_len + min_key = k # type: ignore + return min_key + + def __getitem__(self, key: Sequence[int]) -> "LlamaState": + key = tuple(key) + _key = self._find_longest_prefix_key(key) + if _key is None: + raise KeyError("Key not found") + value: "LlamaState" = self.cache.pop(_key) # type: ignore + # NOTE: This puts an integer as key in cache, which breaks, + # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens + # self.cache.push(_key, side="front") # type: ignore + return value + + def __contains__(self, key: Sequence[int]) -> bool: + return self._find_longest_prefix_key(tuple(key)) is not None + + def __setitem__(self, key: Sequence[int], value: "LlamaState"): + print("LlamaDiskCache.__setitem__: called", file=sys.stderr) + key = tuple(key) + if key in self.cache: + print("LlamaDiskCache.__setitem__: delete", file=sys.stderr) + del self.cache[key] + self.cache[key] = value + print("LlamaDiskCache.__setitem__: set", file=sys.stderr) + while self.cache_size > self.capacity_bytes and len(self.cache) > 0: + key_to_remove = next(iter(self.cache)) + del self.cache[key_to_remove] + print("LlamaDiskCache.__setitem__: trim", file=sys.stderr) + + +class LlamaState: + def __init__( + self, + input_ids: npt.NDArray[np.intc], + scores: npt.NDArray[np.single], + n_tokens: int, + llama_state: bytes, + llama_state_size: int, + ): + self.input_ids = input_ids + self.scores = scores + self.n_tokens = n_tokens + self.llama_state = llama_state + self.llama_state_size = llama_state_size + + +LogitsProcessor = Callable[[List[int], List[float]], List[float]] + + +class LogitsProcessorList(List[LogitsProcessor]): + def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]: + for processor in self: + scores = processor(input_ids, scores) + return scores + + +StoppingCriteria = Callable[[List[int], List[float]], bool] + + +class StoppingCriteriaList(List[StoppingCriteria]): + def __call__(self, input_ids: List[int], logits: List[float]) -> bool: + return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) + class Llama: """High-level Python wrapper for a llama.cpp model.""" @@ -19,16 +206,20 @@ class Llama: # NOTE: These parameters are likely to change in the future. n_ctx: int = 512, n_parts: int = -1, + n_gpu_layers: int = 0, seed: int = 1337, - f16_kv: bool = False, + f16_kv: bool = True, logits_all: bool = False, vocab_only: bool = False, use_mmap: bool = True, use_mlock: bool = False, embedding: bool = False, n_threads: Optional[int] = None, - n_batch: int = 8, + n_batch: int = 512, last_n_tokens_size: int = 64, + lora_base: Optional[str] = None, + lora_path: Optional[str] = None, + low_vram: bool = False, verbose: bool = True, ): """Load a llama.cpp model from `model_path`. @@ -37,7 +228,7 @@ class Llama: model_path: Path to the model. n_ctx: Maximum context size. n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. - seed: Random seed. 0 for random. + seed: Random seed. -1 for random. f16_kv: Use half-precision for key/value cache. logits_all: Return logits for all tokens, not just the last token. vocab_only: Only load the vocabulary no weights. @@ -47,6 +238,8 @@ class Llama: n_threads: Number of threads to use. If None, the number of threads is automatically determined. n_batch: Maximum number of prompt tokens to batch together when calling llama_eval. last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. + lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. + lora_path: Path to a LoRA file to apply to the model. verbose: Print verbose output to stderr. Raises: @@ -60,36 +253,106 @@ class Llama: self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx - self.params.n_parts = n_parts + self.params.n_gpu_layers = n_gpu_layers self.params.seed = seed self.params.f16_kv = f16_kv self.params.logits_all = logits_all self.params.vocab_only = vocab_only - self.params.use_mmap = use_mmap + self.params.use_mmap = use_mmap if lora_path is None else False self.params.use_mlock = use_mlock self.params.embedding = embedding + self.params.low_vram = low_vram self.last_n_tokens_size = last_n_tokens_size - self.last_n_tokens_data = deque( - [llama_cpp.llama_token(0)] * self.last_n_tokens_size, - maxlen=self.last_n_tokens_size, - ) - self.tokens_consumed = 0 self.n_batch = min(n_ctx, n_batch) + self.cache: Optional[BaseLlamaCache] = None + self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) + self.lora_base = lora_base + self.lora_path = lora_path + + ### DEPRECATED ### + self.n_parts = n_parts + ### DEPRECATED ### + if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") - self.ctx = llama_cpp.llama_init_from_file( + self.model = llama_cpp.llama_load_model_from_file( self.model_path.encode("utf-8"), self.params ) + assert self.model is not None + + self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params) + + assert self.ctx is not None + + if self.lora_path: + if llama_cpp.llama_model_apply_lora_from_file( + self.model, + llama_cpp.c_char_p(self.lora_path.encode("utf-8")), + llama_cpp.c_char_p(self.lora_base.encode("utf-8")) + if self.lora_base is not None + else llama_cpp.c_char_p(0), + llama_cpp.c_int(self.n_threads), + ): + raise RuntimeError( + f"Failed to apply LoRA from lora path: {self.lora_path} to base path: {self.lora_base}" + ) if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]: + self._n_vocab = self.n_vocab() + self._n_ctx = self.n_ctx() + size = llama_cpp.c_size_t(self._n_vocab) + sorted = llama_cpp.c_bool(False) + self._candidates_data = np.array( + [], + dtype=np.dtype( + [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True + ), + ) + self._candidates_data.resize(3, self._n_vocab, refcheck=False) + candidates = llama_cpp.llama_token_data_array( + data=self._candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p), + size=size, + sorted=sorted, + ) + self._candidates = candidates + self._token_nl = Llama.token_nl() + self._token_eos = Llama.token_eos() + self._candidates_data_id = np.arange(self._n_vocab, dtype=np.intc) # type: ignore + self._candidates_data_p = np.zeros(self._n_vocab, dtype=np.single) + + self.n_tokens = 0 + self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) + self.scores: npt.NDArray[np.single] = np.ndarray( + (n_ctx, self._n_vocab), dtype=np.single + ) + + @property + def _input_ids(self) -> npt.NDArray[np.intc]: + return self.input_ids[: self.n_tokens] + + @property + def _scores(self) -> npt.NDArray[np.single]: + return self.scores[: self.n_tokens, :] + + @property + def eval_tokens(self) -> Deque[int]: + return deque(self.input_ids[: self.n_tokens].tolist(), maxlen=self._n_ctx) + + @property + def eval_logits(self) -> Deque[List[float]]: + return deque( + self.scores[: self.n_tokens, :].tolist(), + maxlen=self._n_ctx if self.params.logits_all else 1, + ) + + def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]: """Tokenize a string. Args: @@ -102,20 +365,32 @@ class Llama: A list of tokens. """ assert self.ctx is not None - n_ctx = llama_cpp.llama_n_ctx(self.ctx) - tokens = (llama_cpp.llama_token * int(n_ctx))() + n_ctx = self._n_ctx + tokens = (llama_cpp.llama_token * n_ctx)() n_tokens = llama_cpp.llama_tokenize( self.ctx, text, tokens, - n_ctx, - llama_cpp.c_bool(True), + llama_cpp.c_int(n_ctx), + llama_cpp.c_bool(add_bos), ) - if int(n_tokens) < 0: - raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}') + if n_tokens < 0: + n_tokens = abs(n_tokens) + tokens = (llama_cpp.llama_token * n_tokens)() + n_tokens = llama_cpp.llama_tokenize( + self.ctx, + text, + tokens, + llama_cpp.c_int(n_tokens), + llama_cpp.c_bool(add_bos), + ) + if n_tokens < 0: + raise RuntimeError( + f'Failed to tokenize: text="{text}" n_tokens={n_tokens}' + ) return list(tokens[:n_tokens]) - def detokenize(self, tokens: List[llama_cpp.llama_token]) -> bytes: + def detokenize(self, tokens: List[int]) -> bytes: """Detokenize a list of tokens. Args: @@ -127,45 +402,204 @@ class Llama: assert self.ctx is not None output = b"" for token in tokens: - output += llama_cpp.llama_token_to_str(self.ctx, token) + output += llama_cpp.llama_token_to_str( + self.ctx, llama_cpp.llama_token(token) + ) return output + def set_cache(self, cache: Optional[BaseLlamaCache]): + """Set the cache. + + Args: + cache: The cache to set. + """ + self.cache = cache + def reset(self): """Reset the model state.""" - self.last_n_tokens_data.extend( - [llama_cpp.llama_token(0)] * self.last_n_tokens_size - ) - self.tokens_consumed = 0 + self.n_tokens = 0 - def eval(self, tokens: Sequence[llama_cpp.llama_token]): + def eval(self, tokens: Sequence[int]): """Evaluate a list of tokens. Args: tokens: The list of tokens to evaluate. """ assert self.ctx is not None - n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) + n_ctx = self._n_ctx for i in range(0, len(tokens), self.n_batch): batch = tokens[i : min(len(tokens), i + self.n_batch)] - n_past = min(n_ctx - len(batch), self.tokens_consumed) + n_past = min(n_ctx - len(batch), len(self._input_ids)) + n_tokens = len(batch) return_code = llama_cpp.llama_eval( ctx=self.ctx, tokens=(llama_cpp.llama_token * len(batch))(*batch), - n_tokens=llama_cpp.c_int(len(batch)), + n_tokens=llama_cpp.c_int(n_tokens), n_past=llama_cpp.c_int(n_past), n_threads=llama_cpp.c_int(self.n_threads), ) - if int(return_code) != 0: + if return_code != 0: raise RuntimeError(f"llama_eval returned {return_code}") - self.last_n_tokens_data.extend(batch) - self.tokens_consumed += len(batch) + # Save tokens + self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch + # Save logits + rows = n_tokens if self.params.logits_all else 1 + cols = self._n_vocab + offset = ( + 0 if self.params.logits_all else n_tokens - 1 + ) # NOTE: Only save the last token logits if logits_all is False + self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape( + -1 + )[:] = llama_cpp.llama_get_logits(self.ctx)[: rows * cols] + # Update n_tokens + self.n_tokens += n_tokens + + def _sample( + self, + last_n_tokens_data, # type: llama_cpp.Array[llama_cpp.llama_token] + last_n_tokens_size: llama_cpp.c_int, + top_k: llama_cpp.c_int, + top_p: llama_cpp.c_float, + temp: llama_cpp.c_float, + tfs_z: llama_cpp.c_float, + repeat_penalty: llama_cpp.c_float, + frequency_penalty: llama_cpp.c_float, + presence_penalty: llama_cpp.c_float, + mirostat_mode: llama_cpp.c_int, + mirostat_tau: llama_cpp.c_float, + mirostat_eta: llama_cpp.c_float, + penalize_nl: bool = True, + logits_processor: Optional[LogitsProcessorList] = None, + ): + assert self.ctx is not None + assert self.n_tokens > 0 + n_vocab = self._n_vocab + n_ctx = self._n_ctx + top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k + last_n_tokens_size = ( + llama_cpp.c_int(n_ctx) + if last_n_tokens_size.value < 0 + else last_n_tokens_size + ) + logits: npt.NDArray[np.single] = self._scores[-1, :] + + if logits_processor is not None: + logits = np.array( + logits_processor(self._input_ids.tolist(), logits.tolist()), + dtype=np.single, + ) + self._scores[-1, :] = logits + + nl_logit = logits[self._token_nl] + candidates = self._candidates + candidates_data = self._candidates_data + candidates_data["id"][:] = self._candidates_data_id # type: ignore + candidates_data["logit"][:] = logits + candidates_data["p"][:] = self._candidates_data_p # type: ignore + candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) + candidates.sorted = llama_cpp.c_bool(False) + candidates.size = llama_cpp.c_size_t(n_vocab) + llama_cpp.llama_sample_repetition_penalty( + ctx=self.ctx, + last_tokens_data=last_n_tokens_data, + last_tokens_size=last_n_tokens_size, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + penalty=repeat_penalty, + ) + llama_cpp.llama_sample_frequency_and_presence_penalties( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + last_tokens_data=last_n_tokens_data, + last_tokens_size=last_n_tokens_size, + alpha_frequency=frequency_penalty, + alpha_presence=presence_penalty, + ) + if not penalize_nl: + candidates.data[self._token_nl].logit = llama_cpp.c_float(nl_logit) + if temp.value == 0.0: + return llama_cpp.llama_sample_token_greedy( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + ) + elif mirostat_mode.value == 1: + mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) + mirostat_m = llama_cpp.c_int(100) + llama_cpp.llama_sample_temperature( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + temp=temp, + ) + return llama_cpp.llama_sample_token_mirostat( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + tau=mirostat_tau, + eta=mirostat_eta, + mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore + m=mirostat_m, + ) + elif mirostat_mode.value == 2: + mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) + llama_cpp.llama_sample_temperature( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + temp=temp, + ) + return llama_cpp.llama_sample_token_mirostat_v2( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + tau=mirostat_tau, + eta=mirostat_eta, + mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore + ) + else: + llama_cpp.llama_sample_top_k( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + k=top_k, + min_keep=llama_cpp.c_size_t(1), + ) + llama_cpp.llama_sample_tail_free( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + z=tfs_z, + min_keep=llama_cpp.c_size_t(1), + ) + llama_cpp.llama_sample_typical( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + p=llama_cpp.c_float(1.0), + min_keep=llama_cpp.c_size_t(1), + ) + llama_cpp.llama_sample_top_p( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + p=top_p, + min_keep=llama_cpp.c_size_t(1), + ) + llama_cpp.llama_sample_temperature( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + temp=temp, + ) + return llama_cpp.llama_sample_token( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + ) def sample( self, - top_k: int, - top_p: float, - temp: float, - repeat_penalty: float, + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_eta: float = 0.1, + mirostat_tau: float = 5.0, + penalize_nl: bool = True, + logits_processor: Optional[LogitsProcessorList] = None, ): """Sample a token from the model. @@ -179,28 +613,45 @@ class Llama: The sampled token. """ assert self.ctx is not None - return llama_cpp.llama_sample_top_p_top_k( - ctx=self.ctx, + last_n_tokens_data = [llama_cpp.llama_token(0)] * max( + 0, self.last_n_tokens_size - len(self._input_ids) + ) + self._input_ids[-self.last_n_tokens_size :].tolist() + return self._sample( last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)( - *self.last_n_tokens_data + *last_n_tokens_data ), last_n_tokens_size=llama_cpp.c_int(self.last_n_tokens_size), top_k=llama_cpp.c_int(top_k), top_p=llama_cpp.c_float(top_p), temp=llama_cpp.c_float(temp), + tfs_z=llama_cpp.c_float(tfs_z), repeat_penalty=llama_cpp.c_float(repeat_penalty), + frequency_penalty=llama_cpp.c_float(frequency_penalty), + presence_penalty=llama_cpp.c_float(presence_penalty), + mirostat_mode=llama_cpp.c_int(mirostat_mode), + mirostat_tau=llama_cpp.c_float(mirostat_tau), + mirostat_eta=llama_cpp.c_float(mirostat_eta), + penalize_nl=penalize_nl, + logits_processor=logits_processor, ) def generate( self, - tokens: Sequence[llama_cpp.llama_token], - top_k: int, - top_p: float, - temp: float, - repeat_penalty: float, - ) -> Generator[ - llama_cpp.llama_token, Optional[Sequence[llama_cpp.llama_token]], None - ]: + tokens: Sequence[int], + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, + reset: bool = True, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. Examples: @@ -215,12 +666,30 @@ class Llama: top_p: The top-p sampling parameter. temp: The temperature parameter. repeat_penalty: The repeat penalty parameter. + reset: Whether to reset the model state. Yields: The generated tokens. """ assert self.ctx is not None - self.reset() + + if reset and len(self._input_ids) > 0: + longest_prefix = 0 + for a, b in zip(self._input_ids, tokens[:-1]): + if a == b: + longest_prefix += 1 + else: + break + if longest_prefix > 0: + if self.verbose: + print("Llama.generate: prefix-match hit", file=sys.stderr) + reset = False + tokens = tokens[longest_prefix:] + self.n_tokens = longest_prefix + + if reset: + self.reset() + while True: self.eval(tokens) token = self.sample( @@ -228,13 +697,26 @@ class Llama: top_p=top_p, temp=temp, repeat_penalty=repeat_penalty, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + logits_processor=logits_processor, ) + if stopping_criteria is not None and stopping_criteria( + self._input_ids.tolist(), self._scores[-1, :].tolist() + ): + return tokens_or_none = yield token tokens = [token] if tokens_or_none is not None: tokens.extend(tokens_or_none) - def create_embedding(self, input: str) -> Embedding: + def create_embedding( + self, input: Union[str, List[str]], model: Optional[str] = None + ) -> Embedding: """Embed a string. Args: @@ -244,6 +726,7 @@ class Llama: An embedding object. """ assert self.ctx is not None + model_name: str = model if model is not None else self.model_path if self.params.embedding == False: raise RuntimeError( @@ -253,30 +736,40 @@ class Llama: if self.verbose: llama_cpp.llama_reset_timings(self.ctx) - tokens = self.tokenize(input.encode("utf-8")) - self.reset() - self.eval(tokens) - n_tokens = len(tokens) - embedding = llama_cpp.llama_get_embeddings(self.ctx)[ - : llama_cpp.llama_n_embd(self.ctx) - ] + if isinstance(input, str): + inputs = [input] + else: + inputs = input + data: List[EmbeddingData] = [] + total_tokens = 0 + for index, input in enumerate(inputs): + tokens = self.tokenize(input.encode("utf-8")) + self.reset() + self.eval(tokens) + n_tokens = len(tokens) + total_tokens += n_tokens + embedding = llama_cpp.llama_get_embeddings(self.ctx)[ + : llama_cpp.llama_n_embd(self.ctx) + ] + + data.append( + { + "object": "embedding", + "embedding": embedding, + "index": index, + } + ) if self.verbose: llama_cpp.llama_print_timings(self.ctx) return { "object": "list", - "data": [ - { - "object": "embedding", - "embedding": embedding, - "index": 0, - } - ], - "model": self.model_path, + "data": data, + "model": model_name, "usage": { - "prompt_tokens": n_tokens, - "total_tokens": n_tokens, + "prompt_tokens": total_tokens, + "total_tokens": total_tokens, }, } @@ -300,19 +793,33 @@ class Llama: top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, - stop: List[str] = [], + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, - ) -> Union[Iterator[Completion], Iterator[CompletionChunk],]: + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, + ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None - completion_id = f"cmpl-{str(uuid.uuid4())}" - created = int(time.time()) - completion_tokens: List[llama_cpp.llama_token] = [] + + completion_id: str = f"cmpl-{str(uuid.uuid4())}" + created: int = int(time.time()) + completion_tokens: List[int] = [] # Add blank space to start of prompt to match OG llama tokenizer - prompt_tokens = self.tokenize(b" " + prompt.encode("utf-8")) - text = b"" - returned_characters = 0 + prompt_tokens: List[int] = self.tokenize(b" " + prompt.encode("utf-8")) + text: bytes = b"" + returned_tokens: int = 0 + stop = ( + stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] + ) + model_name: str = model if model is not None else self.model_path if self.verbose: llama_cpp.llama_reset_timings(self.ctx) @@ -327,29 +834,82 @@ class Llama: max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens) elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)): raise ValueError( - f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" + f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}" ) + # Truncate max_tokens if requested tokens would exceed the context window + max_tokens = ( + max_tokens + if max_tokens + len(prompt_tokens) < self._n_ctx + else (self._n_ctx - len(prompt_tokens)) + ) + if stop != []: stop_sequences = [s.encode("utf-8") for s in stop] else: stop_sequences = [] - finish_reason = None + if logprobs is not None and self.params.logits_all is False: + raise ValueError( + "logprobs is not supported for models created with logits_all=False" + ) + + if self.cache: + try: + cache_item = self.cache[prompt_tokens] + cache_prefix_len = Llama.longest_token_prefix( + cache_item.input_ids.tolist(), prompt_tokens + ) + eval_prefix_len = Llama.longest_token_prefix( + self._input_ids.tolist(), prompt_tokens + ) + if cache_prefix_len > eval_prefix_len: + self.load_state(cache_item) + if self.verbose: + print("Llama._create_completion: cache hit", file=sys.stderr) + except KeyError: + if self.verbose: + print("Llama._create_completion: cache miss", file=sys.stderr) + + finish_reason = "length" + multibyte_fix = 0 for token in self.generate( prompt_tokens, top_k=top_k, top_p=top_p, temp=temperature, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, ): - if token == llama_cpp.llama_token_eos(): + if token == self._token_eos: text = self.detokenize(completion_tokens) finish_reason = "stop" break + completion_tokens.append(token) all_text = self.detokenize(completion_tokens) + + # Contains multi-byte UTF8 + for k, char in enumerate(all_text[-3:]): + k = 3 - k + for num, pattern in [(2, 192), (3, 224), (4, 240)]: + # Bitwise AND check + if num > k and pattern & char == pattern: + multibyte_fix = num - k + + # Stop incomplete bytes from passing + if multibyte_fix > 0: + multibyte_fix -= 1 + continue + any_stop = [s for s in stop_sequences if s in all_text] if len(any_stop) > 0: first_stop = any_stop[0] @@ -358,82 +918,289 @@ class Llama: break if stream: - start = returned_characters - longest = 0 + remaining_tokens = completion_tokens[returned_tokens:] + remaining_text = self.detokenize(remaining_tokens) + remaining_length = len(remaining_text) + # We want to avoid yielding any characters from # the generated text if they are part of a stop # sequence. + first_stop_position = 0 for s in stop_sequences: - for i in range(len(s), 0, -1): - if all_text.endswith(s[:i]): - if i > longest: - longest = i + for i in range(min(len(s), remaining_length), 0, -1): + if remaining_text.endswith(s[:i]): + if i > first_stop_position: + first_stop_position = i break - text = all_text[: len(all_text) - longest] - returned_characters += len(text[start:]) - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": self.model_path, - "choices": [ - { - "text": text[start:].decode("utf-8"), - "index": 0, - "logprobs": None, - "finish_reason": None, + + token_end_position = 0 + for token in remaining_tokens: + token_end_position += len(self.detokenize([token])) + # Check if stop sequence is in the token + if token_end_position >= ( + remaining_length - first_stop_position - 1 + ): + break + logprobs_or_none: Optional[CompletionLogprobs] = None + if logprobs is not None: + token_str = self.detokenize([token]).decode( + "utf-8", errors="ignore" + ) + text_offset = len(prompt) + len( + self.detokenize(completion_tokens[:returned_tokens]) + ) + token_offset = len(prompt_tokens) + returned_tokens + logits = self._scores[token_offset - 1, :].tolist() + current_logprobs = Llama.logits_to_logprobs(logits) + sorted_logprobs = list( + sorted( + zip(current_logprobs, range(len(current_logprobs))), + reverse=True, + ) + ) + top_logprob = { + self.detokenize([i]).decode( + "utf-8", errors="ignore" + ): logprob + for logprob, i in sorted_logprobs[:logprobs] } - ], - } + top_logprob.update({token_str: current_logprobs[int(token)]}) + logprobs_or_none = { + "tokens": [ + self.detokenize([token]).decode( + "utf-8", errors="ignore" + ) + ], + "text_offset": [text_offset], + "token_logprobs": [current_logprobs[int(token)]], + "top_logprobs": [top_logprob], + } + returned_tokens += 1 + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": self.detokenize([token]).decode( + "utf-8", errors="ignore" + ), + "index": 0, + "logprobs": logprobs_or_none, + "finish_reason": None, + } + ], + } + if len(completion_tokens) >= max_tokens: text = self.detokenize(completion_tokens) finish_reason = "length" break - if finish_reason is None: - finish_reason = "length" - - if stream: - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": self.model_path, - "choices": [ - { - "text": text[returned_characters:].decode("utf-8"), - "index": 0, - "logprobs": None, - "finish_reason": finish_reason, - } - ], - } - return - - text = text.decode("utf-8") - - if echo: - text = prompt + text - - if suffix is not None: - text = text + suffix - - if logprobs is not None: - raise NotImplementedError("logprobs not implemented") + if stopping_criteria is not None and stopping_criteria( + self._input_ids.tolist(), self._scores[-1, :].tolist() + ): + text = self.detokenize(completion_tokens) + finish_reason = "stop" if self.verbose: llama_cpp.llama_print_timings(self.ctx) + if stream: + remaining_tokens = completion_tokens[returned_tokens:] + all_text = self.detokenize(remaining_tokens) + any_stop = [s for s in stop_sequences if s in all_text] + if len(any_stop) > 0: + end = min(all_text.index(stop) for stop in any_stop) + else: + end = len(all_text) + + token_end_position = 0 + for token in remaining_tokens: + token_end_position += len(self.detokenize([token])) + + logprobs_or_none: Optional[CompletionLogprobs] = None + if logprobs is not None: + token_str = self.detokenize([token]).decode( + "utf-8", errors="ignore" + ) + text_offset = len(prompt) + len( + self.detokenize(completion_tokens[:returned_tokens]) + ) + token_offset = len(prompt_tokens) + returned_tokens - 1 + logits = self._scores[token_offset, :].tolist() + current_logprobs = Llama.logits_to_logprobs(logits) + sorted_logprobs = list( + sorted( + zip(current_logprobs, range(len(current_logprobs))), + reverse=True, + ) + ) + top_logprob = { + self.detokenize([i]).decode("utf-8", errors="ignore"): logprob + for logprob, i in sorted_logprobs[:logprobs] + } + top_logprob.update({token_str: current_logprobs[int(token)]}) + logprobs_or_none = { + "tokens": [ + self.detokenize([token]).decode("utf-8", errors="ignore") + ], + "text_offset": [text_offset], + "token_logprobs": [current_logprobs[int(token)]], + "top_logprobs": [top_logprob], + } + + if token_end_position >= end: + last_text = self.detokenize([token]) + if token_end_position == end - 1: + break + returned_tokens += 1 + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": last_text[ + : len(last_text) - (token_end_position - end) + ].decode("utf-8", errors="ignore"), + "index": 0, + "logprobs": logprobs_or_none, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": "", + "index": 0, + "logprobs": None, + "finish_reason": finish_reason, + } + ], + } + break + returned_tokens += 1 + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": self.detokenize([token]).decode( + "utf-8", errors="ignore" + ), + "index": 0, + "logprobs": logprobs_or_none, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": "", + "index": 0, + "logprobs": None, + "finish_reason": finish_reason, + } + ], + } + if self.cache: + if self.verbose: + print("Llama._create_completion: cache save", file=sys.stderr) + self.cache[prompt_tokens + completion_tokens] = self.save_state() + print("Llama._create_completion: cache saved", file=sys.stderr) + return + + if self.cache: + if self.verbose: + print("Llama._create_completion: cache save", file=sys.stderr) + self.cache[prompt_tokens + completion_tokens] = self.save_state() + + text_str = text.decode("utf-8", errors="ignore") + + if echo: + text_str = prompt + text_str + + if suffix is not None: + text_str = text_str + suffix + + logprobs_or_none: Optional[CompletionLogprobs] = None + if logprobs is not None: + text_offset = 0 if echo else len(prompt) + token_offset = 0 if echo else len(prompt_tokens[1:]) + text_offsets: List[int] = [] + token_logprobs: List[Optional[float]] = [] + tokens: List[str] = [] + top_logprobs: List[Optional[Dict[str, float]]] = [] + + if echo: + # Remove leading BOS token + all_tokens = prompt_tokens[1:] + completion_tokens + else: + all_tokens = completion_tokens + + all_token_strs = [ + self.detokenize([token]).decode("utf-8", errors="ignore") + for token in all_tokens + ] + all_logprobs = [ + Llama.logits_to_logprobs(row.tolist()) for row in self._scores + ][token_offset:] + for token, token_str, logprobs_token in zip( + all_tokens, all_token_strs, all_logprobs + ): + text_offsets.append(text_offset) + text_offset += len(token_str) + tokens.append(token_str) + sorted_logprobs = list( + sorted( + zip(logprobs_token, range(len(logprobs_token))), reverse=True + ) + ) + token_logprobs.append(logprobs_token[int(token)]) + top_logprob: Optional[Dict[str, float]] = { + self.detokenize([i]).decode("utf-8", errors="ignore"): logprob + for logprob, i in sorted_logprobs[:logprobs] + } + top_logprob.update({token_str: logprobs_token[int(token)]}) + top_logprobs.append(top_logprob) + # Weird idosincracy of the OpenAI API where + # token_logprobs and top_logprobs are null for + # the first token. + if echo and len(all_tokens) > 0: + token_logprobs[0] = None + top_logprobs[0] = None + logprobs_or_none = { + "tokens": tokens, + "text_offset": text_offsets, + "token_logprobs": token_logprobs, + "top_logprobs": top_logprobs, + } + yield { "id": completion_id, "object": "text_completion", "created": created, - "model": self.model_path, + "model": model_name, "choices": [ { - "text": text, + "text": text_str, "index": 0, - "logprobs": None, + "logprobs": logprobs_or_none, "finish_reason": finish_reason, } ], @@ -453,10 +1220,19 @@ class Llama: top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, - stop: List[str] = [], + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -489,9 +1265,18 @@ class Llama: logprobs=logprobs, echo=echo, stop=stop, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks @@ -508,10 +1293,19 @@ class Llama: top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, - stop: List[str] = [], + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -544,9 +1338,18 @@ class Llama: logprobs=logprobs, echo=echo, stop=stop, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, ) def _convert_text_completion_to_chat( @@ -601,7 +1404,9 @@ class Llama: "index": 0, "delta": { "content": chunk["choices"][0]["text"], - }, + } + if chunk["choices"][0]["finish_reason"] is None + else {}, "finish_reason": chunk["choices"][0]["finish_reason"], } ], @@ -610,13 +1415,21 @@ class Llama: def create_chat_completion( self, messages: List[ChatCompletionMessage], - temperature: float = 0.8, + temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, stream: bool = False, - stop: List[str] = [], - max_tokens: int = 128, + stop: Optional[Union[str, List[str]]] = [], + max_tokens: int = 256, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. @@ -633,13 +1446,15 @@ class Llama: Returns: Generated chat completion or a stream of chat completion chunks. """ - instructions = """Complete the following chat conversation between the user and the assistant. System messages should be strictly followed as additional instructions.""" - chat_history = "\n".join( - f'{message["role"]} {message.get("user", "")}: {message["content"]}' + stop = ( + stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] + ) + chat_history = "".join( + f'### {"Human" if message["role"] == "user" else "Assistant"}:{message["content"]}' for message in messages ) - PROMPT = f" \n\n### Instructions:{instructions}\n\n### Inputs:{chat_history}\n\n### Response:\nassistant: " - PROMPT_STOP = ["###", "\nuser: ", "\nassistant: ", "\nsystem: "] + PROMPT = chat_history + "### Assistant:" + PROMPT_STOP = ["### Assistant:", "### Human:"] completion_or_chunks = self( prompt=PROMPT, stop=PROMPT_STOP + stop, @@ -649,6 +1464,14 @@ class Llama: stream=stream, max_tokens=max_tokens, repeat_penalty=repeat_penalty, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore @@ -658,6 +1481,9 @@ class Llama: return self._convert_text_completion_to_chat(completion) def __del__(self): + if self.model is not None: + llama_cpp.llama_free_model(self.model) + self.model = None if self.ctx is not None: llama_cpp.llama_free(self.ctx) self.ctx = None @@ -667,7 +1493,7 @@ class Llama: verbose=self.verbose, model_path=self.model_path, n_ctx=self.params.n_ctx, - n_parts=self.params.n_parts, + n_gpu_layers=self.params.n_gpu_layers, seed=self.params.seed, f16_kv=self.params.f16_kv, logits_all=self.params.logits_all, @@ -675,11 +1501,15 @@ class Llama: use_mmap=self.params.use_mmap, use_mlock=self.params.use_mlock, embedding=self.params.embedding, + low_vram=self.params.low_vram, last_n_tokens_size=self.last_n_tokens_size, - last_n_tokens_data=self.last_n_tokens_data, - tokens_consumed=self.tokens_consumed, n_batch=self.n_batch, n_threads=self.n_threads, + lora_base=self.lora_base, + lora_path=self.lora_path, + ### DEPRECATED ### + n_parts=self.n_parts, + ### DEPRECATED ### ) def __setstate__(self, state): @@ -687,6 +1517,7 @@ class Llama: model_path=state["model_path"], n_ctx=state["n_ctx"], n_parts=state["n_parts"], + n_gpu_layers=state["n_gpu_layers"], seed=state["seed"], f16_kv=state["f16_kv"], logits_all=state["logits_all"], @@ -694,21 +1525,121 @@ class Llama: use_mmap=state["use_mmap"], use_mlock=state["use_mlock"], embedding=state["embedding"], + low_vram=state["low_vram"], n_threads=state["n_threads"], n_batch=state["n_batch"], last_n_tokens_size=state["last_n_tokens_size"], + lora_base=state["lora_base"], + lora_path=state["lora_path"], verbose=state["verbose"], ) - self.last_n_tokens_data = state["last_n_tokens_data"] - self.tokens_consumed = state["tokens_consumed"] + def save_state(self) -> LlamaState: + assert self.ctx is not None + if self.verbose: + print("Llama.save_state: saving llama state", file=sys.stderr) + state_size = llama_cpp.llama_get_state_size(self.ctx) + if self.verbose: + print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr) + llama_state = (llama_cpp.c_uint8 * int(state_size))() + if self.verbose: + print("Llama.save_state: allocated state", file=sys.stderr) + n_bytes = llama_cpp.llama_copy_state_data(self.ctx, llama_state) + if self.verbose: + print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr) + if int(n_bytes) > int(state_size): + raise RuntimeError("Failed to copy llama state data") + llama_state_compact = (llama_cpp.c_uint8 * int(n_bytes))() + llama_cpp.ctypes.memmove(llama_state_compact, llama_state, int(n_bytes)) + if self.verbose: + print( + f"Llama.save_state: saving {n_bytes} bytes of llama state", + file=sys.stderr, + ) + return LlamaState( + scores=self.scores.copy(), + input_ids=self.input_ids.copy(), + n_tokens=self.n_tokens, + llama_state=bytes(llama_state_compact), + llama_state_size=n_bytes, + ) + + def load_state(self, state: LlamaState) -> None: + assert self.ctx is not None + self.scores = state.scores.copy() + self.input_ids = state.input_ids.copy() + self.n_tokens = state.n_tokens + state_size = state.llama_state_size + LLamaStateArrayType = llama_cpp.c_uint8 * state_size + llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state) + + if llama_cpp.llama_set_state_data(self.ctx, llama_state) != state_size: + raise RuntimeError("Failed to set llama state data") + + def n_ctx(self) -> int: + """Return the context window size.""" + assert self.ctx is not None + return llama_cpp.llama_n_ctx(self.ctx) + + def n_embd(self) -> int: + """Return the embedding size.""" + assert self.ctx is not None + return llama_cpp.llama_n_embd(self.ctx) + + def n_vocab(self) -> int: + """Return the vocabulary size.""" + assert self.ctx is not None + return llama_cpp.llama_n_vocab(self.ctx) + + def tokenizer(self) -> "LlamaTokenizer": + """Return the tokenizer for this model.""" + assert self.ctx is not None + return LlamaTokenizer(self) @staticmethod - def token_eos() -> llama_cpp.llama_token: + def token_eos() -> int: """Return the end-of-sequence token.""" return llama_cpp.llama_token_eos() @staticmethod - def token_bos() -> llama_cpp.llama_token: + def token_bos() -> int: """Return the beginning-of-sequence token.""" return llama_cpp.llama_token_bos() + + @staticmethod + def token_nl() -> int: + """Return the newline token.""" + return llama_cpp.llama_token_nl() + + @staticmethod + def logits_to_logprobs(logits: List[float]) -> List[float]: + exps = [math.exp(float(x)) for x in logits] + sum_exps = sum(exps) + return [math.log(x / sum_exps) for x in exps] + + @staticmethod + def longest_token_prefix(a: Sequence[int], b: Sequence[int]): + longest_prefix = 0 + for _a, _b in zip(a, b): + if _a == _b: + longest_prefix += 1 + else: + break + return longest_prefix + + +class LlamaTokenizer: + def __init__(self, llama: Llama): + self.llama = llama + + def encode(self, text: str, add_bos: bool = True) -> List[int]: + return self.llama.tokenize( + text.encode("utf-8", errors="ignore"), add_bos=add_bos + ) + + def decode(self, tokens: List[int]) -> str: + return self.llama.detokenize(tokens).decode("utf-8", errors="ignore") + + @classmethod + def from_ggml_file(cls, path: str) -> "LlamaTokenizer": + return cls(Llama(model_path=path, vocab_only=True)) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 8a5869c..17c6319 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1,49 +1,77 @@ import sys import os import ctypes -from ctypes import c_int, c_float, c_char_p, c_void_p, c_bool, POINTER, Structure, Array, c_uint8, c_size_t +from ctypes import ( + c_double, + c_int, + c_float, + c_char_p, + c_int32, + c_uint32, + c_void_p, + c_bool, + POINTER, + _Pointer, # type: ignore + Structure, + Array, + c_uint8, + c_size_t, +) import pathlib +from typing import List, Union + # Load the library -def _load_shared_library(lib_base_name): - # Determine the file extension based on the platform - if sys.platform.startswith("linux"): - lib_ext = ".so" - elif sys.platform == "darwin": - lib_ext = ".so" - elif sys.platform == "win32": - lib_ext = ".dll" - else: - raise RuntimeError("Unsupported platform") - +def _load_shared_library(lib_base_name: str): # Construct the paths to the possible shared library names _base_path = pathlib.Path(__file__).parent.resolve() # Searching for the library in the current directory under the name "libllama" (default name # for llamacpp) and "llama" (default name for this repo) - _lib_paths = [ - _base_path / f"lib{lib_base_name}{lib_ext}", - _base_path / f"{lib_base_name}{lib_ext}" - ] + _lib_paths: List[pathlib.Path] = [] + # Determine the file extension based on the platform + if sys.platform.startswith("linux"): + _lib_paths += [ + _base_path / f"lib{lib_base_name}.so", + ] + elif sys.platform == "darwin": + _lib_paths += [ + _base_path / f"lib{lib_base_name}.so", + _base_path / f"lib{lib_base_name}.dylib", + ] + elif sys.platform == "win32": + _lib_paths += [ + _base_path / f"{lib_base_name}.dll", + ] + else: + raise RuntimeError("Unsupported platform") - if ("LLAMA_CPP_LIB" in os.environ): + if "LLAMA_CPP_LIB" in os.environ: lib_base_name = os.environ["LLAMA_CPP_LIB"] _lib = pathlib.Path(lib_base_name) _base_path = _lib.parent.resolve() _lib_paths = [_lib.resolve()] + cdll_args = dict() # type: ignore # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) + if "CUDA_PATH" in os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) + cdll_args["winmode"] = 0 # Try to load the shared library, handling potential errors for _lib_path in _lib_paths: if _lib_path.exists(): try: - return ctypes.CDLL(str(_lib_path)) + return ctypes.CDLL(str(_lib_path), **cdll_args) except Exception as e: raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") - raise FileNotFoundError(f"Shared library with base name '{lib_base_name}' not found") + raise FileNotFoundError( + f"Shared library with base name '{lib_base_name}' not found" + ) + # Specify the base name of the shared library to load _lib_base_name = "llama" @@ -51,54 +79,214 @@ _lib_base_name = "llama" # Load the library _lib = _load_shared_library(_lib_base_name) -# C types +# Misc +c_float_p = POINTER(c_float) +c_uint8_p = POINTER(c_uint8) +c_size_t_p = POINTER(c_size_t) + +# llama.h bindings + +GGML_USE_CUBLAS = hasattr(_lib, "ggml_init_cublas") +GGML_CUDA_MAX_DEVICES = ctypes.c_int(16) +LLAMA_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else ctypes.c_int(1) + +# #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' +LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74) +# #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' +LLAMA_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61) +# #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' +LLAMA_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66) +# #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' +LLAMA_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C) +# #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' +LLAMA_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E) + +# #define LLAMA_FILE_VERSION 3 +LLAMA_FILE_VERSION = c_int(3) +LLAMA_FILE_MAGIC = LLAMA_FILE_MAGIC_GGJT +LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML +LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN +LLAMA_SESSION_VERSION = c_int(1) + +# #define LLAMA_DEFAULT_SEED 0xFFFFFFFF +LLAMA_DEFAULT_SEED = c_int(0xFFFFFFFF) + +# struct llama_model; +llama_model_p = c_void_p + +# struct llama_context; llama_context_p = c_void_p +# typedef int llama_token; llama_token = c_int llama_token_p = POINTER(llama_token) +# typedef struct llama_token_data { +# llama_token id; // token id +# float logit; // log-odds of the token +# float p; // probability of the token +# } llama_token_data; class llama_token_data(Structure): _fields_ = [ - ("id", llama_token), # token id - ("p", c_float), # probability of the token - ("plog", c_float), # log probability of the token + ("id", llama_token), + ("logit", c_float), + ("p", c_float), ] llama_token_data_p = POINTER(llama_token_data) + +# typedef struct llama_token_data_array { +# llama_token_data * data; +# size_t size; +# bool sorted; +# } llama_token_data_array; +class llama_token_data_array(Structure): + _fields_ = [ + ("data", llama_token_data_p), + ("size", c_size_t), + ("sorted", c_bool), + ] + + +llama_token_data_array_p = POINTER(llama_token_data_array) + +# typedef void (*llama_progress_callback)(float progress, void *ctx); llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) +# struct llama_context_params { +# uint32_t seed; // RNG seed, -1 for random +# int32_t n_ctx; // text context +# int32_t n_batch; // prompt processing batch size +# int32_t n_gpu_layers; // number of layers to store in VRAM +# int32_t main_gpu; // the GPU that is used for scratch and small tensors +# float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs +# // called with a progress value between 0 and 1, pass NULL to disable +# llama_progress_callback progress_callback; +# // context pointer passed to the progress callback +# void * progress_callback_user_data; + + +# // Keep the booleans together to avoid misalignment during copy-by-value. +# bool low_vram; // if true, reduce VRAM usage at the cost of performance +# bool f16_kv; // use fp16 for KV cache +# bool logits_all; // the llama_eval() call computes all logits, not just the last one +# bool vocab_only; // only load the vocabulary, no weights +# bool use_mmap; // use mmap if possible +# bool use_mlock; // force system to keep model in RAM +# bool embedding; // embedding mode only +# }; class llama_context_params(Structure): _fields_ = [ - ("n_ctx", c_int), # text context - ("n_parts", c_int), # -1 for default - ("seed", c_int), # RNG seed, 0 for random - ("f16_kv", c_bool), # use fp16 for KV cache - ( - "logits_all", - c_bool, - ), # the llama_eval() call computes all logits, not just the last one - ("vocab_only", c_bool), # only load the vocabulary, no weights - ("use_mmap", c_bool), # use mmap if possible - ("use_mlock", c_bool), # force system to keep model in RAM - ("embedding", c_bool), # embedding mode only - # called with a progress value between 0 and 1, pass NULL to disable + ("seed", c_uint32), + ("n_ctx", c_int32), + ("n_batch", c_int32), + ("n_gpu_layers", c_int32), + ("main_gpu", c_int32), + ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), ("progress_callback", llama_progress_callback), - # context pointer passed to the progress callback ("progress_callback_user_data", c_void_p), + ("low_vram", c_bool), + ("f16_kv", c_bool), + ("logits_all", c_bool), + ("vocab_only", c_bool), + ("use_mmap", c_bool), + ("use_mlock", c_bool), + ("embedding", c_bool), ] llama_context_params_p = POINTER(llama_context_params) - -# Functions +# enum llama_ftype { +# LLAMA_FTYPE_ALL_F32 = 0, +# LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 +# // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed +# // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed +# LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors +# }; +LLAMA_FTYPE_ALL_F32 = c_int(0) +LLAMA_FTYPE_MOSTLY_F16 = c_int(1) +LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) +LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4) +LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) +LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) +LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) +LLAMA_FTYPE_MOSTLY_Q2_K = c_int(10) +LLAMA_FTYPE_MOSTLY_Q3_K_S = c_int(11) +LLAMA_FTYPE_MOSTLY_Q3_K_M = c_int(12) +LLAMA_FTYPE_MOSTLY_Q3_K_L = c_int(13) +LLAMA_FTYPE_MOSTLY_Q4_K_S = c_int(14) +LLAMA_FTYPE_MOSTLY_Q4_K_M = c_int(15) +LLAMA_FTYPE_MOSTLY_Q5_K_S = c_int(16) +LLAMA_FTYPE_MOSTLY_Q5_K_M = c_int(17) +LLAMA_FTYPE_MOSTLY_Q6_K = c_int(18) +# // model quantization parameters +# typedef struct llama_model_quantize_params { +# int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() +# enum llama_ftype ftype; // quantize to this llama_ftype +# bool allow_requantize; // allow quantizing non-f32/f16 tensors +# bool quantize_output_tensor; // quantize output.weight +# } llama_model_quantize_params; +class llama_model_quantize_params(Structure): + _fields_ = [ + ("nthread", c_int), + ("ftype", c_int), + ("allow_requantize", c_bool), + ("quantize_output_tensor", c_bool), + ] + + +# // performance timing information +# struct llama_timings { +# double t_start_ms; +# double t_end_ms; +# double t_load_ms; +# double t_sample_ms; +# double t_p_eval_ms; +# double t_eval_ms; + + +# int32_t n_sample; +# int32_t n_p_eval; +# int32_t n_eval; +# }; +class llama_timings(Structure): + _fields_ = [ + ("t_start_ms", c_double), + ("t_end_ms", c_double), + ("t_load_ms", c_double), + ("t_sample_ms", c_double), + ("t_p_eval_ms", c_double), + ("t_eval_ms", c_double), + ("n_sample", c_int32), + ("n_p_eval", c_int32), + ("n_eval", c_int32), + ] + + +# LLAMA_API struct llama_context_params llama_context_default_params(); def llama_context_default_params() -> llama_context_params: return _lib.llama_context_default_params() @@ -106,21 +294,97 @@ def llama_context_default_params() -> llama_context_params: _lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params -def llama_mmap_supported() -> c_bool: + +# LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(); +def llama_model_quantize_default_params() -> llama_model_quantize_params: + return _lib.llama_model_quantize_default_params() + + +_lib.llama_model_quantize_default_params.argtypes = [] +_lib.llama_model_quantize_default_params.restype = llama_model_quantize_params + + +# LLAMA_API bool llama_mmap_supported(); +def llama_mmap_supported() -> bool: return _lib.llama_mmap_supported() + _lib.llama_mmap_supported.argtypes = [] _lib.llama_mmap_supported.restype = c_bool -def llama_mlock_supported() -> c_bool: + +# LLAMA_API bool llama_mlock_supported(); +def llama_mlock_supported() -> bool: return _lib.llama_mlock_supported() + _lib.llama_mlock_supported.argtypes = [] _lib.llama_mlock_supported.restype = c_bool -# Various functions for loading a ggml llama model. -# Allocate (almost) all memory needed for the model. -# Return NULL on failure + +# // TODO: not great API - very likely to change +# // Initialize the llama + ggml backend +# // If numa is true, use NUMA optimizations +# // Call once at the start of the program +# LLAMA_API void llama_init_backend(bool numa); +def llama_init_backend(numa: c_bool): + return _lib.llama_init_backend(numa) + + +_lib.llama_init_backend.argtypes = [c_bool] +_lib.llama_init_backend.restype = None + + +# LLAMA_API struct llama_model * llama_load_model_from_file( +# const char * path_model, +# struct llama_context_params params); +def llama_load_model_from_file( + path_model: bytes, params: llama_context_params +) -> llama_model_p: + return _lib.llama_load_model_from_file(path_model, params) + + +_lib.llama_load_model_from_file.argtypes = [c_char_p, llama_context_params] +_lib.llama_load_model_from_file.restype = llama_model_p + + +# LLAMA_API void llama_free_model(struct llama_model * model); +def llama_free_model(model: llama_model_p): + return _lib.llama_free_model(model) + + +_lib.llama_free_model.argtypes = [llama_model_p] +_lib.llama_free_model.restype = None + + +# LLAMA_API struct llama_context * llama_new_context_with_model( +# struct llama_model * model, +# struct llama_context_params params); +def llama_new_context_with_model( + model: llama_model_p, params: llama_context_params +) -> llama_context_p: + return _lib.llama_new_context_with_model(model, params) + + +_lib.llama_new_context_with_model.argtypes = [llama_model_p, llama_context_params] +_lib.llama_new_context_with_model.restype = llama_context_p + + +# LLAMA_API int64_t llama_time_us(); +def llama_time_us() -> int: + return _lib.llama_time_us() + + +_lib.llama_time_us.argtypes = [] +_lib.llama_time_us.restype = ctypes.c_int64 + + +# // Various functions for loading a ggml llama model. +# // Allocate (almost) all memory needed for the model. +# // Return NULL on failure +# LLAMA_API struct llama_context * llama_init_from_file( +# const char * path_model, +# struct llama_context_params params); def llama_init_from_file( path_model: bytes, params: llama_context_params ) -> llama_context_p: @@ -132,67 +396,203 @@ _lib.llama_init_from_file.restype = llama_context_p # Frees all allocated memory +# LLAMA_API void llama_free(struct llama_context * ctx); def llama_free(ctx: llama_context_p): - _lib.llama_free(ctx) + return _lib.llama_free(ctx) _lib.llama_free.argtypes = [llama_context_p] _lib.llama_free.restype = None -# TODO: not great API - very likely to change -# Returns 0 on success +# // Returns 0 on success +# LLAMA_API int llama_model_quantize( +# const char * fname_inp, +# const char * fname_out, +# const llama_model_quantize_params * params); def llama_model_quantize( - fname_inp: bytes, fname_out: bytes, itype: c_int -) -> c_int: - return _lib.llama_model_quantize(fname_inp, fname_out, itype) + fname_inp: bytes, + fname_out: bytes, + params, # type: POINTER(llama_model_quantize_params) # type: ignore +) -> int: + return _lib.llama_model_quantize(fname_inp, fname_out, params) -_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int] +_lib.llama_model_quantize.argtypes = [ + c_char_p, + c_char_p, + POINTER(llama_model_quantize_params), +] _lib.llama_model_quantize.restype = c_int -# Returns the KV cache that will contain the context for the -# ongoing prediction with the model. -def llama_get_kv_cache(ctx: llama_context_p): - return _lib.llama_get_kv_cache(ctx) -_lib.llama_get_kv_cache.argtypes = [llama_context_p] -_lib.llama_get_kv_cache.restype = POINTER(c_uint8) +# Apply a LoRA adapter to a loaded model +# path_base_model is the path to a higher quality model to use as a base for +# the layers modified by the adapter. Can be NULL to use the current loaded model. +# The model needs to be reloaded before applying a new adapter, otherwise the adapter +# will be applied on top of the previous one +# Returns 0 on success +# LLAMA_API int llama_apply_lora_from_file( +# struct llama_context * ctx, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); +def llama_apply_lora_from_file( + ctx: llama_context_p, + path_lora: c_char_p, + path_base_model: c_char_p, + n_threads: c_int, +) -> int: + return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) -# Returns the size of the KV cache -def llama_get_kv_cache_size(ctx: llama_context_p) -> c_size_t: - return _lib.llama_get_kv_cache_size(ctx) -_lib.llama_get_kv_cache_size.argtypes = [llama_context_p] -_lib.llama_get_kv_cache_size.restype = c_size_t +_lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, c_int] +_lib.llama_apply_lora_from_file.restype = c_int + + +# LLAMA_API int llama_model_apply_lora_from_file( +# const struct llama_model * model, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); +def llama_model_apply_lora_from_file( + model: llama_model_p, + path_lora: Union[c_char_p, bytes], + path_base_model: Union[c_char_p, bytes], + n_threads: c_int, +) -> int: + return _lib.llama_model_apply_lora_from_file( + model, path_lora, path_base_model, n_threads + ) + + +_lib.llama_model_apply_lora_from_file.argtypes = [ + llama_model_p, + c_char_p, + c_char_p, + c_int, +] +_lib.llama_model_apply_lora_from_file.restype = c_int + # Returns the number of tokens in the KV cache -def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: +# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); +def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: return _lib.llama_get_kv_cache_token_count(ctx) + _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] _lib.llama_get_kv_cache_token_count.restype = c_int -# Sets the KV cache containing the current context for the model -def llama_set_kv_cache(ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int): - return _lib.llama_set_kv_cache(ctx, kv_cache, n_size, n_token_count) +# Sets the current rng seed. +# LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); +def llama_set_rng_seed(ctx: llama_context_p, seed: c_uint32): + return _lib.llama_set_rng_seed(ctx, seed) -_lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, c_int] -_lib.llama_set_kv_cache.restype = None + +_lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] +_lib.llama_set_rng_seed.restype = None + + +# Returns the maximum size in bytes of the state (rng, logits, embedding +# and kv_cache) - will often be smaller after compacting tokens +# LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); +def llama_get_state_size(ctx: llama_context_p) -> int: + return _lib.llama_get_state_size(ctx) + + +_lib.llama_get_state_size.argtypes = [llama_context_p] +_lib.llama_get_state_size.restype = c_size_t + + +# Copies the state to the specified destination address. +# Destination needs to have allocated enough memory. +# Returns the number of bytes copied +# LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst); +def llama_copy_state_data( + ctx: llama_context_p, dst # type: Array[c_uint8] +) -> int: + return _lib.llama_copy_state_data(ctx, dst) + + +_lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] +_lib.llama_copy_state_data.restype = c_size_t + + +# Set the state reading from the specified address +# Returns the number of bytes read +# LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src); +def llama_set_state_data( + ctx: llama_context_p, src # type: Array[c_uint8] +) -> int: + return _lib.llama_set_state_data(ctx, src) + + +_lib.llama_set_state_data.argtypes = [llama_context_p, c_uint8_p] +_lib.llama_set_state_data.restype = c_size_t + + +# Save/load session file +# LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); +def llama_load_session_file( + ctx: llama_context_p, + path_session: bytes, + tokens_out, # type: Array[llama_token] + n_token_capacity: c_size_t, + n_token_count_out, # type: _Pointer[c_size_t] +) -> int: + return _lib.llama_load_session_file( + ctx, path_session, tokens_out, n_token_capacity, n_token_count_out + ) + + +_lib.llama_load_session_file.argtypes = [ + llama_context_p, + c_char_p, + llama_token_p, + c_size_t, + c_size_t_p, +] +_lib.llama_load_session_file.restype = c_size_t + + +# LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); +def llama_save_session_file( + ctx: llama_context_p, + path_session: bytes, + tokens, # type: Array[llama_token] + n_token_count: c_size_t, +) -> int: + return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) + + +_lib.llama_save_session_file.argtypes = [ + llama_context_p, + c_char_p, + llama_token_p, + c_size_t, +] +_lib.llama_save_session_file.restype = c_size_t # Run the llama inference to obtain the logits and probabilities for the next token. # tokens + n_tokens is the provided batch of new tokens to process # n_past is the number of tokens to use from previous eval calls # Returns 0 on success +# LLAMA_API int llama_eval( +# struct llama_context * ctx, +# const llama_token * tokens, +# int n_tokens, +# int n_past, +# int n_threads); def llama_eval( ctx: llama_context_p, tokens, # type: Array[llama_token] n_tokens: c_int, n_past: c_int, n_threads: c_int, -) -> c_int: +) -> int: return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) @@ -200,18 +600,45 @@ _lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] _lib.llama_eval.restype = c_int +# // Same as llama_eval, but use float matrix input directly. +# LLAMA_API int llama_eval_embd( +# struct llama_context * ctx, +# const float * embd, +# int n_tokens, +# int n_past, +# int n_threads); +def llama_eval_embd( + ctx: llama_context_p, + embd, # type: Array[c_float] + n_tokens: c_int, + n_past: c_int, + n_threads: c_int, +) -> int: + return _lib.llama_eval_embd(ctx, embd, n_tokens, n_past, n_threads) + + +_lib.llama_eval_embd.argtypes = [llama_context_p, c_float_p, c_int, c_int, c_int] +_lib.llama_eval_embd.restype = c_int + + # Convert the provided text into tokens. # The tokens pointer must be large enough to hold the resulting tokens. # Returns the number of tokens on success, no more than n_max_tokens # Returns a negative number on failure - the number of tokens that would have been returned # TODO: not sure if correct +# LLAMA_API int llama_tokenize( +# struct llama_context * ctx, +# const char * text, +# llama_token * tokens, +# int n_max_tokens, +# bool add_bos); def llama_tokenize( ctx: llama_context_p, text: bytes, tokens, # type: Array[llama_token] n_max_tokens: c_int, add_bos: c_bool, -) -> c_int: +) -> int: return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos) @@ -219,7 +646,8 @@ _lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, _lib.llama_tokenize.restype = c_int -def llama_n_vocab(ctx: llama_context_p) -> c_int: +# LLAMA_API int llama_n_vocab(const struct llama_context * ctx); +def llama_n_vocab(ctx: llama_context_p) -> int: return _lib.llama_n_vocab(ctx) @@ -227,7 +655,8 @@ _lib.llama_n_vocab.argtypes = [llama_context_p] _lib.llama_n_vocab.restype = c_int -def llama_n_ctx(ctx: llama_context_p) -> c_int: +# LLAMA_API int llama_n_ctx (const struct llama_context * ctx); +def llama_n_ctx(ctx: llama_context_p) -> int: return _lib.llama_n_ctx(ctx) @@ -235,7 +664,8 @@ _lib.llama_n_ctx.argtypes = [llama_context_p] _lib.llama_n_ctx.restype = c_int -def llama_n_embd(ctx: llama_context_p) -> c_int: +# LLAMA_API int llama_n_embd (const struct llama_context * ctx); +def llama_n_embd(ctx: llama_context_p) -> int: return _lib.llama_n_embd(ctx) @@ -243,30 +673,57 @@ _lib.llama_n_embd.argtypes = [llama_context_p] _lib.llama_n_embd.restype = c_int +# // Get the vocabulary as output parameters. +# // Returns number of results. +# LLAMA_API int llama_get_vocab( +# const struct llama_context * ctx, +# const char * * strings, +# float * scores, +# int capacity); +def llama_get_vocab( + ctx: llama_context_p, + strings, # type: Array[c_char_p] # type: ignore + scores, # type: Array[c_float] # type: ignore + capacity: c_int, +) -> int: + return _lib.llama_get_vocab(ctx, strings, scores, capacity) + + +_lib.llama_get_vocab.argtypes = [llama_context_p, c_char_p, c_float, c_int] +_lib.llama_get_vocab.restype = c_int + + # Token logits obtained from the last call to llama_eval() # The logits for the last token are stored in the last row # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab -def llama_get_logits(ctx: llama_context_p): +# LLAMA_API float * llama_get_logits(struct llama_context * ctx); +def llama_get_logits( + ctx: llama_context_p, +): # type: (...) -> Array[float] # type: ignore return _lib.llama_get_logits(ctx) _lib.llama_get_logits.argtypes = [llama_context_p] -_lib.llama_get_logits.restype = POINTER(c_float) +_lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input # shape: [n_embd] (1-dimensional) -def llama_get_embeddings(ctx: llama_context_p): +# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); +def llama_get_embeddings( + ctx: llama_context_p, +): # type: (...) -> Array[float] # type: ignore return _lib.llama_get_embeddings(ctx) _lib.llama_get_embeddings.argtypes = [llama_context_p] -_lib.llama_get_embeddings.restype = POINTER(c_float) +_lib.llama_get_embeddings.restype = c_float_p # Token Id -> String. Uses the vocabulary in the provided context +# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: return _lib.llama_token_to_str(ctx, token) @@ -277,7 +734,8 @@ _lib.llama_token_to_str.restype = c_char_p # Special tokens -def llama_token_bos() -> llama_token: +# LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence +def llama_token_bos() -> int: return _lib.llama_token_bos() @@ -285,7 +743,8 @@ _lib.llama_token_bos.argtypes = [] _lib.llama_token_bos.restype = llama_token -def llama_token_eos() -> llama_token: +# LLAMA_API llama_token llama_token_eos(); // end-of-sentence +def llama_token_eos() -> int: return _lib.llama_token_eos() @@ -293,36 +752,285 @@ _lib.llama_token_eos.argtypes = [] _lib.llama_token_eos.restype = llama_token -# TODO: improve the last_n_tokens interface ? -def llama_sample_top_p_top_k( +# LLAMA_API llama_token llama_token_nl(); // next-line +def llama_token_nl() -> int: + return _lib.llama_token_nl() + + +_lib.llama_token_nl.argtypes = [] +_lib.llama_token_nl.restype = llama_token + + +# Sampling functions + + +# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. +# LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); +def llama_sample_repetition_penalty( ctx: llama_context_p, - last_n_tokens_data, # type: Array[llama_token] - last_n_tokens_size: c_int, - top_k: c_int, - top_p: c_float, - temp: c_float, - repeat_penalty: c_float, -) -> llama_token: - return _lib.llama_sample_top_p_top_k( - ctx, last_n_tokens_data, last_n_tokens_size, top_k, top_p, temp, repeat_penalty + candidates, # type: _Pointer[llama_token_data_array] + last_tokens_data, # type: Array[llama_token] + last_tokens_size: c_int, + penalty: c_float, +): + return _lib.llama_sample_repetition_penalty( + ctx, candidates, last_tokens_data, last_tokens_size, penalty ) -_lib.llama_sample_top_p_top_k.argtypes = [ +_lib.llama_sample_repetition_penalty.argtypes = [ llama_context_p, + llama_token_data_array_p, llama_token_p, c_int, - c_int, c_float, +] +_lib.llama_sample_repetition_penalty.restype = None + + +# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. +# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); +def llama_sample_frequency_and_presence_penalties( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + last_tokens_data, # type: Array[llama_token] + last_tokens_size: c_int, + alpha_frequency: c_float, + alpha_presence: c_float, +): + return _lib.llama_sample_frequency_and_presence_penalties( + ctx, + candidates, + last_tokens_data, + last_tokens_size, + alpha_frequency, + alpha_presence, + ) + + +_lib.llama_sample_frequency_and_presence_penalties.argtypes = [ + llama_context_p, + llama_token_data_array_p, + llama_token_p, + c_int, c_float, c_float, ] -_lib.llama_sample_top_p_top_k.restype = llama_token +_lib.llama_sample_frequency_and_presence_penalties.restype = None + + +# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. +# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); +def llama_sample_softmax( + ctx: llama_context_p, candidates # type: _Pointer[llama_token_data] +): + return _lib.llama_sample_softmax(ctx, candidates) + + +_lib.llama_sample_softmax.argtypes = [ + llama_context_p, + llama_token_data_array_p, +] +_lib.llama_sample_softmax.restype = None + + +# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep); +def llama_sample_top_k( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + k: c_int, + min_keep: c_size_t, +): + return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) + + +_lib.llama_sample_top_k.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_int, + c_size_t, +] +_lib.llama_sample_top_k.restype = None + + +# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); +def llama_sample_top_p( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + p: c_float, + min_keep: c_size_t, +): + return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) + + +_lib.llama_sample_top_p.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_size_t, +] +_lib.llama_sample_top_p.restype = None + + +# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. +# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep); +def llama_sample_tail_free( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + z: c_float, + min_keep: c_size_t, +): + return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) + + +_lib.llama_sample_tail_free.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_size_t, +] +_lib.llama_sample_tail_free.restype = None + + +# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. +# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); +def llama_sample_typical( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + p: c_float, + min_keep: c_size_t, +): + return _lib.llama_sample_typical(ctx, candidates, p, min_keep) + + +_lib.llama_sample_typical.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_size_t, +] +_lib.llama_sample_typical.restype = None + + +# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); +def llama_sample_temperature( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + temp: c_float, +): + return _lib.llama_sample_temperature(ctx, candidates, temp) + + +_lib.llama_sample_temperature.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, +] +_lib.llama_sample_temperature.restype = None + + +# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. +# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); +def llama_sample_token_mirostat( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + tau: c_float, + eta: c_float, + m: c_int, + mu, # type: _Pointer[c_float] +) -> int: + return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) + + +_lib.llama_sample_token_mirostat.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_float, + c_int, + c_float_p, +] +_lib.llama_sample_token_mirostat.restype = llama_token + + +# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. +# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); +def llama_sample_token_mirostat_v2( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + tau: c_float, + eta: c_float, + mu, # type: _Pointer[c_float] +) -> int: + return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) + + +_lib.llama_sample_token_mirostat_v2.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_float, + c_float_p, +] +_lib.llama_sample_token_mirostat_v2.restype = llama_token + + +# @details Selects the token with the highest probability. +# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); +def llama_sample_token_greedy( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] +) -> int: + return _lib.llama_sample_token_greedy(ctx, candidates) + + +_lib.llama_sample_token_greedy.argtypes = [ + llama_context_p, + llama_token_data_array_p, +] +_lib.llama_sample_token_greedy.restype = llama_token + + +# @details Randomly selects a token from the candidates based on their probabilities. +# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); +def llama_sample_token( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] +) -> int: + return _lib.llama_sample_token(ctx, candidates) + + +_lib.llama_sample_token.argtypes = [ + llama_context_p, + llama_token_data_array_p, +] +_lib.llama_sample_token.restype = llama_token # Performance information +# LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); +def llama_get_timings(ctx: llama_context_p) -> llama_timings: + return _lib.llama_get_timings(ctx) + + +_lib.llama_get_timings.argtypes = [llama_context_p] +_lib.llama_get_timings.restype = llama_timings + + +# LLAMA_API void llama_print_timings(struct llama_context * ctx); def llama_print_timings(ctx: llama_context_p): _lib.llama_print_timings(ctx) @@ -331,6 +1039,7 @@ _lib.llama_print_timings.argtypes = [llama_context_p] _lib.llama_print_timings.restype = None +# LLAMA_API void llama_reset_timings(struct llama_context * ctx); def llama_reset_timings(ctx: llama_context_p): _lib.llama_reset_timings(ctx) @@ -340,9 +1049,19 @@ _lib.llama_reset_timings.restype = None # Print system information +# LLAMA_API const char * llama_print_system_info(void); def llama_print_system_info() -> bytes: return _lib.llama_print_system_info() _lib.llama_print_system_info.argtypes = [] _lib.llama_print_system_info.restype = c_char_p + +################################################################################################### + + +_llama_initialized = False + +if not _llama_initialized: + llama_init_backend(c_bool(False)) + _llama_initialized = True diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index b62ff1b..6ba8023 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Dict, Union +from typing import Any, List, Optional, Dict, Union from typing_extensions import TypedDict, NotRequired, Literal @@ -22,9 +22,9 @@ class Embedding(TypedDict): class CompletionLogprobs(TypedDict): text_offset: List[int] - token_logprobs: List[float] + token_logprobs: List[Optional[float]] tokens: List[str] - top_logprobs: List[Dict[str, float]] + top_logprobs: List[Optional[Dict[str, float]]] class CompletionChoice(TypedDict): @@ -58,7 +58,7 @@ class Completion(TypedDict): class ChatCompletionMessage(TypedDict): - role: Union[Literal["assistant"], Literal["user"], Literal["system"]] + role: Literal["assistant", "user", "system"] content: str user: NotRequired[str] @@ -77,6 +77,8 @@ class ChatCompletion(TypedDict): choices: List[ChatCompletionChoice] usage: CompletionUsage +class ChatCompletionChunkDeltaEmpty(TypedDict): + pass class ChatCompletionChunkDelta(TypedDict): role: NotRequired[Literal["assistant"]] @@ -85,7 +87,7 @@ class ChatCompletionChunkDelta(TypedDict): class ChatCompletionChunkChoice(TypedDict): index: int - delta: ChatCompletionChunkDelta + delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty] finish_reason: Optional[str] diff --git a/llama_cpp/server/__init__.py b/llama_cpp/server/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 44ee1f0..2110db3 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -3,267 +3,48 @@ To run this example: ```bash -pip install fastapi uvicorn sse-starlette +pip install fastapi uvicorn sse-starlette pydantic-settings export MODEL=../models/7B/... -uvicorn fastapi_server_chat:app --reload +``` + +Then run: +``` +uvicorn llama_cpp.server.app:app --reload +``` + +or + +``` +python3 -m llama_cpp.server ``` Then visit http://localhost:8000/docs to see the interactive API docs. """ import os -import json -from typing import List, Optional, Literal, Union, Iterator, Dict -from typing_extensions import TypedDict +import argparse -import llama_cpp - -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict -from sse_starlette.sse import EventSourceResponse - - -class Settings(BaseSettings): - model: str - n_ctx: int = 2048 - n_batch: int = 8 - n_threads: int = ((os.cpu_count() or 2) // 2) or 1 - f16_kv: bool = True - use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... - embedding: bool = True - last_n_tokens_size: int = 64 - - -app = FastAPI( - title="🦙 llama.cpp Python API", - version="0.0.1", -) -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) -settings = Settings() -llama = llama_cpp.Llama( - settings.model, - f16_kv=settings.f16_kv, - use_mlock=settings.use_mlock, - embedding=settings.embedding, - n_threads=settings.n_threads, - n_batch=settings.n_batch, - n_ctx=settings.n_ctx, - last_n_tokens_size=settings.last_n_tokens_size, -) - - -class CreateCompletionRequest(BaseModel): - prompt: Union[str, List[str]] - suffix: Optional[str] = Field(None) - max_tokens: int = 16 - temperature: float = 0.8 - top_p: float = 0.95 - echo: bool = False - stop: List[str] = [] - stream: bool = False - - # ignored or currently unsupported - model: Optional[str] = Field(None) - n: Optional[int] = 1 - logprobs: Optional[int] = Field(None) - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - best_of: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) - - # llama.cpp specific parameters - top_k: int = 40 - repeat_penalty: float = 1.1 - - class Config: - schema_extra = { - "example": { - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": ["\n", "###"], - } - } - - -CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) - - -@app.post( - "/v1/completions", - response_model=CreateCompletionResponse, -) -def create_completion(request: CreateCompletionRequest): - if isinstance(request.prompt, list): - request.prompt = "".join(request.prompt) - - completion_or_chunks = llama( - **request.dict( - exclude={ - "model", - "n", - "logprobs", - "frequency_penalty", - "presence_penalty", - "best_of", - "logit_bias", - "user", - } - ) - ) - if request.stream: - chunks: Iterator[llama_cpp.CompletionChunk] = completion_or_chunks # type: ignore - return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks) - completion: llama_cpp.Completion = completion_or_chunks # type: ignore - return completion - - -class CreateEmbeddingRequest(BaseModel): - model: Optional[str] - input: str - user: Optional[str] - - class Config: - schema_extra = { - "example": { - "input": "The food was delicious and the waiter...", - } - } - - -CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) - - -@app.post( - "/v1/embeddings", - response_model=CreateEmbeddingResponse, -) -def create_embedding(request: CreateEmbeddingRequest): - return llama.create_embedding(**request.dict(exclude={"model", "user"})) - - -class ChatCompletionRequestMessage(BaseModel): - role: Union[Literal["system"], Literal["user"], Literal["assistant"]] - content: str - user: Optional[str] = None - - -class CreateChatCompletionRequest(BaseModel): - model: Optional[str] - messages: List[ChatCompletionRequestMessage] - temperature: float = 0.8 - top_p: float = 0.95 - stream: bool = False - stop: List[str] = [] - max_tokens: int = 128 - - # ignored or currently unsupported - model: Optional[str] = Field(None) - n: Optional[int] = 1 - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) - - # llama.cpp specific parameters - repeat_penalty: float = 1.1 - - class Config: - schema_extra = { - "example": { - "messages": [ - ChatCompletionRequestMessage( - role="system", content="You are a helpful assistant." - ), - ChatCompletionRequestMessage( - role="user", content="What is the capital of France?" - ), - ] - } - } - - -CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) - - -@app.post( - "/v1/chat/completions", - response_model=CreateChatCompletionResponse, -) -async def create_chat_completion( - request: CreateChatCompletionRequest, -) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: - completion_or_chunks = llama.create_chat_completion( - **request.dict( - exclude={ - "model", - "n", - "presence_penalty", - "frequency_penalty", - "logit_bias", - "user", - } - ), - ) - - if request.stream: - - async def server_sent_events( - chat_chunks: Iterator[llama_cpp.ChatCompletionChunk], - ): - for chat_chunk in chat_chunks: - yield dict(data=json.dumps(chat_chunk)) - yield dict(data="[DONE]") - - chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore - - return EventSourceResponse( - server_sent_events(chunks), - ) - completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore - return completion - - -class ModelData(TypedDict): - id: str - object: Literal["model"] - owned_by: str - permissions: List[str] - - -class ModelList(TypedDict): - object: Literal["list"] - data: List[ModelData] - - -GetModelResponse = create_model_from_typeddict(ModelList) - - -@app.get("/v1/models", response_model=GetModelResponse) -def get_models() -> ModelList: - return { - "object": "list", - "data": [ - { - "id": llama.model_path, - "object": "model", - "owned_by": "me", - "permissions": [], - } - ], - } +import uvicorn +from llama_cpp.server.app import create_app, Settings if __name__ == "__main__": - import os - import uvicorn + parser = argparse.ArgumentParser() + for name, field in Settings.__model_fields__.items(): + description = field.field_info.description + if field.default is not None and description is not None: + description += f" (default: {field.default})" + parser.add_argument( + f"--{name}", + dest=name, + type=field.type_, + help=description, + ) + + args = parser.parse_args() + settings = Settings(**{k: v for k, v in vars(args).items() if v is not None}) + app = create_app(settings=settings) uvicorn.run( - app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) + app, host=os.getenv("HOST", settings.host), port=int(os.getenv("PORT", settings.port)) ) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py new file mode 100644 index 0000000..ffd07fa --- /dev/null +++ b/llama_cpp/server/app.py @@ -0,0 +1,568 @@ +import json +import multiprocessing +from threading import Lock +from functools import partial +from typing import Iterator, List, Optional, Union, Dict +from typing_extensions import TypedDict, Literal + +import llama_cpp + +import anyio +from anyio.streams.memory import MemoryObjectSendStream +from starlette.concurrency import run_in_threadpool, iterate_in_threadpool +from fastapi import Depends, FastAPI, APIRouter, Request +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel, Field +from pydantic_settings import BaseSettings +from sse_starlette.sse import EventSourceResponse + + +class Settings(BaseSettings): + model: str = Field( + description="The path to the model to use for generating completions." + ) + model_alias: Optional[str] = Field( + default=None, + description="The alias of the model to use for generating completions.", + ) + n_ctx: int = Field(default=2048, ge=1, description="The context size.") + n_gpu_layers: int = Field( + default=0, + ge=0, + description="The number of layers to put on the GPU. The rest will be on the CPU.", + ) + seed: int = Field( + default=1337, description="Random seed. -1 for random." + ) + n_batch: int = Field( + default=512, ge=1, description="The batch size to use per eval." + ) + n_threads: int = Field( + default=max(multiprocessing.cpu_count() // 2, 1), + ge=1, + description="The number of threads to use.", + ) + f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") + use_mlock: bool = Field( + default=llama_cpp.llama_mlock_supported(), + description="Use mlock.", + ) + use_mmap: bool = Field( + default=llama_cpp.llama_mmap_supported(), + description="Use mmap.", + ) + embedding: bool = Field(default=True, description="Whether to use embeddings.") + low_vram: bool = Field( + default=False, + description="Whether to use less VRAM. This will reduce performance.", + ) + last_n_tokens_size: int = Field( + default=64, + ge=0, + description="Last n tokens to keep for repeat penalty calculation.", + ) + logits_all: bool = Field(default=True, description="Whether to return logits.") + cache: bool = Field( + default=False, + description="Use a cache to reduce processing times for evaluated prompts.", + ) + cache_type: Literal["ram", "disk"] = Field( + default="ram", + description="The type of cache to use. Only used if cache is True.", + ) + cache_size: int = Field( + default=2 << 30, + description="The size of the cache in bytes. Only used if cache is True.", + ) + vocab_only: bool = Field( + default=False, description="Whether to only return the vocabulary." + ) + verbose: bool = Field( + default=True, description="Whether to print debug information." + ) + host: str = Field( + default="localhost", description="Listen address" + ) + port: int = Field( + default=8000, description="Listen port" + ) + interrupt_requests: bool = Field( + default=True, + description="Whether to interrupt requests when a new request is received.", + ) + + +router = APIRouter() + +settings: Optional[Settings] = None +llama: Optional[llama_cpp.Llama] = None + + +def create_app(settings: Optional[Settings] = None): + if settings is None: + settings = Settings() + app = FastAPI( + title="🦙 llama.cpp Python API", + version="0.0.1", + ) + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + app.include_router(router) + global llama + llama = llama_cpp.Llama( + model_path=settings.model, + n_gpu_layers=settings.n_gpu_layers, + seed=settings.seed, + f16_kv=settings.f16_kv, + use_mlock=settings.use_mlock, + use_mmap=settings.use_mmap, + embedding=settings.embedding, + logits_all=settings.logits_all, + n_threads=settings.n_threads, + n_batch=settings.n_batch, + n_ctx=settings.n_ctx, + last_n_tokens_size=settings.last_n_tokens_size, + vocab_only=settings.vocab_only, + verbose=settings.verbose, + ) + if settings.cache: + if settings.cache_type == "disk": + if settings.verbose: + print(f"Using disk cache with size {settings.cache_size}") + cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) + else: + if settings.verbose: + print(f"Using ram cache with size {settings.cache_size}") + cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) + + cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) + llama.set_cache(cache) + + def set_settings(_settings: Settings): + global settings + settings = _settings + + set_settings(settings) + return app + + +llama_outer_lock = Lock() +llama_inner_lock = Lock() + + +def get_llama(): + # NOTE: This double lock allows the currently streaming llama model to + # check if any other requests are pending in the same thread and cancel + # the stream if so. + llama_outer_lock.acquire() + release_outer_lock = True + try: + llama_inner_lock.acquire() + try: + llama_outer_lock.release() + release_outer_lock = False + yield llama + finally: + llama_inner_lock.release() + finally: + if release_outer_lock: + llama_outer_lock.release() + + +def get_settings(): + yield settings + + +model_field = Field(description="The model to use for generating completions.") + +max_tokens_field = Field( + default=16, ge=1, le=2048, description="The maximum number of tokens to generate." +) + +temperature_field = Field( + default=0.8, + ge=0.0, + le=2.0, + description="Adjust the randomness of the generated text.\n\n" + + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.", +) + +top_p_field = Field( + default=0.95, + ge=0.0, + le=1.0, + description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.", +) + +stop_field = Field( + default=None, + description="A list of tokens at which to stop generation. If None, no stop tokens are used.", +) + +stream_field = Field( + default=False, + description="Whether to stream the results as they are generated. Useful for chatbots.", +) + +top_k_field = Field( + default=40, + ge=0, + description="Limit the next token selection to the K most probable tokens.\n\n" + + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.", +) + +repeat_penalty_field = Field( + default=1.1, + ge=0.0, + description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", +) + +presence_penalty_field = Field( + default=0.0, + ge=-2.0, + le=2.0, + description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", +) + +frequency_penalty_field = Field( + default=0.0, + ge=-2.0, + le=2.0, + description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", +) + +mirostat_mode_field = Field( + default=0, + ge=0, + le=2, + description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)" +) + +mirostat_tau_field = Field( + default=5.0, + ge=0.0, + le=10.0, + description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text" +) + +mirostat_eta_field = Field( + default=0.1, + ge=0.001, + le=1.0, + description="Mirostat learning rate" +) + + +class CreateCompletionRequest(BaseModel): + prompt: Union[str, List[str]] = Field( + default="", description="The prompt to generate completions for." + ) + suffix: Optional[str] = Field( + default=None, + description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.", + ) + max_tokens: int = max_tokens_field + temperature: float = temperature_field + top_p: float = top_p_field + mirostat_mode: int = mirostat_mode_field + mirostat_tau: float = mirostat_tau_field + mirostat_eta: float = mirostat_eta_field + echo: bool = Field( + default=False, + description="Whether to echo the prompt in the generated text. Useful for chatbots.", + ) + stop: Optional[Union[str, List[str]]] = stop_field + stream: bool = stream_field + logprobs: Optional[int] = Field( + default=None, + ge=0, + description="The number of logprobs to generate. If None, no logprobs are generated.", + ) + presence_penalty: Optional[float] = presence_penalty_field + frequency_penalty: Optional[float] = frequency_penalty_field + logit_bias: Optional[Dict[str, float]] = Field(None) + logprobs: Optional[int] = Field(None) + + # ignored or currently unsupported + model: Optional[str] = model_field + n: Optional[int] = 1 + best_of: Optional[int] = 1 + user: Optional[str] = Field(None) + + # llama.cpp specific parameters + top_k: int = top_k_field + repeat_penalty: float = repeat_penalty_field + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + + class Config: + schema_extra = { + "example": { + "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", + "stop": ["\n", "###"], + } + } + + + + +def make_logit_bias_processor( + llama: llama_cpp.Llama, + logit_bias: Dict[str, float], + logit_bias_type: Optional[Literal["input_ids", "tokens"]], +): + if logit_bias_type is None: + logit_bias_type = "input_ids" + + to_bias: Dict[int, float] = {} + if logit_bias_type == "input_ids": + for input_id, score in logit_bias.items(): + input_id = int(input_id) + to_bias[input_id] = score + + elif logit_bias_type == "tokens": + for token, score in logit_bias.items(): + token = token.encode('utf-8') + for input_id in llama.tokenize(token, add_bos=False): + to_bias[input_id] = score + + def logit_bias_processor( + input_ids: List[int], + scores: List[float], + ) -> List[float]: + new_scores = [None] * len(scores) + for input_id, score in enumerate(scores): + new_scores[input_id] = score + to_bias.get(input_id, 0.0) + + return new_scores + + return logit_bias_processor + + +@router.post( + "/v1/completions", +) +async def create_completion( + request: Request, + body: CreateCompletionRequest, + llama: llama_cpp.Llama = Depends(get_llama), +): + if isinstance(body.prompt, list): + assert len(body.prompt) <= 1 + body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" + + exclude = { + "n", + "best_of", + "logit_bias", + "logit_bias_type", + "user", + } + kwargs = body.dict(exclude=exclude) + + if body.logit_bias is not None: + kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ + make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type), + ]) + + if body.stream: + send_chan, recv_chan = anyio.create_memory_object_stream(10) + + async def event_publisher(inner_send_chan: MemoryObjectSendStream): + async with inner_send_chan: + try: + iterator: Iterator[llama_cpp.CompletionChunk] = await run_in_threadpool(llama, **kwargs) # type: ignore + async for chunk in iterate_in_threadpool(iterator): + await inner_send_chan.send(dict(data=json.dumps(chunk))) + if await request.is_disconnected(): + raise anyio.get_cancelled_exc_class()() + if settings.interrupt_requests and llama_outer_lock.locked(): + await inner_send_chan.send(dict(data="[DONE]")) + raise anyio.get_cancelled_exc_class()() + await inner_send_chan.send(dict(data="[DONE]")) + except anyio.get_cancelled_exc_class() as e: + print("disconnected") + with anyio.move_on_after(1, shield=True): + print( + f"Disconnected from client (via refresh/close) {request.client}" + ) + raise e + + return EventSourceResponse( + recv_chan, data_sender_callable=partial(event_publisher, send_chan) + ) + else: + completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs) # type: ignore + return completion + + +class CreateEmbeddingRequest(BaseModel): + model: Optional[str] = model_field + input: Union[str, List[str]] = Field(description="The input to embed.") + user: Optional[str] + + class Config: + schema_extra = { + "example": { + "input": "The food was delicious and the waiter...", + } + } + + + + +@router.post( + "/v1/embeddings", +) +async def create_embedding( + request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) +): + return await run_in_threadpool( + llama.create_embedding, **request.dict(exclude={"user"}) + ) + + +class ChatCompletionRequestMessage(BaseModel): + role: Literal["system", "user", "assistant"] = Field( + default="user", description="The role of the message." + ) + content: str = Field(default="", description="The content of the message.") + + +class CreateChatCompletionRequest(BaseModel): + messages: List[ChatCompletionRequestMessage] = Field( + default=[], description="A list of messages to generate completions for." + ) + max_tokens: int = max_tokens_field + temperature: float = temperature_field + top_p: float = top_p_field + mirostat_mode: int = mirostat_mode_field + mirostat_tau: float = mirostat_tau_field + mirostat_eta: float = mirostat_eta_field + stop: Optional[List[str]] = stop_field + stream: bool = stream_field + presence_penalty: Optional[float] = presence_penalty_field + frequency_penalty: Optional[float] = frequency_penalty_field + logit_bias: Optional[Dict[str, float]] = Field(None) + + # ignored or currently unsupported + model: Optional[str] = model_field + n: Optional[int] = 1 + user: Optional[str] = Field(None) + + # llama.cpp specific parameters + top_k: int = top_k_field + repeat_penalty: float = repeat_penalty_field + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + + class Config: + schema_extra = { + "example": { + "messages": [ + ChatCompletionRequestMessage( + role="system", content="You are a helpful assistant." + ), + ChatCompletionRequestMessage( + role="user", content="What is the capital of France?" + ), + ] + } + } + + + + +@router.post( + "/v1/chat/completions", +) +async def create_chat_completion( + request: Request, + body: CreateChatCompletionRequest, + llama: llama_cpp.Llama = Depends(get_llama), + settings: Settings = Depends(get_settings), +) -> Union[llama_cpp.ChatCompletion]: # type: ignore + exclude = { + "n", + "logit_bias", + "logit_bias_type", + "user", + } + kwargs = body.dict(exclude=exclude) + + if body.logit_bias is not None: + kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ + make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type), + ]) + + if body.stream: + send_chan, recv_chan = anyio.create_memory_object_stream(10) + + async def event_publisher(inner_send_chan: MemoryObjectSendStream): + async with inner_send_chan: + try: + iterator: Iterator[llama_cpp.ChatCompletionChunk] = await run_in_threadpool(llama.create_chat_completion, **kwargs) # type: ignore + async for chat_chunk in iterate_in_threadpool(iterator): + await inner_send_chan.send(dict(data=json.dumps(chat_chunk))) + if await request.is_disconnected(): + raise anyio.get_cancelled_exc_class()() + if settings.interrupt_requests and llama_outer_lock.locked(): + await inner_send_chan.send(dict(data="[DONE]")) + raise anyio.get_cancelled_exc_class()() + await inner_send_chan.send(dict(data="[DONE]")) + except anyio.get_cancelled_exc_class() as e: + print("disconnected") + with anyio.move_on_after(1, shield=True): + print( + f"Disconnected from client (via refresh/close) {request.client}" + ) + raise e + + return EventSourceResponse( + recv_chan, + data_sender_callable=partial(event_publisher, send_chan), + ) + else: + completion: llama_cpp.ChatCompletion = await run_in_threadpool( + llama.create_chat_completion, **kwargs # type: ignore + ) + return completion + + +class ModelData(TypedDict): + id: str + object: Literal["model"] + owned_by: str + permissions: List[str] + + +class ModelList(TypedDict): + object: Literal["list"] + data: List[ModelData] + + + + +@router.get("/v1/models") +async def get_models( + settings: Settings = Depends(get_settings), +) -> ModelList: + assert llama is not None + return { + "object": "list", + "data": [ + { + "id": settings.model_alias + if settings.model_alias is not None + else llama.model_path, + "object": "model", + "owned_by": "me", + "permissions": [], + } + ], + } diff --git a/poetry.lock b/poetry.lock index 8a74d2f..9d12966 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,57 +1,57 @@ -# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] -name = "attrs" -version = "22.2.0" -description = "Classes Without Boilerplate" -category = "dev" +name = "anyio" +version = "3.6.2" +description = "High level compatibility layer for multiple asynchronous event loop implementations" optional = false -python-versions = ">=3.6" +python-versions = ">=3.6.2" files = [ - {file = "attrs-22.2.0-py3-none-any.whl", hash = "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836"}, - {file = "attrs-22.2.0.tar.gz", hash = "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"}, + {file = "anyio-3.6.2-py3-none-any.whl", hash = "sha256:fbbe32bd270d2a2ef3ed1c5d45041250284e31fc0a4df4a5a6071842051a51e3"}, + {file = "anyio-3.6.2.tar.gz", hash = "sha256:25ea0d673ae30af41a0c442f81cf3b38c7e79fdc7b60335a4c14e05eb0947421"}, ] +[package.dependencies] +idna = ">=2.8" +sniffio = ">=1.1" + [package.extras] -cov = ["attrs[tests]", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] -dev = ["attrs[docs,tests]"] -docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope.interface"] -tests = ["attrs[tests-no-zope]", "zope.interface"] -tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy (>=0.971,<0.990)", "mypy (>=0.971,<0.990)", "pympler", "pympler", "pytest (>=4.3.0)", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-mypy-plugins", "pytest-xdist[psutil]", "pytest-xdist[psutil]"] +doc = ["packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = ["contextlib2", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (<0.15)", "uvloop (>=0.15)"] +trio = ["trio (>=0.16,<0.22)"] [[package]] name = "black" -version = "23.1.0" +version = "23.3.0" description = "The uncompromising code formatter." -category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "black-23.1.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:b6a92a41ee34b883b359998f0c8e6eb8e99803aa8bf3123bf2b2e6fec505a221"}, - {file = "black-23.1.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:57c18c5165c1dbe291d5306e53fb3988122890e57bd9b3dcb75f967f13411a26"}, - {file = "black-23.1.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:9880d7d419bb7e709b37e28deb5e68a49227713b623c72b2b931028ea65f619b"}, - {file = "black-23.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6663f91b6feca5d06f2ccd49a10f254f9298cc1f7f49c46e498a0771b507104"}, - {file = "black-23.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9afd3f493666a0cd8f8df9a0200c6359ac53940cbde049dcb1a7eb6ee2dd7074"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:bfffba28dc52a58f04492181392ee380e95262af14ee01d4bc7bb1b1c6ca8d27"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c1c476bc7b7d021321e7d93dc2cbd78ce103b84d5a4cf97ed535fbc0d6660648"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:382998821f58e5c8238d3166c492139573325287820963d2f7de4d518bd76958"}, - {file = "black-23.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bf649fda611c8550ca9d7592b69f0637218c2369b7744694c5e4902873b2f3a"}, - {file = "black-23.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:121ca7f10b4a01fd99951234abdbd97728e1240be89fde18480ffac16503d481"}, - {file = "black-23.1.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:a8471939da5e824b891b25751955be52ee7f8a30a916d570a5ba8e0f2eb2ecad"}, - {file = "black-23.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8178318cb74f98bc571eef19068f6ab5613b3e59d4f47771582f04e175570ed8"}, - {file = "black-23.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a436e7881d33acaf2536c46a454bb964a50eff59b21b51c6ccf5a40601fbef24"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:a59db0a2094d2259c554676403fa2fac3473ccf1354c1c63eccf7ae65aac8ab6"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:0052dba51dec07ed029ed61b18183942043e00008ec65d5028814afaab9a22fd"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:49f7b39e30f326a34b5c9a4213213a6b221d7ae9d58ec70df1c4a307cf2a1580"}, - {file = "black-23.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:162e37d49e93bd6eb6f1afc3e17a3d23a823042530c37c3c42eeeaf026f38468"}, - {file = "black-23.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b70eb40a78dfac24842458476135f9b99ab952dd3f2dab738c1881a9b38b753"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:a29650759a6a0944e7cca036674655c2f0f63806ddecc45ed40b7b8aa314b651"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:bb460c8561c8c1bec7824ecbc3ce085eb50005883a6203dcfb0122e95797ee06"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c91dfc2c2a4e50df0026f88d2215e166616e0c80e86004d0003ece0488db2739"}, - {file = "black-23.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a951cc83ab535d248c89f300eccbd625e80ab880fbcfb5ac8afb5f01a258ac9"}, - {file = "black-23.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0680d4380db3719ebcfb2613f34e86c8e6d15ffeabcf8ec59355c5e7b85bb555"}, - {file = "black-23.1.0-py3-none-any.whl", hash = "sha256:7a0f701d314cfa0896b9001df70a530eb2472babb76086344e688829efd97d32"}, - {file = "black-23.1.0.tar.gz", hash = "sha256:b0bd97bea8903f5a2ba7219257a44e3f1f9d00073d6cc1add68f0beec69692ac"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"}, + {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"}, + {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"}, + {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"}, + {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"}, + {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"}, + {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"}, + {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"}, + {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"}, + {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"}, + {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"}, + {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"}, + {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"}, + {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"}, ] [package.dependencies] @@ -73,7 +73,6 @@ uvloop = ["uvloop (>=0.15.2)"] name = "bleach" version = "6.0.0" description = "An easy safelist-based HTML-sanitizing tool." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -90,21 +89,19 @@ css = ["tinycss2 (>=1.1.0,<1.2)"] [[package]] name = "certifi" -version = "2022.12.7" +version = "2023.5.7" description = "Python package for providing Mozilla's CA Bundle." -category = "dev" optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"}, - {file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"}, + {file = "certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716"}, + {file = "certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"}, ] [[package]] name = "cffi" version = "1.15.1" description = "Foreign Function Interface for Python calling C code." -category = "dev" optional = false python-versions = "*" files = [ @@ -181,7 +178,6 @@ pycparser = "*" name = "charset-normalizer" version = "3.1.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -category = "dev" optional = false python-versions = ">=3.7.0" files = [ @@ -266,7 +262,6 @@ files = [ name = "click" version = "8.1.3" description = "Composable command line interface toolkit" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -281,7 +276,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." -category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -291,35 +285,30 @@ files = [ [[package]] name = "cryptography" -version = "39.0.2" +version = "40.0.2" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." -category = "dev" optional = false python-versions = ">=3.6" files = [ - {file = "cryptography-39.0.2-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:2725672bb53bb92dc7b4150d233cd4b8c59615cd8288d495eaa86db00d4e5c06"}, - {file = "cryptography-39.0.2-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:23df8ca3f24699167daf3e23e51f7ba7334d504af63a94af468f468b975b7dd7"}, - {file = "cryptography-39.0.2-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:eb40fe69cfc6f5cdab9a5ebd022131ba21453cf7b8a7fd3631f45bbf52bed612"}, - {file = "cryptography-39.0.2-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc0521cce2c1d541634b19f3ac661d7a64f9555135e9d8af3980965be717fd4a"}, - {file = "cryptography-39.0.2-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffd394c7896ed7821a6d13b24657c6a34b6e2650bd84ae063cf11ccffa4f1a97"}, - {file = "cryptography-39.0.2-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:e8a0772016feeb106efd28d4a328e77dc2edae84dfbac06061319fdb669ff828"}, - {file = "cryptography-39.0.2-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:8f35c17bd4faed2bc7797d2a66cbb4f986242ce2e30340ab832e5d99ae60e011"}, - {file = "cryptography-39.0.2-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:b49a88ff802e1993b7f749b1eeb31134f03c8d5c956e3c125c75558955cda536"}, - {file = "cryptography-39.0.2-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5f8c682e736513db7d04349b4f6693690170f95aac449c56f97415c6980edef5"}, - {file = "cryptography-39.0.2-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:d7d84a512a59f4412ca8549b01f94be4161c94efc598bf09d027d67826beddc0"}, - {file = "cryptography-39.0.2-cp36-abi3-win32.whl", hash = "sha256:c43ac224aabcbf83a947eeb8b17eaf1547bce3767ee2d70093b461f31729a480"}, - {file = "cryptography-39.0.2-cp36-abi3-win_amd64.whl", hash = "sha256:788b3921d763ee35dfdb04248d0e3de11e3ca8eb22e2e48fef880c42e1f3c8f9"}, - {file = "cryptography-39.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:d15809e0dbdad486f4ad0979753518f47980020b7a34e9fc56e8be4f60702fac"}, - {file = "cryptography-39.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:50cadb9b2f961757e712a9737ef33d89b8190c3ea34d0fb6675e00edbe35d074"}, - {file = "cryptography-39.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:103e8f7155f3ce2ffa0049fe60169878d47a4364b277906386f8de21c9234aa1"}, - {file = "cryptography-39.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:6236a9610c912b129610eb1a274bdc1350b5df834d124fa84729ebeaf7da42c3"}, - {file = "cryptography-39.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e944fe07b6f229f4c1a06a7ef906a19652bdd9fd54c761b0ff87e83ae7a30354"}, - {file = "cryptography-39.0.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:35d658536b0a4117c885728d1a7032bdc9a5974722ae298d6c533755a6ee3915"}, - {file = "cryptography-39.0.2-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:30b1d1bfd00f6fc80d11300a29f1d8ab2b8d9febb6ed4a38a76880ec564fae84"}, - {file = "cryptography-39.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:e029b844c21116564b8b61216befabca4b500e6816fa9f0ba49527653cae2108"}, - {file = "cryptography-39.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fa507318e427169ade4e9eccef39e9011cdc19534f55ca2f36ec3f388c1f70f3"}, - {file = "cryptography-39.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:8bc0008ef798231fac03fe7d26e82d601d15bd16f3afaad1c6113771566570f3"}, - {file = "cryptography-39.0.2.tar.gz", hash = "sha256:bc5b871e977c8ee5a1bbc42fa8d19bcc08baf0c51cbf1586b0e87a2694dde42f"}, + {file = "cryptography-40.0.2-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:8f79b5ff5ad9d3218afb1e7e20ea74da5f76943ee5edb7f76e56ec5161ec782b"}, + {file = "cryptography-40.0.2-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:05dc219433b14046c476f6f09d7636b92a1c3e5808b9a6536adf4932b3b2c440"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4df2af28d7bedc84fe45bd49bc35d710aede676e2a4cb7fc6d103a2adc8afe4d"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dcca15d3a19a66e63662dc8d30f8036b07be851a8680eda92d079868f106288"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:a04386fb7bc85fab9cd51b6308633a3c271e3d0d3eae917eebab2fac6219b6d2"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:adc0d980fd2760c9e5de537c28935cc32b9353baaf28e0814df417619c6c8c3b"}, + {file = "cryptography-40.0.2-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d5a1bd0e9e2031465761dfa920c16b0065ad77321d8a8c1f5ee331021fda65e9"}, + {file = "cryptography-40.0.2-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:a95f4802d49faa6a674242e25bfeea6fc2acd915b5e5e29ac90a32b1139cae1c"}, + {file = "cryptography-40.0.2-cp36-abi3-win32.whl", hash = "sha256:aecbb1592b0188e030cb01f82d12556cf72e218280f621deed7d806afd2113f9"}, + {file = "cryptography-40.0.2-cp36-abi3-win_amd64.whl", hash = "sha256:b12794f01d4cacfbd3177b9042198f3af1c856eedd0a98f10f141385c809a14b"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:142bae539ef28a1c76794cca7f49729e7c54423f615cfd9b0b1fa90ebe53244b"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:956ba8701b4ffe91ba59665ed170a2ebbdc6fc0e40de5f6059195d9f2b33ca0e"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4f01c9863da784558165f5d4d916093737a75203a5c5286fde60e503e4276c7a"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:3daf9b114213f8ba460b829a02896789751626a2a4e7a43a28ee77c04b5e4958"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:48f388d0d153350f378c7f7b41497a54ff1513c816bcbbcafe5b829e59b9ce5b"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c0764e72b36a3dc065c155e5b22f93df465da9c39af65516fe04ed3c68c92636"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:cbaba590180cba88cb99a5f76f90808a624f18b169b90a4abb40c1fd8c19420e"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7a38250f433cd41df7fcb763caa3ee9362777fdb4dc642b9a349721d2bf47404"}, + {file = "cryptography-40.0.2.tar.gz", hash = "sha256:c33c0d32b8594fa647d2e01dbccc303478e16fdd7cf98652d5b3ed11aa5e5c99"}, ] [package.dependencies] @@ -328,30 +317,50 @@ cffi = ">=1.12" [package.extras] docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] -pep8test = ["black", "check-manifest", "mypy", "ruff", "types-pytz", "types-requests"] +pep8test = ["black", "check-manifest", "mypy", "ruff"] sdist = ["setuptools-rust (>=0.11.4)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist", "pytz"] +test = ["iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist"] test-randomorder = ["pytest-randomly"] tox = ["tox"] +[[package]] +name = "diskcache" +version = "5.6.1" +description = "Disk Cache -- Disk and file backed persistent cache." +optional = false +python-versions = ">=3" +files = [ + {file = "diskcache-5.6.1-py3-none-any.whl", hash = "sha256:558c6a2d5d7c721bb00e40711803d6804850c9f76c426ed81ecc627fe9d2ce2d"}, + {file = "diskcache-5.6.1.tar.gz", hash = "sha256:e4c978532feff5814c4cc00fe1e11e40501985946643d73220d41ee7737c72c3"}, +] + +[[package]] +name = "distro" +version = "1.8.0" +description = "Distro - an OS platform information API" +optional = false +python-versions = ">=3.6" +files = [ + {file = "distro-1.8.0-py3-none-any.whl", hash = "sha256:99522ca3e365cac527b44bde033f64c6945d90eb9f769703caaec52b09bbd3ff"}, + {file = "distro-1.8.0.tar.gz", hash = "sha256:02e111d1dc6a50abb8eed6bf31c3e48ed8b0830d1ea2a1b78c61765c2513fdd8"}, +] + [[package]] name = "docutils" -version = "0.19" +version = "0.20" description = "Docutils -- Python Documentation Utilities" -category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "docutils-0.19-py3-none-any.whl", hash = "sha256:5e1de4d849fee02c63b040a4a3fd567f4ab104defd8a5511fbbc24a8a017efbc"}, - {file = "docutils-0.19.tar.gz", hash = "sha256:33995a6753c30b7f577febfc2c50411fec6aac7f7ffeb7c4cfe5991072dcf9e6"}, + {file = "docutils-0.20-py3-none-any.whl", hash = "sha256:a428f10de4de4774389734c986a01b4af2d802d26717108b0f1b9356862937c5"}, + {file = "docutils-0.20.tar.gz", hash = "sha256:f75a5a52fbcacd81b47e42888ad2b380748aaccfb3f13af0fe69deb759f01eb6"}, ] [[package]] name = "exceptiongroup" version = "1.1.1" description = "Backport of PEP 654 (exception groups)" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -362,11 +371,29 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "fastapi" +version = "0.99.1" +description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" +optional = true +python-versions = ">=3.7" +files = [ + {file = "fastapi-0.99.1-py3-none-any.whl", hash = "sha256:976df7bab51ac7beda9f68c4513b8c4490b5c1135c72aafd0a5ee4023ec5282e"}, + {file = "fastapi-0.99.1.tar.gz", hash = "sha256:ac78f717cd80d657bd183f94d33b9bda84aa376a46a9dab513586b8eef1dc6fc"}, +] + +[package.dependencies] +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0" +starlette = ">=0.27.0,<0.28.0" +typing-extensions = ">=4.5.0" + +[package.extras] +all = ["email-validator (>=1.1.1)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "python-multipart (>=0.0.5)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] + [[package]] name = "ghp-import" version = "2.1.0" description = "Copy your docs directly to the gh-pages branch." -category = "dev" optional = false python-versions = "*" files = [ @@ -382,27 +409,77 @@ dev = ["flake8", "markdown", "twine", "wheel"] [[package]] name = "griffe" -version = "0.25.5" +version = "0.27.3" description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." -category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "griffe-0.25.5-py3-none-any.whl", hash = "sha256:1fb9edff48e66d4873014a2ebf21aca5f271d0006a4c937826e3cf592ffb3706"}, - {file = "griffe-0.25.5.tar.gz", hash = "sha256:11ea3403ef0560a1cbcf7f302eb5d21cf4c1d8ed3f8a16a75aa9f6f458caf3f1"}, + {file = "griffe-0.27.3-py3-none-any.whl", hash = "sha256:094513b209d4acd4b2680c2415d3af5f8ed925714795380c2a7d070e222e0b27"}, + {file = "griffe-0.27.3.tar.gz", hash = "sha256:a3d0f75aa76b80f181f818cf605f658a69fccf287aaeeeafc7a6cf4e6a2ca27e"}, ] [package.dependencies] colorama = ">=0.4" +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + +[[package]] +name = "httpcore" +version = "0.17.0" +description = "A minimal low-level HTTP client." +optional = false +python-versions = ">=3.7" +files = [ + {file = "httpcore-0.17.0-py3-none-any.whl", hash = "sha256:0fdfea45e94f0c9fd96eab9286077f9ff788dd186635ae61b312693e4d943599"}, + {file = "httpcore-0.17.0.tar.gz", hash = "sha256:cc045a3241afbf60ce056202301b4d8b6af08845e3294055eb26b09913ef903c"}, +] + +[package.dependencies] +anyio = ">=3.0,<5.0" +certifi = "*" +h11 = ">=0.13,<0.15" +sniffio = "==1.*" + [package.extras] -async = ["aiofiles (>=0.7,<1.0)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] + +[[package]] +name = "httpx" +version = "0.24.1" +description = "The next generation HTTP client." +optional = false +python-versions = ">=3.7" +files = [ + {file = "httpx-0.24.1-py3-none-any.whl", hash = "sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd"}, + {file = "httpx-0.24.1.tar.gz", hash = "sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"}, +] + +[package.dependencies] +certifi = "*" +httpcore = ">=0.15.0,<0.18.0" +idna = "*" +sniffio = "*" + +[package.extras] +brotli = ["brotli", "brotlicffi"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] [[package]] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -412,14 +489,13 @@ files = [ [[package]] name = "importlib-metadata" -version = "6.1.0" +version = "6.6.0" description = "Read metadata from Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "importlib_metadata-6.1.0-py3-none-any.whl", hash = "sha256:ff80f3b5394912eb1b108fcfd444dc78b7f1f3e16b16188054bd01cb9cb86f09"}, - {file = "importlib_metadata-6.1.0.tar.gz", hash = "sha256:43ce9281e097583d758c2c708c4376371261a02c34682491a8e98352365aad20"}, + {file = "importlib_metadata-6.6.0-py3-none-any.whl", hash = "sha256:43dd286a2cd8995d5eaef7fee2066340423b818ed3fd70adf0bad5f1fac53fed"}, + {file = "importlib_metadata-6.6.0.tar.gz", hash = "sha256:92501cdf9cc66ebd3e612f1b4f0c0765dfa42f0fa38ffb319b6bd84dd675d705"}, ] [package.dependencies] @@ -434,7 +510,6 @@ testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packag name = "importlib-resources" version = "5.12.0" description = "Read resources from Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -453,7 +528,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -465,7 +539,6 @@ files = [ name = "jaraco-classes" version = "3.2.3" description = "Utility functions for Python class constructs" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -484,7 +557,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "jeepney" version = "0.8.0" description = "Low-level, pure Python DBus protocol wrapper." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -500,7 +572,6 @@ trio = ["async_generator", "trio"] name = "jinja2" version = "3.1.2" description = "A very fast and expressive template engine." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -518,7 +589,6 @@ i18n = ["Babel (>=2.7)"] name = "keyring" version = "23.13.1" description = "Store and access your passwords safely." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -543,7 +613,6 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "markdown" version = "3.3.7" description = "Python implementation of Markdown." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -561,7 +630,6 @@ testing = ["coverage", "pyyaml"] name = "markdown-it-py" version = "2.2.0" description = "Python port of markdown-it. Markdown parsing, done right!" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -586,7 +654,6 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] name = "markupsafe" version = "2.1.2" description = "Safely add untrusted strings to HTML/XML markup." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -646,7 +713,6 @@ files = [ name = "mdurl" version = "0.1.2" description = "Markdown URL utilities" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -658,7 +724,6 @@ files = [ name = "mergedeep" version = "1.3.4" description = "A deep merge function for 🐍." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -668,14 +733,13 @@ files = [ [[package]] name = "mkdocs" -version = "1.4.2" +version = "1.4.3" description = "Project documentation with Markdown." -category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs-1.4.2-py3-none-any.whl", hash = "sha256:c8856a832c1e56702577023cd64cc5f84948280c1c0fcc6af4cd39006ea6aa8c"}, - {file = "mkdocs-1.4.2.tar.gz", hash = "sha256:8947af423a6d0facf41ea1195b8e1e8c85ad94ac95ae307fe11232e0424b11c5"}, + {file = "mkdocs-1.4.3-py3-none-any.whl", hash = "sha256:6ee46d309bda331aac915cd24aab882c179a933bd9e77b80ce7d2eaaa3f689dd"}, + {file = "mkdocs-1.4.3.tar.gz", hash = "sha256:5955093bbd4dd2e9403c5afaf57324ad8b04f16886512a3ee6ef828956481c57"}, ] [package.dependencies] @@ -699,7 +763,6 @@ min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-imp name = "mkdocs-autorefs" version = "0.4.1" description = "Automatically link across pages in MkDocs." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -713,14 +776,13 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.4" +version = "9.1.18" description = "Documentation that simply works" -category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.4-py3-none-any.whl", hash = "sha256:4c92dcf9365068259bef3eed8e0dd5410056b6f7187bdea2d52848c0f94cd94c"}, - {file = "mkdocs_material-9.1.4.tar.gz", hash = "sha256:c3a8943e9e4a7d2624291da365bbccf0b9f88688aa6947a46260d8c165cd4389"}, + {file = "mkdocs_material-9.1.18-py3-none-any.whl", hash = "sha256:5bcf8fb79ac2f253c0ffe93fa181cba87718c6438f459dc4180ac7418cc9a450"}, + {file = "mkdocs_material-9.1.18.tar.gz", hash = "sha256:981dd39979723d4cda7cfc77bbbe5e54922d5761a7af23fb8ba9edb52f114b13"}, ] [package.dependencies] @@ -738,7 +800,6 @@ requests = ">=2.26" name = "mkdocs-material-extensions" version = "1.1.1" description = "Extension pack for Python Markdown and MkDocs Material." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -748,17 +809,17 @@ files = [ [[package]] name = "mkdocstrings" -version = "0.20.0" +version = "0.22.0" description = "Automatic documentation from sources, for MkDocs." -category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocstrings-0.20.0-py3-none-any.whl", hash = "sha256:f17fc2c4f760ec302b069075ef9e31045aa6372ca91d2f35ded3adba8e25a472"}, - {file = "mkdocstrings-0.20.0.tar.gz", hash = "sha256:c757f4f646d4f939491d6bc9256bfe33e36c5f8026392f49eaa351d241c838e5"}, + {file = "mkdocstrings-0.22.0-py3-none-any.whl", hash = "sha256:2d4095d461554ff6a778fdabdca3c00c468c2f1459d469f7a7f622a2b23212ba"}, + {file = "mkdocstrings-0.22.0.tar.gz", hash = "sha256:82a33b94150ebb3d4b5c73bab4598c3e21468c79ec072eff6931c8f3bfc38256"}, ] [package.dependencies] +importlib-metadata = {version = ">=4.6", markers = "python_version < \"3.10\""} Jinja2 = ">=2.11.1" Markdown = ">=3.3" MarkupSafe = ">=1.1" @@ -766,6 +827,7 @@ mkdocs = ">=1.2" mkdocs-autorefs = ">=0.3.1" mkdocstrings-python = {version = ">=0.5.2", optional = true, markers = "extra == \"python\""} pymdown-extensions = ">=6.3" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.10\""} [package.extras] crystal = ["mkdocstrings-crystal (>=0.3.4)"] @@ -774,25 +836,23 @@ python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] [[package]] name = "mkdocstrings-python" -version = "0.8.3" +version = "0.10.1" description = "A Python handler for mkdocstrings." -category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocstrings-python-0.8.3.tar.gz", hash = "sha256:9ae473f6dc599339b09eee17e4d2b05d6ac0ec29860f3fc9b7512d940fc61adf"}, - {file = "mkdocstrings_python-0.8.3-py3-none-any.whl", hash = "sha256:4e6e1cd6f37a785de0946ced6eb846eb2f5d891ac1cc2c7b832943d3529087a7"}, + {file = "mkdocstrings_python-0.10.1-py3-none-any.whl", hash = "sha256:ef239cee2c688e2b949a0a47e42a141d744dd12b7007311b3309dc70e3bafc5c"}, + {file = "mkdocstrings_python-0.10.1.tar.gz", hash = "sha256:b72301fff739070ec517b5b36bf2f7c49d1360a275896a64efb97fc17d3f3968"}, ] [package.dependencies] griffe = ">=0.24" -mkdocstrings = ">=0.19" +mkdocstrings = ">=0.20" [[package]] name = "more-itertools" version = "9.1.0" description = "More routines for operating on iterables, beyond itertools" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -804,7 +864,6 @@ files = [ name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -812,23 +871,58 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] +[[package]] +name = "numpy" +version = "1.24.4" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"}, + {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6"}, + {file = "numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc"}, + {file = "numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5"}, + {file = "numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d"}, + {file = "numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc"}, + {file = "numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2"}, + {file = "numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d"}, + {file = "numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835"}, + {file = "numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2"}, + {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"}, +] + [[package]] name = "packaging" -version = "23.0" +version = "23.1" description = "Core utilities for Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"}, - {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, + {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, + {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] [[package]] name = "pathspec" version = "0.11.1" description = "Utility library for gitignore style pattern matching of file paths." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -840,7 +934,6 @@ files = [ name = "pkginfo" version = "1.9.6" description = "Query metadata from sdists / bdists / installed packages." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -853,25 +946,23 @@ testing = ["pytest", "pytest-cov"] [[package]] name = "platformdirs" -version = "3.1.1" +version = "3.5.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "platformdirs-3.1.1-py3-none-any.whl", hash = "sha256:e5986afb596e4bb5bde29a79ac9061aa955b94fca2399b7aaac4090860920dd8"}, - {file = "platformdirs-3.1.1.tar.gz", hash = "sha256:024996549ee88ec1a9aa99ff7f8fc819bb59e2c3477b410d90a16d32d6e707aa"}, + {file = "platformdirs-3.5.0-py3-none-any.whl", hash = "sha256:47692bc24c1958e8b0f13dd727307cff1db103fca36399f457da8e05f222fdc4"}, + {file = "platformdirs-3.5.0.tar.gz", hash = "sha256:7954a68d0ba23558d753f73437c55f89027cf8f5108c19844d4b82e5af396335"}, ] [package.extras] -docs = ["furo (>=2022.12.7)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.22,!=1.23.4)"] -test = ["appdirs (==1.4.4)", "covdefaults (>=2.2.2)", "pytest (>=7.2.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] +docs = ["furo (>=2023.3.27)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] [[package]] name = "pluggy" version = "1.0.0" description = "plugin and hook calling mechanisms for python" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -887,7 +978,6 @@ testing = ["pytest", "pytest-benchmark"] name = "pycparser" version = "2.21" description = "C parser in Python" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -896,15 +986,66 @@ files = [ ] [[package]] -name = "pygments" -version = "2.14.0" -description = "Pygments is a syntax highlighting package written in Python." -category = "dev" -optional = false -python-versions = ">=3.6" +name = "pydantic" +version = "1.10.7" +description = "Data validation and settings management using python type hints" +optional = true +python-versions = ">=3.7" files = [ - {file = "Pygments-2.14.0-py3-none-any.whl", hash = "sha256:fa7bd7bd2771287c0de303af8bfdfc731f51bd2c6a47ab69d117138893b82717"}, - {file = "Pygments-2.14.0.tar.gz", hash = "sha256:b3ed06a9e8ac9a9aae5a6f5dbe78a8a58655d17b43b93c078f094ddc476ae297"}, + {file = "pydantic-1.10.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e79e999e539872e903767c417c897e729e015872040e56b96e67968c3b918b2d"}, + {file = "pydantic-1.10.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:01aea3a42c13f2602b7ecbbea484a98169fb568ebd9e247593ea05f01b884b2e"}, + {file = "pydantic-1.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:516f1ed9bc2406a0467dd777afc636c7091d71f214d5e413d64fef45174cfc7a"}, + {file = "pydantic-1.10.7-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae150a63564929c675d7f2303008d88426a0add46efd76c3fc797cd71cb1b46f"}, + {file = "pydantic-1.10.7-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ecbbc51391248116c0a055899e6c3e7ffbb11fb5e2a4cd6f2d0b93272118a209"}, + {file = "pydantic-1.10.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f4a2b50e2b03d5776e7f21af73e2070e1b5c0d0df255a827e7c632962f8315af"}, + {file = "pydantic-1.10.7-cp310-cp310-win_amd64.whl", hash = "sha256:a7cd2251439988b413cb0a985c4ed82b6c6aac382dbaff53ae03c4b23a70e80a"}, + {file = "pydantic-1.10.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:68792151e174a4aa9e9fc1b4e653e65a354a2fa0fed169f7b3d09902ad2cb6f1"}, + {file = "pydantic-1.10.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe2507b8ef209da71b6fb5f4e597b50c5a34b78d7e857c4f8f3115effaef5fe"}, + {file = "pydantic-1.10.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10a86d8c8db68086f1e30a530f7d5f83eb0685e632e411dbbcf2d5c0150e8dcd"}, + {file = "pydantic-1.10.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d75ae19d2a3dbb146b6f324031c24f8a3f52ff5d6a9f22f0683694b3afcb16fb"}, + {file = "pydantic-1.10.7-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:464855a7ff7f2cc2cf537ecc421291b9132aa9c79aef44e917ad711b4a93163b"}, + {file = "pydantic-1.10.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:193924c563fae6ddcb71d3f06fa153866423ac1b793a47936656e806b64e24ca"}, + {file = "pydantic-1.10.7-cp311-cp311-win_amd64.whl", hash = "sha256:b4a849d10f211389502059c33332e91327bc154acc1845f375a99eca3afa802d"}, + {file = "pydantic-1.10.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cc1dde4e50a5fc1336ee0581c1612215bc64ed6d28d2c7c6f25d2fe3e7c3e918"}, + {file = "pydantic-1.10.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0cfe895a504c060e5d36b287ee696e2fdad02d89e0d895f83037245218a87fe"}, + {file = "pydantic-1.10.7-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:670bb4683ad1e48b0ecb06f0cfe2178dcf74ff27921cdf1606e527d2617a81ee"}, + {file = "pydantic-1.10.7-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:950ce33857841f9a337ce07ddf46bc84e1c4946d2a3bba18f8280297157a3fd1"}, + {file = "pydantic-1.10.7-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c15582f9055fbc1bfe50266a19771bbbef33dd28c45e78afbe1996fd70966c2a"}, + {file = "pydantic-1.10.7-cp37-cp37m-win_amd64.whl", hash = "sha256:82dffb306dd20bd5268fd6379bc4bfe75242a9c2b79fec58e1041fbbdb1f7914"}, + {file = "pydantic-1.10.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8c7f51861d73e8b9ddcb9916ae7ac39fb52761d9ea0df41128e81e2ba42886cd"}, + {file = "pydantic-1.10.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6434b49c0b03a51021ade5c4daa7d70c98f7a79e95b551201fff682fc1661245"}, + {file = "pydantic-1.10.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64d34ab766fa056df49013bb6e79921a0265204c071984e75a09cbceacbbdd5d"}, + {file = "pydantic-1.10.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:701daea9ffe9d26f97b52f1d157e0d4121644f0fcf80b443248434958fd03dc3"}, + {file = "pydantic-1.10.7-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:cf135c46099ff3f919d2150a948ce94b9ce545598ef2c6c7bf55dca98a304b52"}, + {file = "pydantic-1.10.7-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b0f85904f73161817b80781cc150f8b906d521fa11e3cdabae19a581c3606209"}, + {file = "pydantic-1.10.7-cp38-cp38-win_amd64.whl", hash = "sha256:9f6f0fd68d73257ad6685419478c5aece46432f4bdd8d32c7345f1986496171e"}, + {file = "pydantic-1.10.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c230c0d8a322276d6e7b88c3f7ce885f9ed16e0910354510e0bae84d54991143"}, + {file = "pydantic-1.10.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:976cae77ba6a49d80f461fd8bba183ff7ba79f44aa5cfa82f1346b5626542f8e"}, + {file = "pydantic-1.10.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d45fc99d64af9aaf7e308054a0067fdcd87ffe974f2442312372dfa66e1001d"}, + {file = "pydantic-1.10.7-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2a5ebb48958754d386195fe9e9c5106f11275867051bf017a8059410e9abf1f"}, + {file = "pydantic-1.10.7-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:abfb7d4a7cd5cc4e1d1887c43503a7c5dd608eadf8bc615413fc498d3e4645cd"}, + {file = "pydantic-1.10.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:80b1fab4deb08a8292d15e43a6edccdffa5377a36a4597bb545b93e79c5ff0a5"}, + {file = "pydantic-1.10.7-cp39-cp39-win_amd64.whl", hash = "sha256:d71e69699498b020ea198468e2480a2f1e7433e32a3a99760058c6520e2bea7e"}, + {file = "pydantic-1.10.7-py3-none-any.whl", hash = "sha256:0cd181f1d0b1d00e2b705f1bf1ac7799a2d938cce3376b8007df62b29be3c2c6"}, + {file = "pydantic-1.10.7.tar.gz", hash = "sha256:cfc83c0678b6ba51b0532bea66860617c4cd4251ecf76e9846fa5a9f3454e97e"}, +] + +[package.dependencies] +typing-extensions = ">=4.2.0" + +[package.extras] +dotenv = ["python-dotenv (>=0.10.4)"] +email = ["email-validator (>=1.0.3)"] + +[[package]] +name = "pygments" +version = "2.15.1" +description = "Pygments is a syntax highlighting package written in Python." +optional = false +python-versions = ">=3.7" +files = [ + {file = "Pygments-2.15.1-py3-none-any.whl", hash = "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1"}, + {file = "Pygments-2.15.1.tar.gz", hash = "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c"}, ] [package.extras] @@ -912,14 +1053,13 @@ plugins = ["importlib-metadata"] [[package]] name = "pymdown-extensions" -version = "9.10" +version = "9.11" description = "Extension pack for Python Markdown." -category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "pymdown_extensions-9.10-py3-none-any.whl", hash = "sha256:31eaa76ce6f96aabfcea98787c2fff2c5c0611b20a53a94213970cfbf05f02b8"}, - {file = "pymdown_extensions-9.10.tar.gz", hash = "sha256:562c38eee4ce3f101ce631b804bfc2177a8a76c7e4dc908871fb6741a90257a7"}, + {file = "pymdown_extensions-9.11-py3-none-any.whl", hash = "sha256:a499191d8d869f30339de86fcf072a787e86c42b6f16f280f5c2cf174182b7f3"}, + {file = "pymdown_extensions-9.11.tar.gz", hash = "sha256:f7e86c1d3981f23d9dc43294488ecb54abadd05b0be4bf8f0e15efc90f7853ff"}, ] [package.dependencies] @@ -928,18 +1068,16 @@ pyyaml = "*" [[package]] name = "pytest" -version = "7.2.2" +version = "7.4.0" description = "pytest: simple powerful testing with Python" -category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.2.2-py3-none-any.whl", hash = "sha256:130328f552dcfac0b1cec75c12e3f005619dc5f874f0a06e8ff7263f0ee6225e"}, - {file = "pytest-7.2.2.tar.gz", hash = "sha256:c99ab0c73aceb050f68929bc93af19ab6db0558791c6a0715723abe9d0ade9d4"}, + {file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"}, + {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"}, ] [package.dependencies] -attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" @@ -948,13 +1086,12 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "python-dateutil" version = "2.8.2" description = "Extensions to the standard Python datetime module" -category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ @@ -969,7 +1106,6 @@ six = ">=1.5" name = "pywin32-ctypes" version = "0.2.0" description = "" -category = "dev" optional = false python-versions = "*" files = [ @@ -981,7 +1117,6 @@ files = [ name = "pyyaml" version = "6.0" description = "YAML parser and emitter for Python" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1031,7 +1166,6 @@ files = [ name = "pyyaml-env-tag" version = "0.1" description = "A custom YAML tag for referencing environment variables in YAML files. " -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1046,7 +1180,6 @@ pyyaml = "*" name = "readme-renderer" version = "37.3" description = "readme_renderer is a library for rendering \"readme\" descriptions for Warehouse" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1064,91 +1197,117 @@ md = ["cmarkgfm (>=0.8.0)"] [[package]] name = "regex" -version = "2023.3.23" +version = "2023.5.5" description = "Alternative regular expression module, to replace re." -category = "dev" optional = false -python-versions = ">=3.8" +python-versions = ">=3.6" files = [ - {file = "regex-2023.3.23-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:845a5e2d84389c4ddada1a9b95c055320070f18bb76512608374aca00d22eca8"}, - {file = "regex-2023.3.23-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:87d9951f5a538dd1d016bdc0dcae59241d15fa94860964833a54d18197fcd134"}, - {file = "regex-2023.3.23-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37ae17d3be44c0b3f782c28ae9edd8b47c1f1776d4cabe87edc0b98e1f12b021"}, - {file = "regex-2023.3.23-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0b8eb1e3bca6b48dc721818a60ae83b8264d4089a4a41d62be6d05316ec38e15"}, - {file = "regex-2023.3.23-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df45fac182ebc3c494460c644e853515cc24f5ad9da05f8ffb91da891bfee879"}, - {file = "regex-2023.3.23-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7006105b10b59971d3b248ad75acc3651c7e4cf54d81694df5a5130a3c3f7ea"}, - {file = "regex-2023.3.23-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:93f3f1aa608380fe294aa4cb82e2afda07a7598e828d0341e124b8fd9327c715"}, - {file = "regex-2023.3.23-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:787954f541ab95d8195d97b0b8cf1dc304424adb1e07365967e656b92b38a699"}, - {file = "regex-2023.3.23-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:20abe0bdf03630fe92ccafc45a599bca8b3501f48d1de4f7d121153350a2f77d"}, - {file = "regex-2023.3.23-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:11d00c31aeab9a6e0503bc77e73ed9f4527b3984279d997eb145d7c7be6268fd"}, - {file = "regex-2023.3.23-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:d5bbe0e1511b844794a3be43d6c145001626ba9a6c1db8f84bdc724e91131d9d"}, - {file = "regex-2023.3.23-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ea3c0cb56eadbf4ab2277e7a095676370b3e46dbfc74d5c383bd87b0d6317910"}, - {file = "regex-2023.3.23-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d895b4c863059a4934d3e874b90998df774644a41b349ebb330f85f11b4ef2c0"}, - {file = "regex-2023.3.23-cp310-cp310-win32.whl", hash = "sha256:9d764514d19b4edcc75fd8cb1423448ef393e8b6cbd94f38cab983ab1b75855d"}, - {file = "regex-2023.3.23-cp310-cp310-win_amd64.whl", hash = "sha256:11d1f2b7a0696dc0310de0efb51b1f4d813ad4401fe368e83c0c62f344429f98"}, - {file = "regex-2023.3.23-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8a9c63cde0eaa345795c0fdeb19dc62d22e378c50b0bc67bf4667cd5b482d98b"}, - {file = "regex-2023.3.23-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dd7200b4c27b68cf9c9646da01647141c6db09f48cc5b51bc588deaf8e98a797"}, - {file = "regex-2023.3.23-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22720024b90a6ba673a725dcc62e10fb1111b889305d7c6b887ac7466b74bedb"}, - {file = "regex-2023.3.23-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b190a339090e6af25f4a5fd9e77591f6d911cc7b96ecbb2114890b061be0ac1"}, - {file = "regex-2023.3.23-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e76b6fc0d8e9efa39100369a9b3379ce35e20f6c75365653cf58d282ad290f6f"}, - {file = "regex-2023.3.23-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7868b8f218bf69a2a15402fde08b08712213a1f4b85a156d90473a6fb6b12b09"}, - {file = "regex-2023.3.23-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2472428efc4127374f494e570e36b30bb5e6b37d9a754f7667f7073e43b0abdd"}, - {file = "regex-2023.3.23-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c37df2a060cb476d94c047b18572ee2b37c31f831df126c0da3cd9227b39253d"}, - {file = "regex-2023.3.23-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4479f9e2abc03362df4045b1332d4a2b7885b245a30d4f4b051c4083b97d95d8"}, - {file = "regex-2023.3.23-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e2396e0678167f2d0c197da942b0b3fb48fee2f0b5915a0feb84d11b6686afe6"}, - {file = "regex-2023.3.23-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:75f288c60232a5339e0ff2fa05779a5e9c74e9fc085c81e931d4a264501e745b"}, - {file = "regex-2023.3.23-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c869260aa62cee21c5eb171a466c0572b5e809213612ef8d495268cd2e34f20d"}, - {file = "regex-2023.3.23-cp311-cp311-win32.whl", hash = "sha256:25f0532fd0c53e96bad84664171969de9673b4131f2297f1db850d3918d58858"}, - {file = "regex-2023.3.23-cp311-cp311-win_amd64.whl", hash = "sha256:5ccfafd98473e007cebf7da10c1411035b7844f0f204015efd050601906dbb53"}, - {file = "regex-2023.3.23-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6572ff287176c0fb96568adb292674b421fa762153ed074d94b1d939ed92c253"}, - {file = "regex-2023.3.23-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a610e0adfcb0fc84ea25f6ea685e39e74cbcd9245a72a9a7aab85ff755a5ed27"}, - {file = "regex-2023.3.23-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:086afe222d58b88b62847bdbd92079b4699350b4acab892f88a935db5707c790"}, - {file = "regex-2023.3.23-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:79e29fd62fa2f597a6754b247356bda14b866131a22444d67f907d6d341e10f3"}, - {file = "regex-2023.3.23-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c07ce8e9eee878a48ebeb32ee661b49504b85e164b05bebf25420705709fdd31"}, - {file = "regex-2023.3.23-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86b036f401895e854de9fefe061518e78d506d8a919cc250dc3416bca03f6f9a"}, - {file = "regex-2023.3.23-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78ac8dd8e18800bb1f97aad0d73f68916592dddf233b99d2b5cabc562088503a"}, - {file = "regex-2023.3.23-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:539dd010dc35af935b32f248099e38447bbffc10b59c2b542bceead2bed5c325"}, - {file = "regex-2023.3.23-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9bf4a5626f2a0ea006bf81e8963f498a57a47d58907eaa58f4b3e13be68759d8"}, - {file = "regex-2023.3.23-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:cf86b4328c204c3f315074a61bc1c06f8a75a8e102359f18ce99fbcbbf1951f0"}, - {file = "regex-2023.3.23-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:2848bf76673c83314068241c8d5b7fa9ad9bed866c979875a0e84039349e8fa7"}, - {file = "regex-2023.3.23-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:c125a02d22c555e68f7433bac8449992fa1cead525399f14e47c2d98f2f0e467"}, - {file = "regex-2023.3.23-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:cd1671e9d5ac05ce6aa86874dd8dfa048824d1dbe73060851b310c6c1a201a96"}, - {file = "regex-2023.3.23-cp38-cp38-win32.whl", hash = "sha256:fffe57312a358be6ec6baeb43d253c36e5790e436b7bf5b7a38df360363e88e9"}, - {file = "regex-2023.3.23-cp38-cp38-win_amd64.whl", hash = "sha256:dbb3f87e15d3dd76996d604af8678316ad2d7d20faa394e92d9394dfd621fd0c"}, - {file = "regex-2023.3.23-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c88e8c226473b5549fe9616980ea7ca09289246cfbdf469241edf4741a620004"}, - {file = "regex-2023.3.23-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6560776ec19c83f3645bbc5db64a7a5816c9d8fb7ed7201c5bcd269323d88072"}, - {file = "regex-2023.3.23-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b1fc2632c01f42e06173d8dd9bb2e74ab9b0afa1d698058c867288d2c7a31f3"}, - {file = "regex-2023.3.23-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fdf7ad455f1916b8ea5cdbc482d379f6daf93f3867b4232d14699867a5a13af7"}, - {file = "regex-2023.3.23-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5fc33b27b1d800fc5b78d7f7d0f287e35079ecabe68e83d46930cf45690e1c8c"}, - {file = "regex-2023.3.23-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c49552dc938e3588f63f8a78c86f3c9c75301e813bca0bef13bdb4b87ccf364"}, - {file = "regex-2023.3.23-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e152461e9a0aedec7d37fc66ec0fa635eca984777d3d3c3e36f53bf3d3ceb16e"}, - {file = "regex-2023.3.23-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:db034255e72d2995cf581b14bb3fc9c00bdbe6822b49fcd4eef79e1d5f232618"}, - {file = "regex-2023.3.23-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:55ae114da21b7a790b90255ea52d2aa3a0d121a646deb2d3c6a3194e722fc762"}, - {file = "regex-2023.3.23-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ef3f528fe1cc3d139508fe1b22523745aa77b9d6cb5b0bf277f48788ee0b993f"}, - {file = "regex-2023.3.23-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:a81c9ec59ca2303acd1ccd7b9ac409f1e478e40e96f8f79b943be476c5fdb8bb"}, - {file = "regex-2023.3.23-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:cde09c4fdd070772aa2596d97e942eb775a478b32459e042e1be71b739d08b77"}, - {file = "regex-2023.3.23-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3cd9f5dd7b821f141d3a6ca0d5d9359b9221e4f051ca3139320adea9f1679691"}, - {file = "regex-2023.3.23-cp39-cp39-win32.whl", hash = "sha256:7304863f3a652dab5e68e6fb1725d05ebab36ec0390676d1736e0571ebb713ef"}, - {file = "regex-2023.3.23-cp39-cp39-win_amd64.whl", hash = "sha256:54c3fa855a3f7438149de3211738dd9b5f0c733f48b54ae05aa7fce83d48d858"}, - {file = "regex-2023.3.23.tar.gz", hash = "sha256:dc80df325b43ffea5cdea2e3eaa97a44f3dd298262b1c7fe9dbb2a9522b956a7"}, + {file = "regex-2023.5.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:48c9ec56579d4ba1c88f42302194b8ae2350265cb60c64b7b9a88dcb7fbde309"}, + {file = "regex-2023.5.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02f4541550459c08fdd6f97aa4e24c6f1932eec780d58a2faa2068253df7d6ff"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:53e22e4460f0245b468ee645156a4f84d0fc35a12d9ba79bd7d79bdcd2f9629d"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b870b6f632fc74941cadc2a0f3064ed8409e6f8ee226cdfd2a85ae50473aa94"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:171c52e320fe29260da550d81c6b99f6f8402450dc7777ef5ced2e848f3b6f8f"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aad5524c2aedaf9aa14ef1bc9327f8abd915699dea457d339bebbe2f0d218f86"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a0f874ee8c0bc820e649c900243c6d1e6dc435b81da1492046716f14f1a2a96"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e645c757183ee0e13f0bbe56508598e2d9cd42b8abc6c0599d53b0d0b8dd1479"}, + {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:a4c5da39bca4f7979eefcbb36efea04471cd68db2d38fcbb4ee2c6d440699833"}, + {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5e3f4468b8c6fd2fd33c218bbd0a1559e6a6fcf185af8bb0cc43f3b5bfb7d636"}, + {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:59e4b729eae1a0919f9e4c0fc635fbcc9db59c74ad98d684f4877be3d2607dd6"}, + {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ba73a14e9c8f9ac409863543cde3290dba39098fc261f717dc337ea72d3ebad2"}, + {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0bbd5dcb19603ab8d2781fac60114fb89aee8494f4505ae7ad141a3314abb1f9"}, + {file = "regex-2023.5.5-cp310-cp310-win32.whl", hash = "sha256:40005cbd383438aecf715a7b47fe1e3dcbc889a36461ed416bdec07e0ef1db66"}, + {file = "regex-2023.5.5-cp310-cp310-win_amd64.whl", hash = "sha256:59597cd6315d3439ed4b074febe84a439c33928dd34396941b4d377692eca810"}, + {file = "regex-2023.5.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8f08276466fedb9e36e5193a96cb944928301152879ec20c2d723d1031cd4ddd"}, + {file = "regex-2023.5.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cd46f30e758629c3ee91713529cfbe107ac50d27110fdcc326a42ce2acf4dafc"}, + {file = "regex-2023.5.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2910502f718828cecc8beff004917dcf577fc5f8f5dd40ffb1ea7612124547b"}, + {file = "regex-2023.5.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:445d6f4fc3bd9fc2bf0416164454f90acab8858cd5a041403d7a11e3356980e8"}, + {file = "regex-2023.5.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18196c16a584619c7c1d843497c069955d7629ad4a3fdee240eb347f4a2c9dbe"}, + {file = "regex-2023.5.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33d430a23b661629661f1fe8395be2004006bc792bb9fc7c53911d661b69dd7e"}, + {file = "regex-2023.5.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:72a28979cc667e5f82ef433db009184e7ac277844eea0f7f4d254b789517941d"}, + {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f764e4dfafa288e2eba21231f455d209f4709436baeebb05bdecfb5d8ddc3d35"}, + {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:23d86ad2121b3c4fc78c58f95e19173790e22ac05996df69b84e12da5816cb17"}, + {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:690a17db524ee6ac4a27efc5406530dd90e7a7a69d8360235323d0e5dafb8f5b"}, + {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:1ecf3dcff71f0c0fe3e555201cbe749fa66aae8d18f80d2cc4de8e66df37390a"}, + {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:811040d7f3dd9c55eb0d8b00b5dcb7fd9ae1761c454f444fd9f37fe5ec57143a"}, + {file = "regex-2023.5.5-cp311-cp311-win32.whl", hash = "sha256:c8c143a65ce3ca42e54d8e6fcaf465b6b672ed1c6c90022794a802fb93105d22"}, + {file = "regex-2023.5.5-cp311-cp311-win_amd64.whl", hash = "sha256:586a011f77f8a2da4b888774174cd266e69e917a67ba072c7fc0e91878178a80"}, + {file = "regex-2023.5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:b6365703e8cf1644b82104cdd05270d1a9f043119a168d66c55684b1b557d008"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a56c18f21ac98209da9c54ae3ebb3b6f6e772038681d6cb43b8d53da3b09ee81"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8b942d8b3ce765dbc3b1dad0a944712a89b5de290ce8f72681e22b3c55f3cc8"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:844671c9c1150fcdac46d43198364034b961bd520f2c4fdaabfc7c7d7138a2dd"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c2ce65bdeaf0a386bb3b533a28de3994e8e13b464ac15e1e67e4603dd88787fa"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fee0016cc35a8a91e8cc9312ab26a6fe638d484131a7afa79e1ce6165328a135"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:18f05d14f14a812fe9723f13afafefe6b74ca042d99f8884e62dbd34dcccf3e2"}, + {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:941b3f1b2392f0bcd6abf1bc7a322787d6db4e7457be6d1ffd3a693426a755f2"}, + {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:921473a93bcea4d00295799ab929522fc650e85c6b9f27ae1e6bb32a790ea7d3"}, + {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:e2205a81f815b5bb17e46e74cc946c575b484e5f0acfcb805fb252d67e22938d"}, + {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:385992d5ecf1a93cb85adff2f73e0402dd9ac29b71b7006d342cc920816e6f32"}, + {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:890a09cb0a62198bff92eda98b2b507305dd3abf974778bae3287f98b48907d3"}, + {file = "regex-2023.5.5-cp36-cp36m-win32.whl", hash = "sha256:821a88b878b6589c5068f4cc2cfeb2c64e343a196bc9d7ac68ea8c2a776acd46"}, + {file = "regex-2023.5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:7918a1b83dd70dc04ab5ed24c78ae833ae8ea228cef84e08597c408286edc926"}, + {file = "regex-2023.5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:338994d3d4ca4cf12f09822e025731a5bdd3a37aaa571fa52659e85ca793fb67"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a69cf0c00c4d4a929c6c7717fd918414cab0d6132a49a6d8fc3ded1988ed2ea"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8f5e06df94fff8c4c85f98c6487f6636848e1dc85ce17ab7d1931df4a081f657"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8906669b03c63266b6a7693d1f487b02647beb12adea20f8840c1a087e2dfb5"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fda3e50abad8d0f48df621cf75adc73c63f7243cbe0e3b2171392b445401550"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ac2b7d341dc1bd102be849d6dd33b09701223a851105b2754339e390be0627a"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:fb2b495dd94b02de8215625948132cc2ea360ae84fe6634cd19b6567709c8ae2"}, + {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:aa7d032c1d84726aa9edeb6accf079b4caa87151ca9fabacef31fa028186c66d"}, + {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:3d45864693351c15531f7e76f545ec35000d50848daa833cead96edae1665559"}, + {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21e90a288e6ba4bf44c25c6a946cb9b0f00b73044d74308b5e0afd190338297c"}, + {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:10250a093741ec7bf74bcd2039e697f519b028518f605ff2aa7ac1e9c9f97423"}, + {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6b8d0c153f07a953636b9cdb3011b733cadd4178123ef728ccc4d5969e67f3c2"}, + {file = "regex-2023.5.5-cp37-cp37m-win32.whl", hash = "sha256:10374c84ee58c44575b667310d5bbfa89fb2e64e52349720a0182c0017512f6c"}, + {file = "regex-2023.5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:9b320677521aabf666cdd6e99baee4fb5ac3996349c3b7f8e7c4eee1c00dfe3a"}, + {file = "regex-2023.5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:afb1c70ec1e594a547f38ad6bf5e3d60304ce7539e677c1429eebab115bce56e"}, + {file = "regex-2023.5.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cf123225945aa58b3057d0fba67e8061c62d14cc8a4202630f8057df70189051"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a99757ad7fe5c8a2bb44829fc57ced11253e10f462233c1255fe03888e06bc19"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a623564d810e7a953ff1357f7799c14bc9beeab699aacc8b7ab7822da1e952b8"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ced02e3bd55e16e89c08bbc8128cff0884d96e7f7a5633d3dc366b6d95fcd1d6"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1cbe6b5be3b9b698d8cc4ee4dee7e017ad655e83361cd0ea8e653d65e469468"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a6e4b0e0531223f53bad07ddf733af490ba2b8367f62342b92b39b29f72735a"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2e9c4f778514a560a9c9aa8e5538bee759b55f6c1dcd35613ad72523fd9175b8"}, + {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:256f7f4c6ba145f62f7a441a003c94b8b1af78cee2cccacfc1e835f93bc09426"}, + {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:bd7b68fd2e79d59d86dcbc1ccd6e2ca09c505343445daaa4e07f43c8a9cc34da"}, + {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4a5059bd585e9e9504ef9c07e4bc15b0a621ba20504388875d66b8b30a5c4d18"}, + {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:6893544e06bae009916a5658ce7207e26ed17385149f35a3125f5259951f1bbe"}, + {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c64d5abe91a3dfe5ff250c6bb267ef00dbc01501518225b45a5f9def458f31fb"}, + {file = "regex-2023.5.5-cp38-cp38-win32.whl", hash = "sha256:7923470d6056a9590247ff729c05e8e0f06bbd4efa6569c916943cb2d9b68b91"}, + {file = "regex-2023.5.5-cp38-cp38-win_amd64.whl", hash = "sha256:4035d6945cb961c90c3e1c1ca2feb526175bcfed44dfb1cc77db4fdced060d3e"}, + {file = "regex-2023.5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:50fd2d9b36938d4dcecbd684777dd12a407add4f9f934f235c66372e630772b0"}, + {file = "regex-2023.5.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d19e57f888b00cd04fc38f5e18d0efbd91ccba2d45039453ab2236e6eec48d4d"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd966475e963122ee0a7118ec9024388c602d12ac72860f6eea119a3928be053"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db09e6c18977a33fea26fe67b7a842f706c67cf8bda1450974d0ae0dd63570df"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6164d4e2a82f9ebd7752a06bd6c504791bedc6418c0196cd0a23afb7f3e12b2d"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84397d3f750d153ebd7f958efaa92b45fea170200e2df5e0e1fd4d85b7e3f58a"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9c3efee9bb53cbe7b285760c81f28ac80dc15fa48b5fe7e58b52752e642553f1"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:144b5b017646b5a9392a5554a1e5db0000ae637be4971c9747566775fc96e1b2"}, + {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:1189fbbb21e2c117fda5303653b61905aeeeea23de4a94d400b0487eb16d2d60"}, + {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f83fe9e10f9d0b6cf580564d4d23845b9d692e4c91bd8be57733958e4c602956"}, + {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:72aa4746993a28c841e05889f3f1b1e5d14df8d3daa157d6001a34c98102b393"}, + {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:de2f780c3242ea114dd01f84848655356af4dd561501896c751d7b885ea6d3a1"}, + {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:290fd35219486dfbc00b0de72f455ecdd63e59b528991a6aec9fdfc0ce85672e"}, + {file = "regex-2023.5.5-cp39-cp39-win32.whl", hash = "sha256:732176f5427e72fa2325b05c58ad0b45af341c459910d766f814b0584ac1f9ac"}, + {file = "regex-2023.5.5-cp39-cp39-win_amd64.whl", hash = "sha256:1307aa4daa1cbb23823d8238e1f61292fd07e4e5d8d38a6efff00b67a7cdb764"}, + {file = "regex-2023.5.5.tar.gz", hash = "sha256:7d76a8a1fc9da08296462a18f16620ba73bcbf5909e42383b253ef34d9d5141e"}, ] [[package]] name = "requests" -version = "2.28.2" +version = "2.30.0" description = "Python HTTP for Humans." -category = "dev" optional = false -python-versions = ">=3.7, <4" +python-versions = ">=3.7" files = [ - {file = "requests-2.28.2-py3-none-any.whl", hash = "sha256:64299f4909223da747622c030b781c0d7811e359c37124b4bd368fb8c6518baa"}, - {file = "requests-2.28.2.tar.gz", hash = "sha256:98b1b2782e3c6c4904938b84c0eb932721069dfdb9134313beff7c83c2df24bf"}, + {file = "requests-2.30.0-py3-none-any.whl", hash = "sha256:10e94cc4f3121ee6da529d358cdaeaff2f1c409cd377dbc72b825852f2f7e294"}, + {file = "requests-2.30.0.tar.gz", hash = "sha256:239d7d4458afcb28a692cdd298d87542235f4ca8d36d03a15bfc128a6559a2f4"}, ] [package.dependencies] certifi = ">=2017.4.17" charset-normalizer = ">=2,<4" idna = ">=2.5,<4" -urllib3 = ">=1.21.1,<1.27" +urllib3 = ">=1.21.1,<3" [package.extras] socks = ["PySocks (>=1.5.6,!=1.5.7)"] @@ -1156,14 +1315,13 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "requests-toolbelt" -version = "0.10.1" +version = "1.0.0" description = "A utility belt for advanced users of python-requests" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ - {file = "requests-toolbelt-0.10.1.tar.gz", hash = "sha256:62e09f7ff5ccbda92772a29f394a49c3ad6cb181d568b1337626b2abb628a63d"}, - {file = "requests_toolbelt-0.10.1-py2.py3-none-any.whl", hash = "sha256:18565aa58116d9951ac39baa288d3adb5b3ff975c4f25eee78555d89e8f247f7"}, + {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, + {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, ] [package.dependencies] @@ -1173,7 +1331,6 @@ requests = ">=2.0.1,<3.0.0" name = "rfc3986" version = "2.0.0" description = "Validating URI References per RFC 3986" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1186,14 +1343,13 @@ idna2008 = ["idna"] [[package]] name = "rich" -version = "13.3.2" +version = "13.3.5" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" -category = "dev" optional = false python-versions = ">=3.7.0" files = [ - {file = "rich-13.3.2-py3-none-any.whl", hash = "sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f"}, - {file = "rich-13.3.2.tar.gz", hash = "sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001"}, + {file = "rich-13.3.5-py3-none-any.whl", hash = "sha256:69cdf53799e63f38b95b9bf9c875f8c90e78dd62b2f00c13a911c7a3b9fa4704"}, + {file = "rich-13.3.5.tar.gz", hash = "sha256:2d11b9b8dd03868f09b4fffadc84a6a8cda574e40dc90821bd845720ebb8e89c"}, ] [package.dependencies] @@ -1204,11 +1360,34 @@ typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.9 [package.extras] jupyter = ["ipywidgets (>=7.5.1,<9)"] +[[package]] +name = "scikit-build" +version = "0.17.6" +description = "Improved build system generator for Python C/C++/Fortran/Cython extensions" +optional = false +python-versions = ">=3.7" +files = [ + {file = "scikit_build-0.17.6-py3-none-any.whl", hash = "sha256:18bd55e81841106eec93f30a297df4f301003791c41be46ef6428d58bd42d6b3"}, + {file = "scikit_build-0.17.6.tar.gz", hash = "sha256:b51a51a36b37c42650994b5047912f59b22e3210b23e321f287611f9ef6e5c9d"}, +] + +[package.dependencies] +distro = "*" +packaging = "*" +setuptools = ">=42.0.0" +tomli = {version = "*", markers = "python_version < \"3.11\""} +wheel = ">=0.32.0" + +[package.extras] +cov = ["coverage[toml] (>=4.2)", "pytest-cov (>=2.7.1)"] +docs = ["pygments", "sphinx (>=4)", "sphinx-issues", "sphinx-rtd-theme (>=1.0)", "sphinxcontrib-moderncmakedomain (>=3.19)"] +doctest = ["ubelt (>=0.8.2)", "xdoctest (>=0.10.0)"] +test = ["build (>=0.7)", "cython (>=0.25.1)", "importlib-metadata", "pytest (>=6.0.0)", "pytest-mock (>=1.10.4)", "requests", "virtualenv"] + [[package]] name = "secretstorage" version = "3.3.3" description = "Python bindings to FreeDesktop.org Secret Service API" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -1220,11 +1399,26 @@ files = [ cryptography = ">=2.0" jeepney = ">=0.6" +[[package]] +name = "setuptools" +version = "67.7.2" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "setuptools-67.7.2-py3-none-any.whl", hash = "sha256:23aaf86b85ca52ceb801d32703f12d77517b2556af839621c641fca11287952b"}, + {file = "setuptools-67.7.2.tar.gz", hash = "sha256:f104fa03692a2602fa0fec6c6a9e63b6c8a968de13e17c026957dd1f53d80990"}, +] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] + [[package]] name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -1232,11 +1426,53 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "sniffio" +version = "1.3.0" +description = "Sniff out which async library your code is running under" +optional = false +python-versions = ">=3.7" +files = [ + {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, + {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, +] + +[[package]] +name = "sse-starlette" +version = "1.6.1" +description = "\"SSE plugin for Starlette\"" +optional = true +python-versions = ">=3.8" +files = [ + {file = "sse-starlette-1.6.1.tar.gz", hash = "sha256:6208af2bd7d0887c92f1379da14bd1f4db56bd1274cc5d36670c683d2aa1de6a"}, + {file = "sse_starlette-1.6.1-py3-none-any.whl", hash = "sha256:d8f18f1c633e355afe61cc5e9c92eea85badcb8b2d56ec8cfb0a006994aa55da"}, +] + +[package.dependencies] +starlette = "*" + +[[package]] +name = "starlette" +version = "0.27.0" +description = "The little ASGI library that shines." +optional = true +python-versions = ">=3.7" +files = [ + {file = "starlette-0.27.0-py3-none-any.whl", hash = "sha256:918416370e846586541235ccd38a474c08b80443ed31c578a418e2209b3eef91"}, + {file = "starlette-0.27.0.tar.gz", hash = "sha256:6a6b0d042acb8d469a01eba54e9cda6cbd24ac602c4cd016723117d6a7e73b75"}, +] + +[package.dependencies] +anyio = ">=3.4.0,<5" +typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\""} + +[package.extras] +full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyaml"] + [[package]] name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1248,7 +1484,6 @@ files = [ name = "twine" version = "4.0.2" description = "Collection of utilities for publishing packages on PyPI" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1269,38 +1504,54 @@ urllib3 = ">=1.26.0" [[package]] name = "typing-extensions" -version = "4.5.0" +version = "4.7.1" description = "Backported and Experimental Type Hints for Python 3.7+" -category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "typing_extensions-4.5.0-py3-none-any.whl", hash = "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"}, - {file = "typing_extensions-4.5.0.tar.gz", hash = "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb"}, + {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, + {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, ] [[package]] name = "urllib3" -version = "1.26.15" +version = "2.0.2" description = "HTTP library with thread-safe connection pooling, file post, and more." -category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +python-versions = ">=3.7" files = [ - {file = "urllib3-1.26.15-py2.py3-none-any.whl", hash = "sha256:aa751d169e23c7479ce47a0cb0da579e3ede798f994f5816a74e4f4500dcea42"}, - {file = "urllib3-1.26.15.tar.gz", hash = "sha256:8a388717b9476f934a21484e8c8e61875ab60644d29b9b39e11e4b9dc1c6b305"}, + {file = "urllib3-2.0.2-py3-none-any.whl", hash = "sha256:d055c2f9d38dc53c808f6fdc8eab7360b6fdbbde02340ed25cfbcd817c62469e"}, + {file = "urllib3-2.0.2.tar.gz", hash = "sha256:61717a1095d7e155cdb737ac7bb2f4324a858a1e2e6466f6d03ff630ca68d3cc"}, ] [package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] -secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] -socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + +[[package]] +name = "uvicorn" +version = "0.22.0" +description = "The lightning-fast ASGI server." +optional = true +python-versions = ">=3.7" +files = [ + {file = "uvicorn-0.22.0-py3-none-any.whl", hash = "sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996"}, + {file = "uvicorn-0.22.0.tar.gz", hash = "sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8"}, +] + +[package.dependencies] +click = ">=7.0" +h11 = ">=0.8" + +[package.extras] +standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"] [[package]] name = "watchdog" version = "3.0.0" description = "Filesystem events monitoring" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1340,7 +1591,6 @@ watchmedo = ["PyYAML (>=3.10)"] name = "webencodings" version = "0.5.1" description = "Character encoding aliases for legacy web content" -category = "dev" optional = false python-versions = "*" files = [ @@ -1348,11 +1598,24 @@ files = [ {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, ] +[[package]] +name = "wheel" +version = "0.40.0" +description = "A built-package format for Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "wheel-0.40.0-py3-none-any.whl", hash = "sha256:d236b20e7cb522daf2390fa84c55eea81c5c30190f90f29ae2ca1ad8355bf247"}, + {file = "wheel-0.40.0.tar.gz", hash = "sha256:cd1196f3faee2b31968d626e1731c94f99cbdb67cf5a46e4f5656cbee7738873"}, +] + +[package.extras] +test = ["pytest (>=6.0.0)"] + [[package]] name = "zipp" version = "3.15.0" description = "Backport of pathlib-compatible object wrapper for zip files" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1364,7 +1627,10 @@ files = [ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] +[extras] +server = ["fastapi", "sse-starlette", "uvicorn"] + [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "cc9babcdfdc3679a4d84f68912408a005619a576947b059146ed1b428850ece9" +content-hash = "da42c48a426b64ce393b4febca1be0e2ea0fe9d48cedb2392b390d4a49276474" diff --git a/poetry.toml b/poetry.toml new file mode 100644 index 0000000..be97f1e --- /dev/null +++ b/poetry.toml @@ -0,0 +1,3 @@ +[virtualenvs] +in-project = true +prefer-active-python = true \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 89c8271..841a868 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.30" +version = "0.1.68" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" @@ -14,16 +14,25 @@ include = [ [tool.poetry.dependencies] python = "^3.8.1" -typing-extensions = "^4.5.0" - +typing-extensions = "^4.7.1" +numpy = "^1.24.4" +diskcache = "^5.6.1" +uvicorn = { version = "^0.22.0", optional = true } +fastapi = { version = "^0.99.1", optional = true } +sse-starlette = { version = "^1.6.1", optional = true } [tool.poetry.group.dev.dependencies] -black = "^23.1.0" +black = "^23.3.0" twine = "^4.0.2" -mkdocs = "^1.4.2" -mkdocstrings = {extras = ["python"], version = "^0.20.0"} -mkdocs-material = "^9.1.4" -pytest = "^7.2.2" +mkdocs = "^1.4.3" +mkdocstrings = {extras = ["python"], version = "^0.22.0"} +mkdocs-material = "^9.1.18" +pytest = "^7.4.0" +httpx = "^0.24.1" +scikit-build = "0.17.6" + +[tool.poetry.extras] +server = ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"] [build-system] requires = [ @@ -32,4 +41,4 @@ requires = [ "cmake>=3.18", "ninja", ] -build-backend = "setuptools.build_meta" +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/setup.py b/setup.py index 33c2e26..1d7ecbc 100644 --- a/setup.py +++ b/setup.py @@ -10,17 +10,15 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.30", + version="0.1.68", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"}, packages=["llama_cpp", "llama_cpp.server"], - install_requires=[ - "typing-extensions>=4.5.0", - ], + install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], extras_require={ - "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], + "server": ["uvicorn>=0.22.1", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"], }, python_requires=">=3.7", classifiers=[ diff --git a/tests/test_llama.py b/tests/test_llama.py index 6a50256..941287d 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -14,14 +14,22 @@ def test_llama(): assert llama.detokenize(llama.tokenize(text)) == text +# @pytest.mark.skip(reason="need to update sample mocking") def test_llama_patch(monkeypatch): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) + n_vocab = llama_cpp.llama_n_vocab(llama.ctx) ## Set up mock function def mock_eval(*args, **kwargs): return 0 + def mock_get_logits(*args, **kwargs): + return (llama_cpp.c_float * n_vocab)( + *[llama_cpp.c_float(0) for _ in range(n_vocab)] + ) + monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval) + monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits) output_text = " jumps over the lazy dog." output_tokens = llama.tokenize(output_text.encode("utf-8")) @@ -36,7 +44,7 @@ def test_llama_patch(monkeypatch): else: return token_eos - monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_top_p_top_k", mock_sample) + monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_token", mock_sample) text = "The quick brown fox" @@ -82,6 +90,7 @@ def test_llama_patch(monkeypatch): def test_llama_pickle(): import pickle import tempfile + fp = tempfile.TemporaryFile() llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) pickle.dump(llama, fp) @@ -93,4 +102,70 @@ def test_llama_pickle(): text = b"Hello World" - assert llama.detokenize(llama.tokenize(text)) == text \ No newline at end of file + assert llama.detokenize(llama.tokenize(text)) == text + + +def test_utf8(monkeypatch): + llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) + n_vocab = llama_cpp.llama_n_vocab(llama.ctx) + + ## Set up mock function + def mock_eval(*args, **kwargs): + return 0 + + def mock_get_logits(*args, **kwargs): + return (llama_cpp.c_float * n_vocab)( + *[llama_cpp.c_float(0) for _ in range(n_vocab)] + ) + + monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval) + monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits) + + output_text = "😀" + output_tokens = llama.tokenize(output_text.encode("utf-8")) + token_eos = llama.token_eos() + n = 0 + + def mock_sample(*args, **kwargs): + nonlocal n + if n < len(output_tokens): + n += 1 + return output_tokens[n - 1] + else: + return token_eos + + monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_token", mock_sample) + + ## Test basic completion with utf8 multibyte + n = 0 # reset + completion = llama.create_completion("", max_tokens=4) + assert completion["choices"][0]["text"] == output_text + + ## Test basic completion with incomplete utf8 multibyte + n = 0 # reset + completion = llama.create_completion("", max_tokens=1) + assert completion["choices"][0]["text"] == "" + + +def test_llama_server(): + from fastapi.testclient import TestClient + from llama_cpp.server.app import create_app, Settings + + settings = Settings( + model=MODEL, + vocab_only=True, + ) + app = create_app(settings) + client = TestClient(app) + response = client.get("/v1/models") + assert response.json() == { + "object": "list", + "data": [ + { + "id": MODEL, + "object": "model", + "owned_by": "me", + "permissions": [], + } + ], + } diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 180b693..061f5f8 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 180b693a47b6b825288ef9f2c39d24b6eea4eea6 +Subproject commit 061f5f8d2109bb7adcbd40f1b456d887c5a1df25