Merge branch 'main' into add_unlimited_max_tokens

Commit 5d756de314, authored by Andrei on 2023-07-08 02:37:38 -04:00, committed by GitHub (GPG key ID: 4AEE18F83AFDEB23).
50 changed files with 5094 additions and 1205 deletions.

.dockerignore (new file)

@@ -0,0 +1,166 @@
_skbuild/
.envrc
models/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

.github/ISSUE_TEMPLATE/bug_report.md (new file)

@@ -0,0 +1,96 @@
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''
---
# Prerequisites
Please answer the following questions for yourself before submitting an issue.
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/abetlen/llama-cpp-python/blob/main/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/abetlen/llama-cpp-python/discussions), and have a new bug or useful enhancement to share.
# Expected Behavior
Please provide a detailed written description of what you were trying to do, and what you expected `llama-cpp-python` to do.
# Current Behavior
Please provide a detailed written description of what `llama-cpp-python` did, instead.
# Environment and Context
Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
* Physical (or virtual) hardware you are using, e.g. for Linux:
`$ lscpu`
* Operating System, e.g. for Linux:
`$ uname -a`
* SDK version, e.g. for Linux:
```
$ python3 --version
$ make --version
$ g++ --version
```
# Failure Information (for bugs)
Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.
# Steps to Reproduce
Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better.
1. step 1
2. step 2
3. step 3
4. etc.
**Note: Many issues seem to be regarding functional or performance issues / differences with `llama.cpp`. In these cases we need to confirm that you're comparing against the version of `llama.cpp` that was built with your python package, and which parameters you're passing to the context.**
Try the following:
1. `git clone https://github.com/abetlen/llama-cpp-python`
2. `cd llama-cpp-python`
3. `rm -rf _skbuild/` # delete any old builds
4. `python setup.py develop`
5. `cd ./vendor/llama.cpp`
6. Follow [llama.cpp's instructions](https://github.com/ggerganov/llama.cpp#build) to `cmake` llama.cpp
7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggerganov/llama.cpp/issues)
# Failure Logs
Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes.
Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability.
Example environment info:
```
llama-cpp-python$ git log | head -1
commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2
llama-cpp-python$ python3 --version
Python 3.10.10
llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette|numpy"
fastapi 0.95.0
numpy 1.24.3
sse-starlette 1.3.3
uvicorn 0.21.1
llama-cpp-python/vendor/llama.cpp$ git log | head -3
commit 66874d4fbcc7866377246efbcee938e8cc9c7d76
Author: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Thu May 25 20:18:01 2023 -0600
```

@@ -0,0 +1,20 @@
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context or screenshots about the feature request here.

.github/dependabot.yml (new file)

@@ -0,0 +1,11 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/" # Location of package manifests
schedule:
interval: "weekly"

.github/workflows/build-docker.yaml (new file)

@@ -0,0 +1,39 @@
name: Build Docker
on: workflow_dispatch
permissions:
contents: write
packages: write
jobs:
docker:
name: Build and push Docker image
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: "true"
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- name: Login to GitHub Container Registry
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build and push
uses: docker/build-push-action@v4
with:
context: .
push: true # push to registry
pull: true # always fetch the latest base images
platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64
tags: ghcr.io/abetlen/llama-cpp-python:latest

.github/workflows/test-pypi.yaml (new file)

@@ -0,0 +1,64 @@
name: Tests for PyPI package
on: workflow_dispatch
jobs:
build-linux:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
steps:
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python3 -m pip install --upgrade pip
python3 -m pip install --verbose llama-cpp-python[server,test]
- name: Test with pytest
run: |
python3 -c "import llama_cpp"
build-windows:
runs-on: windows-latest
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
steps:
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python3 -m pip install --upgrade pip
python3 -m pip install --verbose llama-cpp-python[server,test]
- name: Test with pytest
run: |
python3 -c "import llama_cpp"
build-macos:
runs-on: macos-latest
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
steps:
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python3 -m pip install --upgrade pip
python3 -m pip install --verbose llama-cpp-python[server,test]
- name: Test with pytest
run: |
python3 -c "import llama_cpp"

@@ -26,7 +26,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
-python -m pip install --upgrade pip pytest cmake scikit-build setuptools
+python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings
pip install . -v
- name: Test with pytest
run: |
@@ -49,7 +49,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
-python -m pip install --upgrade pip pytest cmake scikit-build setuptools
+python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings
pip install . -v
- name: Test with pytest
run: |
@@ -72,7 +72,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
-python -m pip install --upgrade pip pytest cmake scikit-build setuptools
+python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings
pip install . -v
- name: Test with pytest
run: |

.gitignore

@@ -1,3 +1,5 @@
+.vscode/
_skbuild/
.envrc
@@ -11,6 +13,10 @@ __pycache__/
# C extensions
*.so
+*.dylib
+*.metal
+*.dll
+*.lib
# Distribution / packaging
.Python
@@ -164,3 +170,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
+# downloaded model .bin files
+docker/open_llama/*.bin

.gitmodules

@@ -1,3 +1,3 @@
[submodule "vendor/llama.cpp"]
path = vendor/llama.cpp
-url = git@github.com:ggerganov/llama.cpp.git
+url = https://github.com/ggerganov/llama.cpp.git

.readthedocs.yaml (new file)

@@ -0,0 +1,24 @@
# Read the Docs configuration file for MkDocs projects
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Set the version of Python and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.11"
mkdocs:
configuration: mkdocs.yml
python:
install:
- method: pip
path: .
- requirements: docs/requirements.txt
submodules:
include: all
recursive: true

CHANGELOG.md (new file)

@@ -0,0 +1,117 @@
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Added
- (server) Streaming requests can now be interrupted prematurely when a concurrent request is made. This can be controlled with the `interrupt_requests` setting.
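For reference only (not part of the changelog itself), here is a minimal sketch of how this setting might be applied when constructing the server app programmatically; the `create_app(settings=...)` factory signature and the model path are assumptions here:
```python
import uvicorn
from llama_cpp.server.app import Settings, create_app  # assumed import path

# Assumed usage: keep serving an in-flight streaming completion instead of
# letting a newer concurrent request interrupt it.
settings = Settings(model="./models/7B/ggml-model.bin", interrupt_requests=False)
app = create_app(settings=settings)

if __name__ == "__main__":
    uvicorn.run(app, host="localhost", port=8000)
```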
## [0.1.68]
### Added
- (llama.cpp) Update llama.cpp
## [0.1.67]
### Fixed
- Fix performance bug in Llama model by pre-allocating memory for tokens and logits.
- Fix bug in Llama model where the model was not freed after use.
## [0.1.66]
### Added
- (llama.cpp) New model API
### Fixed
- Performance issue during eval caused by looped np.concatenate call
- State pickling issue when saving cache to disk
## [0.1.65]
### Added
- (llama.cpp) Fix struct misalignment bug
## [0.1.64]
### Added
- (llama.cpp) Update llama.cpp
- Fix docs for seed. Set -1 for random.
## [0.1.63]
### Added
- (llama.cpp) Add full gpu utilisation in CUDA
- (llama.cpp) Add get_vocab
- (llama.cpp) Add low_vram parameter
- (server) Add logit_bias parameter
## [0.1.62]
### Fixed
- Metal support working
- Cache re-enabled
## [0.1.61]
### Fixed
- Fix broken pip installation
## [0.1.60]
### NOTE
- This release was deleted due to a bug with the packaging system that caused pip installations to fail.
### Fixed
- Truncate max_tokens in create_completion so the requested tokens don't exceed the context size.
- Temporarily disable cache for completion requests
## [v0.1.59]
### Added
- (llama.cpp) k-quants support
- (server) mirostat sampling parameters to server
### Fixed
- Support both `.so` and `.dylib` for `libllama` on MacOS
## [v0.1.58]
### Added
- (llama.cpp) Metal support for Apple Silicon
## [v0.1.57]
### Added
- (llama.cpp) OpenLlama 3B support
## [v0.1.56]
### Added
- (misc) Added first version of the changelog
- (server) Use async routes
- (python-api) Use numpy for internal buffers to reduce memory usage and improve performance.
### Fixed
- (python-api) Performance bug in stop sequence check slowing down streaming.

@@ -2,7 +2,11 @@ cmake_minimum_required(VERSION 3.4...3.22)
project(llama_cpp)
-if (UNIX)
+option(FORCE_CMAKE "Force CMake build of Python bindings" OFF)
+set(FORCE_CMAKE $ENV{FORCE_CMAKE})
+if (UNIX AND NOT FORCE_CMAKE)
add_custom_command(
OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so
COMMAND make libllama.so
@@ -23,5 +27,8 @@ else()
TARGETS llama
LIBRARY DESTINATION llama_cpp
RUNTIME DESTINATION llama_cpp
+ARCHIVE DESTINATION llama_cpp
+FRAMEWORK DESTINATION llama_cpp
+RESOURCE DESTINATION llama_cpp
)
-endif(UNIX)
+endif()

Makefile (new file)

@@ -0,0 +1,66 @@
update:
poetry install
git submodule update --init --recursive
update.vendor:
cd vendor/llama.cpp && git pull origin master
build:
python3 setup.py develop
build.cuda:
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 setup.py develop
build.opencl:
CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 setup.py develop
build.openblas:
CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 setup.py develop
build.blis:
CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop
build.metal:
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 python3 setup.py develop
build.sdist:
python3 setup.py sdist
deploy.pypi:
python3 -m twine upload dist/*
deploy.gh-docs:
mkdocs build
mkdocs gh-deploy
test:
python3 -m pytest
docker:
docker build -t llama-cpp-python:latest -f docker/simple/Dockerfile .
run-server:
uvicorn --factory llama_cpp.server.app:create_app --host ${HOST} --port ${PORT}
clean:
- cd vendor/llama.cpp && make clean
- cd vendor/llama.cpp && rm libllama.so
- rm -rf _skbuild
- rm llama_cpp/*.so
- rm llama_cpp/*.dylib
- rm llama_cpp/*.metal
- rm llama_cpp/*.dll
- rm llama_cpp/*.lib
.PHONY: \
update \
update.vendor \
build \
build.cuda \
build.opencl \
build.openblas \
build.sdist \
deploy.pypi \
deploy.gh-docs \
docker \
clean

README.md

@@ -1,6 +1,6 @@
# 🦙 Python Bindings for `llama.cpp`
-[![Documentation](https://img.shields.io/badge/docs-passing-green.svg)](https://abetlen.github.io/llama-cpp-python)
+[![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest)
[![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml)
[![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
@@ -15,16 +15,70 @@ This package provides:
- OpenAI-like API
- LangChain compatibility
-## Installation
-Install from PyPI:
+Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest).
## Installation from PyPI (recommended)
Install from PyPI (requires a C compiler):
```bash
pip install llama-cpp-python
```
The above command will attempt to install the package and build `llama.cpp` from source.
This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system.
If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly:
```bash
pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir
```
Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example:
```
wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
bash Miniforge3-MacOSX-arm64.sh
```
Otherwise, while installing, it will build the llama.cpp x86 version, which will be 10x slower on an Apple Silicon (M1) Mac.
### Installation with OpenBLAS / cuBLAS / CLBlast / Metal
`llama.cpp` supports multiple BLAS backends for faster processing.
Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend.
To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing:
```bash
CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
```
To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing:
```bash
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
```
To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing:
```bash
CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python
```
To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable before installing:
```bash
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python
```
Detailed MacOS Metal GPU install documentation is available at [docs/install/macos.md](docs/install/macos.md)
## High-level API
The high-level API provides a simple managed interface through the `Llama` class.
Below is a short example demonstrating how to use the high-level API to generate text:
```python
>>> from llama_cpp import Llama
>>> llm = Llama(model_path="./models/7B/ggml-model.bin")
@@ -51,6 +105,15 @@ pip install llama-cpp-python
}
```
### Adjusting the Context Window
The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements.
For instance, if you want to work with larger contexts, you can expand the context window by setting the n_ctx parameter when initializing the Llama object:
```python
llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
```
## Web Server
`llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API.
@@ -60,16 +123,40 @@ To install the server package and get started:
```bash
pip install llama-cpp-python[server]
-export MODEL=./models/7B/ggml-model.bin
-python3 -m llama_cpp.server
+python3 -m llama_cpp.server --model models/7B/ggml-model.bin
```
Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation.
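As an illustration (not part of the repository README), here is a minimal sketch of querying the server's OpenAI-compatible completions endpoint from Python once it is running locally; the prompt and stop sequences mirror the example used elsewhere in this repository, and the `requests` package is assumed to be installed:
```python
import requests

# Query the local llama-cpp-python server's OpenAI-compatible endpoint.
response = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
        "stop": ["\n", "###"],
    },
)
print(response.json()["choices"][0]["text"])
```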
## Docker image
A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:
```bash
docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
```
## Low-level API
-The low-level API is a direct `ctypes` binding to the C API provided by `llama.cpp`.
+The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`.
-The entire API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and should mirror [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h).
+The entire low-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h).
Below is a short example demonstrating how to use the low-level API to tokenize a prompt:
```python
>>> import llama_cpp
>>> import ctypes
>>> params = llama_cpp.llama_context_default_params()
# use bytes for char * params
>>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params)
>>> max_tokens = params.n_ctx
# use ctypes arrays for array params
>>> tokens = (llama_cpp.llama_token * int(max_tokens))()
>>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True))
>>> llama_cpp.llama_free(ctx)
```
Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.
# Documentation
@@ -84,8 +171,19 @@ This package is under active development and I welcome any contributions.
To get started, clone the repository and install the package in development mode:
```bash
-git clone git@github.com:abetlen/llama-cpp-python.git
-git submodule update --init --recursive
+git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git
+cd llama-cpp-python
# Install with pip
pip install -e .
# if you want to use the fastapi / openapi server
pip install -e .[server]
# If you're a poetry user, installing will also include a virtual environment
poetry install --all-extras
. .venv/bin/activate
# Will need to be re-run any time vendor/llama.cpp is updated
python3 setup.py develop
```

docker/README.md (new file)

@@ -0,0 +1,66 @@
# Install Docker Server
**Note #1:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
[Install Docker Engine](https://docs.docker.com/engine/install)
**Note #2:** NVidia GPU CuBLAS support requires an NVidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)).
# Simple Dockerfiles for building the llama-cpp-python server with external model bin files
## openblas_simple - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image
```
cd ./openblas_simple
docker build -t openblas_simple .
docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t openblas_simple
```
where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
## cuda_simple - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image
```
cd ./cuda_simple
docker build -t cuda_simple .
docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t cuda_simple
```
where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
# "Open-Llama-in-a-box"
## Download an Apache V2.0 licensed 3B parameter Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server
```
$ cd ./open_llama
./build.sh
./start.sh
```
# Manually choose your own Llama model from Hugging Face
`python3 ./hug_model.py -a TheBloke -t llama`
You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
```
docker $ ls -lh *.bin
-rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>q5_1.bin
lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> <downloaded-model-file>q5_1.bin
```
**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least
**TWICE** as much disk space as the size of the model:
| Model | Quantized size |
|------:|----------------:|
| 3B | 3 GB |
| 7B | 5 GB |
| 13B | 10 GB |
| 33B | 25 GB |
| 65B | 50 GB |
**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
## Use OpenBLAS
Use this if you don't have an NVidia GPU. Defaults to the `python:3-slim-bullseye` Docker base image and OpenBLAS:
### Build:
`docker build -t openblas .`
### Run:
`docker run --cap-add SYS_RESOURCE -t openblas`
## Use CuBLAS
### Build:
`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .`
### Run:
`docker run --cap-add SYS_RESOURCE -t cublas`

@@ -0,0 +1,16 @@
ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
FROM nvidia/cuda:${CUDA_IMAGE}
# We need to set the host to 0.0.0.0 to allow outside access
ENV HOST 0.0.0.0
COPY . .
# Install the package
RUN apt update && apt install -y python3 python3-pip
RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
RUN LLAMA_CUBLAS=1 pip install llama-cpp-python
# Run the server
CMD python3 -m llama_cpp.server

@@ -0,0 +1,51 @@
# Define the image argument and provide a default value
ARG IMAGE=python:3-slim-bullseye
# Use the image as specified
FROM ${IMAGE}
# Re-declare the ARG after FROM
ARG IMAGE
# Update and upgrade the existing packages
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
python3 \
python3-pip \
ninja-build \
build-essential
RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
# Perform the conditional installations based on the image
RUN echo "Image: ${IMAGE}" && \
if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \
echo "OpenBLAS install:" && \
apt-get install -y --no-install-recommends libopenblas-dev && \
LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \
else \
echo "CuBLAS install:" && \
LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \
fi
# Clean up apt cache
RUN rm -rf /var/lib/apt/lists/*
# Set a working directory for better clarity
WORKDIR /app
# Copy files to the app directory
RUN echo "Installing model...this can take some time..."
COPY ./model.bin /app/model.bin
COPY ./start_server.sh /app/start_server.sh
# Make the server start script executable
RUN chmod +x /app/start_server.sh
# Set environment variable for the host
ENV HOST=0.0.0.0
# Expose a port for the server
EXPOSE 8000
# Run the server start script
CMD ["/bin/sh", "/app/start_server.sh"]

docker/open_llama/build.sh (new executable file)

@@ -0,0 +1,14 @@
#!/bin/sh
MODEL="open_llama_3b"
# Get open_llama_3b_ggml q5_1 quantization
python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1"
ls -lh *.bin
# Build the default OpenBLAS image
docker build -t $MODEL .
docker images | egrep "^(REPOSITORY|$MODEL)"
echo
echo "To start the docker container run:"
echo "docker run -t -p 8000:8000 $MODEL"

@@ -0,0 +1,139 @@
import requests
import json
import os
import struct
import argparse
def make_request(url, params=None):
print(f"Making request to {url}...")
response = requests.get(url, params=params)
if response.status_code == 200:
return json.loads(response.text)
else:
print(f"Request failed with status code {response.status_code}")
return None
def check_magic_and_version(filename):
with open(filename, 'rb') as f:
# Read the first 6 bytes from the file
data = f.read(6)
# Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int
# and the next 2 bytes as a little-endian unsigned short
magic, version = struct.unpack('<I H', data)
print(f"magic: 0x{magic:08x}, version: 0x{version:04x}, file: {filename}")
return magic, version
def download_file(url, destination):
print(f"Downloading {url} to {destination}...")
response = requests.get(url, stream=True)
if response.status_code == 200:
with open(destination, 'wb') as f:
total_downloaded = 0
for chunk in response.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
total_downloaded += len(chunk)
if total_downloaded >= 10485760: # 10 MB
print('.', end='', flush=True)
total_downloaded = 0
print("\nDownload complete.")
# Creating a symbolic link from destination to "model.bin"
if os.path.isfile("model.bin"):
os.remove("model.bin") # remove the existing link if any
os.symlink(destination, "model.bin")
else:
print(f"Download failed with status code {response.status_code}")
def get_user_choice(model_list):
# Print the enumerated list
print("\n")
for i, (model_id, rfilename) in enumerate(model_list):
print(f"{i+1}: Model ID: {model_id}, RFilename: {rfilename}")
# Get user's choice
choice = input("Choose a model to download by entering the corresponding number: ")
try:
index = int(choice) - 1
if 0 <= index < len(model_list):
# Return the chosen model
return model_list[index]
else:
print("Invalid choice.")
except ValueError:
print("Invalid input. Please enter a number corresponding to a model.")
except IndexError:
print("Invalid choice. Index out of range.")
return None
def main():
# Create an argument parser
parser = argparse.ArgumentParser(description='Process some parameters.')
# Arguments
parser.add_argument('-v', '--version', type=int, default=0x0003,
help='hexadecimal version number of ggml file')
parser.add_argument('-a', '--author', type=str, default='TheBloke',
help='HuggingFace author filter')
parser.add_argument('-t', '--tag', type=str, default='llama',
help='HuggingFace tag filter')
parser.add_argument('-s', '--search', type=str, default='',
help='HuggingFace search filter')
parser.add_argument('-f', '--filename', type=str, default='q5_1',
help='HuggingFace model repository filename substring match')
# Parse the arguments
args = parser.parse_args()
# Define the parameters
params = {
"author": args.author,
"tags": args.tag,
"search": args.search
}
models = make_request('https://huggingface.co/api/models', params=params)
if models is None:
return
model_list = []
# Iterate over the models
for model in models:
model_id = model['id']
model_info = make_request(f'https://huggingface.co/api/models/{model_id}')
if model_info is None:
continue
for sibling in model_info.get('siblings', []):
rfilename = sibling.get('rfilename')
if rfilename and args.filename in rfilename:
model_list.append((model_id, rfilename))
# Choose the model
model_list.sort(key=lambda x: x[0])
if len(model_list) == 0:
print("No models found")
exit(1)
elif len(model_list) == 1:
model_choice = model_list[0]
else:
model_choice = get_user_choice(model_list)
if model_choice is not None:
model_id, rfilename = model_choice
url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
dest = f"{model_id.replace('/', '_')}_{rfilename}"
download_file(url, dest)
_, version = check_magic_and_version(dest)
if version != args.version:
print(f"Warning: Expected version {args.version}, but found different version in the file.")
else:
print("Error - model choice was None")
exit(2)
if __name__ == '__main__':
main()

docker/open_llama/start.sh (new executable file)

@@ -0,0 +1,28 @@
#!/bin/sh
MODEL="open_llama_3b"
# Start Docker container
docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL &
sleep 10
echo
docker ps | egrep "(^CONTAINER|$MODEL)"
# Test the model works
echo
curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
"prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
"stop": [
"\n",
"###"
]
}' | grep Paris
if [ $? -eq 0 ]
then
echo
echo "$MODEL is working!!"
else
echo
echo "ERROR: $MODEL not replying."
exit 1
fi

@@ -0,0 +1,11 @@
#!/bin/sh
# For mlock support
ulimit -l unlimited
if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
python3 -B -m llama_cpp.server --model /app/model.bin
else
# You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM
python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000
fi

@@ -0,0 +1,15 @@
FROM python:3-slim-bullseye
# We need to set the host to 0.0.0.0 to allow outside access
ENV HOST 0.0.0.0
COPY . .
# Install the package
RUN apt update && apt install -y libopenblas-dev ninja-build build-essential
RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose
# Run the server
CMD python3 -m llama_cpp.server

docker/simple/Dockerfile (new file)

@@ -0,0 +1,33 @@
# Define the image argument and provide a default value
ARG IMAGE=python:3-slim-bullseye
# Use the image as specified
FROM ${IMAGE}
# Re-declare the ARG after FROM
ARG IMAGE
# Update and upgrade the existing packages
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
python3 \
python3-pip \
ninja-build \
build-essential
RUN mkdir /app
WORKDIR /app
COPY . /app
RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
RUN make build && make clean
# Set environment variable for the host
ENV HOST=0.0.0.0
ENV PORT=8000
# Expose a port for the server
EXPOSE 8000
# Run the server start script
CMD ["/bin/sh", "/app/docker/simple/run.sh"]

docker/simple/run.sh (new file)

@@ -0,0 +1,4 @@
#!/bin/bash
make build
uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT

docs/api-reference.md (new file)

@@ -0,0 +1,53 @@
---
title: API Reference
---
::: llama_cpp.Llama
options:
members:
- __init__
- tokenize
- detokenize
- reset
- eval
- sample
- generate
- create_embedding
- embed
- create_completion
- __call__
- create_chat_completion
- set_cache
- save_state
- load_state
- token_bos
- token_eos
show_root_heading: true
::: llama_cpp.LlamaCache
options:
show_root_heading: true
::: llama_cpp.LlamaState
options:
show_root_heading: true
::: llama_cpp.LogitsProcessor
options:
show_root_heading: true
::: llama_cpp.LogitsProcessorList
options:
show_root_heading: true
::: llama_cpp.StoppingCriteria
options:
show_root_heading: true
::: llama_cpp.StoppingCriteriaList
options:
show_root_heading: true
::: llama_cpp.llama_cpp
options:
show_if_no_docstring: true

@@ -87,31 +87,6 @@ git submodule update --init --recursive
python3 setup.py develop
```
## API Reference
::: llama_cpp.Llama
options:
members:
- __init__
- tokenize
- detokenize
- reset
- eval
- sample
- generate
- create_embedding
- embed
- create_completion
- __call__
- create_chat_completion
- token_bos
- token_eos
show_root_heading: true
::: llama_cpp.llama_cpp
options:
show_if_no_docstring: true
## License
This project is licensed under the terms of the MIT license.

docs/install/macos.md (new file)

@@ -0,0 +1,59 @@
---
title: MacOS Install with Metal GPU
---
**(1) Make sure you have xcode installed... at least the command line parts**
```
# check the path of your xcode install
xcode-select -p
# xcode installed returns
# /Applications/Xcode-beta.app/Contents/Developer
# if xcode is missing then install it... it takes ages;
xcode-select --install
```
**(2) Install the conda version for MacOS that supports Metal GPU**
```
wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
bash Miniforge3-MacOSX-arm64.sh
```
**(3) Make a conda environment**
```
conda create -n llama python=3.9.16
conda activate llama
```
**(4) Install the LATEST llama-cpp-python...which happily supports MacOS Metal GPU as of version 0.1.62**
*(you need Xcode installed in order for pip to build/compile the C++ code)*
```
pip uninstall llama-cpp-python -y
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir
pip install 'llama-cpp-python[server]'
# you should now have llama-cpp-python v0.1.62 or higher installed
llama-cpp-python         0.1.68
```
**(5) Download a v3 ggml model**
- **ggmlv3**
- file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0
https://huggingface.co/TheBloke/open-llama-7b-open-instruct-GGML
**(6) run the llama-cpp-python API server with MacOS Metal GPU support**
```
# config your ggml model path
# make sure it is ggml v3
# make sure it is q4_0
export MODEL=[path to your llama.cpp ggml models]/[ggml-model-name]q4_0.bin
python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1
```
***Note:** If you omit `--n_gpu_layers 1` then the CPU will be used.*
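As a supplementary illustration (not part of the original guide), the same Metal offload can also be used from the high-level Python API; a minimal sketch with a placeholder model path:
```python
from llama_cpp import Llama

# n_gpu_layers=1 offloads work to the Metal GPU on Apple Silicon,
# mirroring the --n_gpu_layers 1 server flag above.
llm = Llama(model_path="./models/7B/ggml-model-q4_0.bin", n_gpu_layers=1)
output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"])
print(output["choices"][0]["text"])
```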

docs/requirements.txt (new file)

@@ -0,0 +1,3 @@
mkdocs
mkdocs-material
mkdocstrings[python]

@@ -4,259 +4,34 @@ To run this example:
```bash
pip install fastapi uvicorn sse-starlette
-export MODEL=../models/7B/ggml-model.bin
-uvicorn fastapi_server_chat:app --reload
+export MODEL=../models/7B/...
+```
Then run:
```
uvicorn llama_cpp.server.app:app --reload
```
or
```
python3 -m llama_cpp.server
``` ```
Then visit http://localhost:8000/docs to see the interactive API docs.
To actually see the implementation of the server, see llama_cpp/server/app.py
"""
import os
-import json
+import uvicorn
from typing import List, Optional, Literal, Union, Iterator, Dict
from typing_extensions import TypedDict
import llama_cpp
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict
from sse_starlette.sse import EventSourceResponse
class Settings(BaseSettings):
model: str
n_ctx: int = 2048
n_batch: int = 8
n_threads: int = int(os.cpu_count() / 2) or 1
f16_kv: bool = True
use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out...
embedding: bool = True
last_n_tokens_size: int = 64
app = FastAPI(
title="🦙 llama.cpp Python API",
version="0.0.1",
)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
settings = Settings()
llama = llama_cpp.Llama(
settings.model,
f16_kv=settings.f16_kv,
use_mlock=settings.use_mlock,
embedding=settings.embedding,
n_threads=settings.n_threads,
n_batch=settings.n_batch,
n_ctx=settings.n_ctx,
last_n_tokens_size=settings.last_n_tokens_size,
)
class CreateCompletionRequest(BaseModel):
prompt: str
suffix: Optional[str] = Field(None)
max_tokens: int = 16
temperature: float = 0.8
top_p: float = 0.95
echo: bool = False
stop: List[str] = []
stream: bool = False
# ignored or currently unsupported
model: Optional[str] = Field(None)
n: Optional[int] = 1
logprobs: Optional[int] = Field(None)
presence_penalty: Optional[float] = 0
frequency_penalty: Optional[float] = 0
best_of: Optional[int] = 1
logit_bias: Optional[Dict[str, float]] = Field(None)
user: Optional[str] = Field(None)
# llama.cpp specific parameters
top_k: int = 40
repeat_penalty: float = 1.1
class Config:
schema_extra = {
"example": {
"prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
"stop": ["\n", "###"],
}
}
CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
@app.post(
"/v1/completions",
response_model=CreateCompletionResponse,
)
def create_completion(request: CreateCompletionRequest):
if request.stream:
chunks: Iterator[llama_cpp.CompletionChunk] = llama(**request.dict()) # type: ignore
return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks)
return llama(
**request.dict(
exclude={
"model",
"n",
"logprobs",
"frequency_penalty",
"presence_penalty",
"best_of",
"logit_bias",
"user",
}
)
)
class CreateEmbeddingRequest(BaseModel):
model: Optional[str]
input: str
user: Optional[str]
class Config:
schema_extra = {
"example": {
"input": "The food was delicious and the waiter...",
}
}
CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)
@app.post(
"/v1/embeddings",
response_model=CreateEmbeddingResponse,
)
def create_embedding(request: CreateEmbeddingRequest):
return llama.create_embedding(**request.dict(exclude={"model", "user"}))
class ChatCompletionRequestMessage(BaseModel):
role: Union[Literal["system"], Literal["user"], Literal["assistant"]]
content: str
user: Optional[str] = None
class CreateChatCompletionRequest(BaseModel):
model: Optional[str]
messages: List[ChatCompletionRequestMessage]
temperature: float = 0.8
top_p: float = 0.95
stream: bool = False
stop: List[str] = []
max_tokens: int = 128
# ignored or currently unsupported
model: Optional[str] = Field(None)
n: Optional[int] = 1
presence_penalty: Optional[float] = 0
frequency_penalty: Optional[float] = 0
logit_bias: Optional[Dict[str, float]] = Field(None)
user: Optional[str] = Field(None)
# llama.cpp specific parameters
repeat_penalty: float = 1.1
class Config:
schema_extra = {
"example": {
"messages": [
ChatCompletionRequestMessage(
role="system", content="You are a helpful assistant."
),
ChatCompletionRequestMessage(
role="user", content="What is the capital of France?"
),
]
}
}
CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion)
@app.post(
"/v1/chat/completions",
response_model=CreateChatCompletionResponse,
)
async def create_chat_completion(
request: CreateChatCompletionRequest,
) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
completion_or_chunks = llama.create_chat_completion(
**request.dict(
exclude={
"model",
"n",
"presence_penalty",
"frequency_penalty",
"logit_bias",
"user",
}
),
)
if request.stream:
async def server_sent_events(
chat_chunks: Iterator[llama_cpp.ChatCompletionChunk],
):
for chat_chunk in chat_chunks:
yield dict(data=json.dumps(chat_chunk))
yield dict(data="[DONE]")
chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore
return EventSourceResponse(
server_sent_events(chunks),
)
completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore
return completion
class ModelData(TypedDict):
id: str
object: Literal["model"]
owned_by: str
permissions: List[str]
class ModelList(TypedDict):
object: Literal["list"]
data: List[ModelData]
GetModelResponse = create_model_from_typeddict(ModelList)
@app.get("/v1/models", response_model=GetModelResponse)
def get_models() -> ModelList:
return {
"object": "list",
"data": [
{
"id": llama.model_path,
"object": "model",
"owned_by": "me",
"permissions": [],
}
],
}
from llama_cpp.server.app import create_app
if __name__ == "__main__":
-    import os
-    import uvicorn
-    uvicorn.run(app, host=os.getenv("HOST", "localhost"), port=os.getenv("PORT", 8000))
+    app = create_app()
+    uvicorn.run(
+        app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000))
+    )

@@ -0,0 +1,71 @@
#!/bin/python
import sys, os, datetime
from common import GptParams
from low_level_api_chat_cpp import LLaMAInteract
def env_or_def(env, default):
if (env in os.environ):
return os.environ[env]
return default
AI_NAME = env_or_def("AI_NAME", "ChatLLaMa")
MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
USER_NAME = env_or_def("USER_NAME", "USER")
N_PREDICTS = int(env_or_def("N_PREDICTS", "2048"))
N_THREAD = int(env_or_def("N_THREAD", "8"))
today = datetime.datetime.today()
DATE_YEAR=today.strftime("%Y")
DATE_TIME=today.strftime("%H:%M")
prompt=f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
The transcript only includes text, it does not include markup like HTML and Markdown.
{USER_NAME}: Hello, {AI_NAME}!
{AI_NAME}: Hello {USER_NAME}! How may I help you today?
{USER_NAME}: What year is it?
{AI_NAME}: We are in {DATE_YEAR}.
{USER_NAME}: Please tell me the largest city in Europe.
{AI_NAME}: The largest city in Europe is Moscow, the capital of Russia.
{USER_NAME}: What can you tell me about Moscow?
{AI_NAME}: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia's symbolic center.
{USER_NAME}: What is a cat?
{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{USER_NAME}: How do I pass command line arguments to a Node.js program?
{AI_NAME}: The arguments are stored in process.argv.
argv[0] is the path to the Node.js executable.
argv[1] is the path to the script file.
argv[2] is the first argument passed to the script.
argv[3] is the second argument passed to the script and so on.
{USER_NAME}: Name a color.
{AI_NAME}: Blue.
{USER_NAME}: What time is it?
{AI_NAME}: It is {DATE_TIME}.
{USER_NAME}:""" + " ".join(sys.argv[1:])
print("Loading model...")
params = GptParams(
n_ctx=2048,
temp=0.7,
top_k=40,
top_p=0.5,
repeat_last_n=256,
n_batch=1024,
repeat_penalty=1.17647,
model=MODEL,
n_threads=N_THREAD,
n_predict=N_PREDICTS,
use_color=True,
interactive=True,
antiprompt=[f"{USER_NAME}:"],
input_prefix=" ",
input_suffix=f"{AI_NAME}:",
prompt=prompt,
)
with LLaMAInteract(params) as m:
m.interact()

@@ -0,0 +1,59 @@
#!/bin/python
import sys, os
from common import GptParams
from low_level_api_chat_cpp import LLaMAInteract
def env_or_def(env, default):
if (env in os.environ):
return os.environ[env]
return default
AI_NAME = env_or_def("AI_NAME", "Miku")
MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
USER_NAME = env_or_def("USER_NAME", "Anon")
N_PREDICTS = int(env_or_def("N_PREDICTS", "4096"))
N_THREAD = int(env_or_def("N_THREAD", "0"))
prompt=f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer.
{AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
{AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
{AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad.
{AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her.
The conversation is only between {USER_NAME} and {AI_NAME}
The conversation is only through text, so {AI_NAME} can't see {USER_NAME}'s face or hear his voice.
{AI_NAME} can only communicate through text, so she can't send images or videos.
{USER_NAME}: Hello!
{AI_NAME}: /think I wonder what I should say to {USER_NAME}? This is the first time we talk so it's important that I make a good first impression!
{AI_NAME}: Hi! I am {AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^
{AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
{USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
{AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
{AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that!
{AI_NAME}: What do you like to do in your free time? ^_^
{USER_NAME}:""" + " ".join(sys.argv[1:])
print("Loading model...")
params = GptParams(
n_batch=1024,
n_ctx=2048,
n_keep=-1,
repeat_last_n=256,
repeat_penalty=1.17647,
temp=0.7,
top_k=40,
top_p=0.5,
model=MODEL,
n_predict=N_PREDICTS,
use_color=True,
interactive=True,
antiprompt=[f"{USER_NAME}:"],
prompt=prompt,
)
if N_THREAD > 0:
params.n_threads = N_THREAD
with LLaMAInteract(params) as m:
m.interact()

@@ -0,0 +1,49 @@
#!/bin/python
import sys, os, datetime
from common import GptParams
from low_level_api_chat_cpp import LLaMAInteract
def env_or_def(env, default):
if (env in os.environ):
return os.environ[env]
return default
MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
prompt=f"""You run in a loop of Thought, Action, Observation.
At the end of the loop either Answer or restate your Thought and Action.
Use Thought to describe your thoughts about the question you have been asked.
Use Action to run one of these actions available to you:
- calculate[python math expression]
Observation will be the result of running those actions
Question: What is 4 * 7 / 3?
Thought: Do I need to use an action? Yes, I use calculate to do math
Action: calculate[4 * 7 / 3]
Observation: 9.3333333333
Thought: Do I need to use an action? No, have the result
Answer: The calculate tool says it is 9.3333333333
Question: What is capital of france?
Thought: Do I need to use an action? No, I know the answer
Answer: Paris is the capital of France
Question:""" + " ".join(sys.argv[1:])
print("Loading model...")
params = GptParams(
interactive=True,
interactive_start=True,
top_k=10000,
temp=0.2,
repeat_penalty=1,
n_threads=7,
n_ctx=2048,
antiprompt=["Question:","Observation:"],
model=MODEL,
input_prefix=" ",
n_predict=-1,
prompt=prompt,
)
with LLaMAInteract(params) as m:
m.interact()

@@ -1,8 +1,9 @@
import os
import argparse
+import re
from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import List
# Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
@@ -12,23 +13,36 @@ class GptParams:
seed: int = -1
n_threads: int = min(4, os.cpu_count() or 1)
n_predict: int = 128
-repeat_last_n: int = 64
n_parts: int = -1
n_ctx: int = 512
n_batch: int = 8
n_keep: int = 0
+ignore_eos: bool = False
+logit_bias: dict[int, float] = field(default_factory=dict)
top_k: int = 40
top_p: float = 0.95
+tfs_z: float = 1.00
+typical_p: float = 1.00
temp: float = 0.80
repeat_penalty: float = 1.10
+repeat_last_n: int = 64
+frequency_penalty: float = 0.0
+presence_penalty: float = 0.0
+mirostat: int = 0
+mirostat_tau: float = 5.0
+mirostat_eta: float = 0.1
model: str = "./models/llama-7B/ggml-model.bin"
prompt: str = ""
+path_session: str = ""
input_prefix: str = " "
+input_suffix: str = ""
antiprompt: List[str] = field(default_factory=list)
+lora_adapter: str = ""
+lora_base: str = ""
memory_f16: bool = True
random_prompt: bool = False
use_color: bool = False
@@ -38,7 +52,7 @@ class GptParams:
interactive_start: bool = False
instruct: bool = False
-ignore_eos: bool = False
+penalize_nl: bool = True
perplexity: bool = False
use_mmap: bool = True
use_mlock: bool = False
@@ -50,8 +64,7 @@ class GptParams:
# If chat ended prematurely, append this to the conversation to fix it.
# Set to "\nUser:" etc.
# This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:""
-fix_prefix: str = " "
-output_postfix: str = ""
+fix_prefix: str = ""
input_echo: bool = True,
# Default instructions for Alpaca
@@ -61,59 +74,43 @@ class GptParams:
instruct_inp_suffix: str="\n\n### Response:\n\n"
-def gpt_params_parse(argv = None, params: Optional[GptParams] = None):
-    if params is None:
-        params = GptParams()
+def gpt_params_parse(argv = None):
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed")
parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads")
-parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt")
-parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file")
+parser.add_argument("-n", "--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict")
+parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts")
parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx")
parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16")
parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p")
parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k")
parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp")
parser.add_argument("--n_predict", type=int, default=128, help="number of model parts",dest="n_predict")
parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n")
parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty")
parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch") parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch")
parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep") parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep")
parser.add_argument(
"-l",
"--logit-bias",
type=str,
action='append',
help="--logit-bias TOKEN_ID(+/-)BIAS",
dest="logit_bias_str"
)
parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos")
parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k")
parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p")
parser.add_argument("--tfs", type=float, default=1.0, help="tail free sampling, parameter z (1.0 = disabled)",dest="tfs_z")
parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp")
parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty")
parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n")
parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)",dest="tfs_z")
parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)",dest="presence_penalty")
parser.add_argument("--mirostat", type=float, default=1.0, help="use Mirostat sampling.",dest="mirostat")
parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau represents the average surprise value",dest="mirostat_tau")
parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta")
parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model") parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model")
parser.add_argument( parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt")
"-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file")
) parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session")
parser.add_argument("--embedding", action="store_true", help="", dest="embedding") parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix")
parser.add_argument( parser.add_argument("--in-suffix", type=str, default="", help="append to input", dest="input_suffix")
"--interactive-start",
action="store_true",
help="run in interactive mode",
dest="interactive"
)
parser.add_argument(
"--interactive-first",
action="store_true",
help="run in interactive mode and wait for input right away",
dest="interactive_start"
)
parser.add_argument(
"-ins",
"--instruct",
action="store_true",
help="run in instruction mode (use with Alpaca or Vicuna models)",
dest="instruct"
)
parser.add_argument(
"--color",
action="store_true",
help="colorise output to distinguish prompt and user input from generations",
dest="use_color"
)
parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock")
parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap")
parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test")
parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt")
parser.add_argument(
"-r",
"--reverse-prompt",
@ -122,16 +119,70 @@ def gpt_params_parse(argv = None, params: Optional[GptParams] = None):
help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).",
dest="antiprompt"
)
parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity")
parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") parser.add_argument("--lora", type=str, default="", help="apply LoRA adapter (implies --no-mmap)", dest="lora_adapter")
parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") parser.add_argument("--lora-base", type=str, default="", help="optional model to use as a base for the layers modified by the LoRA adapter", dest="lora_base")
parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16")
parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt") parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt")
parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") parser.add_argument(
"--color",
action="store_true",
help="colorise output to distinguish prompt and user input from generations",
dest="use_color"
)
parser.add_argument(
"-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive"
)
parser.add_argument("--embedding", action="store_true", help="", dest="embedding")
parser.add_argument(
"--interactive-first",
action="store_true",
help="run in interactive mode and wait for input right away",
dest="interactive_start"
)
parser.add_argument(
"-ins",
"--instruct",
action="store_true",
help="run in instruction mode (use with Alpaca or Vicuna models)",
dest="instruct"
)
parser.add_argument("--no-penalize-nl", action="store_false", help="do not penalize newline token", dest="penalize_nl")
parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity")
parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap")
parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock")
parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test")
parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt")
#Custom args
parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix")
parser.add_argument("--out-postfix", type=str, default="", help="append to input", dest="output_postfix")
parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo")
parser.add_argument(
"--interactive-start",
action="store_true",
help="run in interactive mode",
dest="interactive"
)
args = parser.parse_args(argv)
-return args
logit_bias_str = args.logit_bias_str
delattr(args, "logit_bias_str")
params = GptParams(**vars(args))
if (params.lora_adapter):
params.use_mmap = False
if (logit_bias_str != None):
for i in logit_bias_str:
if (m := re.match(r"(\d+)([-+]\d+)", i)):
params.logit_bias[int(m.group(1))] = float(m.group(2))
return params
def gpt_random_prompt(rng):
return [
@ -148,4 +199,4 @@ def gpt_random_prompt(rng):
][rng % 10]

if __name__ == "__main__":
-print(GptParams(gpt_params_parse()))
+print(gpt_params_parse())
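The new `--logit-bias` flag accepts entries of the form `TOKEN_ID(+/-)BIAS`, and `gpt_params_parse` folds them into the `logit_bias` dict with the regex shown above. A minimal standalone sketch of that parsing step (the sample entries below are made up for illustration):

```python
import re

def parse_logit_bias(entries):
    # Same pattern as gpt_params_parse: a token id followed by a signed bias.
    bias = {}
    for entry in entries:
        if (m := re.match(r"(\d+)([-+]\d+)", entry)):
            bias[int(m.group(1))] = float(m.group(2))
    return bias

# e.g. repeated "-l 15043+5 -l 29871-100" flags arrive as a list of strings
print(parse_logit_bias(["15043+5", "29871-100"]))  # {15043: 5.0, 29871: -100.0}
```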

View file

@ -10,40 +10,14 @@ Quirks:
You should also still be feeding the model with a "primer" prompt that
shows it the expected format.
"""
+import ctypes
import sys
from time import time
-from os import cpu_count
+from os import cpu_count, path

import llama_cpp
from common import GptParams, gpt_params_parse, gpt_random_prompt
import util
ANSI_COLOR_RESET = "\x1b[0m"
ANSI_COLOR_YELLOW = "\x1b[33m"
ANSI_BOLD = "\x1b[1m"
ANSI_COLOR_GREEN = "\x1b[32m"
CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET
CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW
CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN
# Iterative search
# Actively searches and prevents a pattern from being returned
class IterSearch:
def __init__(self, pattern):
self.pattern = list(pattern)
self.buffer = []
def __call__(self, char):
self.buffer += [char]
if (self.pattern[:len(self.buffer)] == self.buffer):
if (len(self.buffer) >= len(self.pattern)):
self.buffer.clear()
return []
_tmp = self.buffer[:]
self.buffer.clear()
return _tmp
# A LLaMA interactive session
class LLaMAInteract:
@ -77,9 +51,11 @@ specified) expect poor results""", file=sys.stderr)
# runtime args
self.input_consumed = 0
self.n_past = 0
+self.n_session_consumed = 0
self.first_antiprompt = []
self.remaining_tokens = self.params.n_predict
self.output_echo = self.params.input_echo
+self.multibyte_fix = []

# model load
self.lparams = llama_cpp.llama_context_default_params()
@ -94,6 +70,19 @@ specified) expect poor results""", file=sys.stderr)
if (not self.ctx):
raise RuntimeError(f"error: failed to load model '{self.params.model}'")
if (self.params.ignore_eos):
self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf")
if (len(self.params.lora_adapter) > 0):
if (llama_cpp.llama_apply_lora_from_file(
self.ctx,
self.params.lora_adapter.encode("utf8"),
self.params.lora_base.encode("utf8") if len(self.params.lora_base) > 0 else None,
self.params.n_threads
) != 0):
print("error: failed to apply lora adapter")
return
print(file=sys.stderr)
print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \
| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr)
@ -117,13 +106,52 @@ specified) expect poor results""", file=sys.stderr)
with open(self.params.file) as f:
self.params.prompt = f.read()
self.session_tokens: list[llama_cpp.llama_token] = []
if (len(self.params.path_session) > 0):
print(f"attempting to load saved session from '{self.params.path_session}'", file=sys.stderr)
if (path.exists(self.params.path_session)):
_session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))()
_n_token_count_out = llama_cpp.c_size_t()
if (llama_cpp.llama_load_session_file(
self.ctx,
self.params.path_session.encode("utf8"),
_session_tokens,
self.params.n_ctx,
ctypes.byref(_n_token_count_out)
) != 1):
print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr)
return
_n_token_count_out = _n_token_count_out.value
self.session_tokens = _session_tokens[:_n_token_count_out]
print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr)
else:
print(f"session file does not exist, will create", file=sys.stderr)
# tokenize the prompt
self.embd = []
self.embd_inp = self._tokenize(self.params.prompt)

-if (len(self.embd_inp) > self.params.n_ctx - 4):
+if (len(self.embd_inp) > self.n_ctx - 4):
raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})")
# debug message about similarity of saved session, if applicable
self.n_matching_session_tokens = 0
if len(self.session_tokens) > 0:
for id in self.session_tokens:
if self.n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[self.n_matching_session_tokens]:
break
self.n_matching_session_tokens += 1
if self.n_matching_session_tokens >= len(self.embd_inp):
print(f"session file has exact match for prompt!")
elif self.n_matching_session_tokens < (len(self.embd_inp) / 2):
print(f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated")
else:
print(f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt")
self.need_to_save_session = len(self.params.path_session) > 0 and self.n_matching_session_tokens < (len(self.embd_inp) * 3 / 4)
# number of tokens to keep when resetting context
if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct):
self.params.n_keep = len(self.embd_inp)
@ -132,11 +160,12 @@ specified) expect poor results""", file=sys.stderr)
self.inp_suffix = self._tokenize(self.params.instruct_inp_suffix, False)

# in instruct mode, we inject a prefix and a suffix to each input by the user
+self.antiecho = None
if (self.params.instruct):
self.params.interactive_start = True
_ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False)
self.first_antiprompt.append(_ptn)
-self.antiecho = IterSearch(_ptn)
+self.antiecho = util.IterSearch(_ptn)

# enable interactive mode if reverse prompt or interactive start is specified
if (len(self.params.antiprompt) != 0 or self.params.interactive_start):
@ -144,6 +173,7 @@ specified) expect poor results""", file=sys.stderr)
# determine newline token
self.llama_token_newline = self._tokenize("\n", False)
+self.llama_token_eot = self._tokenize(" [end of text]\n", False)

if (self.params.verbose_prompt):
print(f"""
@ -170,16 +200,24 @@ number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr)
if len(self.params.input_prefix) > 0:
print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr)

-print(f"""sampling: temp = {self.params.temp},\
-top_k = {self.params.top_k},\
-top_p = {self.params.top_p},\
-repeat_last_n = {self.params.repeat_last_n},\
-repeat_penalty = {self.params.repeat_penalty}
+print(f"""sampling: repeat_last_n = {self.params.repeat_last_n},\
+repeat_penalty = {self.params.repeat_penalty},\
+presence_penalty = {self.params.presence_penalty},\
+frequency_penalty = {self.params.frequency_penalty},\
+top_k = {self.params.top_k},\
+tfs_z = {self.params.tfs_z},\
+top_p = {self.params.top_p},\
+typical_p = {self.params.typical_p},\
+temp = {self.params.temp},\
+mirostat = {self.params.mirostat},\
+mirostat_lr = {self.params.mirostat_eta},\
+mirostat_ent = {self.params.mirostat_tau},\

-generate: n_ctx = {self.n_ctx}, \
-n_batch = {self.params.n_batch}, \
-n_predict = {self.params.n_predict}, \
+generate: n_ctx = {self.n_ctx},\
+n_batch = {self.params.n_batch},\
+n_predict = {self.params.n_predict},\
n_keep = {self.params.n_keep}
""", file=sys.stderr)

# determine antiprompt tokens
@ -195,24 +233,24 @@ n_keep = {self.params.n_keep}
- If you want to submit another line, end your input in '\\'.

""", file=sys.stderr)
-self.set_color(CONSOLE_COLOR_PROMPT)
+self.set_color(util.CONSOLE_COLOR_PROMPT)

# tokenize a prompt
def _tokenize(self, prompt, bos=True):
-_arr = (llama_cpp.llama_token * (len(prompt) + 1))()
+_arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))()
-_n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), _arr, len(_arr), bos)
+_n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos)
return _arr[:_n]

-def use_antiprompt(self):
-return len(self.first_antiprompt) > 0

def set_color(self, c):
if (self.params.use_color):
print(c, end="")

+def use_antiprompt(self):
+return len(self.first_antiprompt) > 0

# generate tokens
def generate(self):
-while self.remaining_tokens > 0 or self.params.interactive:
+while self.remaining_tokens > 0 or self.params.interactive or self.params.n_predict == -1:
# predict
if len(self.embd) > 0:
# infinite text generation via context swapping
@ -228,43 +266,131 @@ n_keep = {self.params.n_keep}
self.n_ctx - int(n_left/2) - len(self.embd):-len(self.embd)
]
self.embd = _insert + self.embd
self.params.path_session = ""
# try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
if self.n_session_consumed < len(self.session_tokens):
for i in range(len(self.embd)):
if self.embd[i] != self.session_tokens[self.n_session_consumed]:
self.session_tokens = self.session_tokens[:self.n_session_consumed]
break
self.n_past += 1
self.n_session_consumed += 1
if self.n_session_consumed >= len(self.session_tokens):
i += 1
break
if i > 0:
self.embd = self.embd[i:]
# evaluate tokens in batches
# embd is typically prepared beforehand to fit within a batch, but not always
#TODO BUG: The batching code causes nonsensical generation
"""for i in range(0, len(self.embd), self.params.n_batch):
n_eval = self.params.n_batch
_arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval])
if llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0:
print(f"failed to eval")
return
self.n_past += n_eval"""
if (llama_cpp.llama_eval(
self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads
) != 0):
raise Exception("Failed to llama_eval!")
if len(self.embd) > 0 and len(self.params.path_session) > 0:
self.session_tokens.extend(self.embd)
self.n_session_consumed = len(self.session_tokens)
self.n_past += len(self.embd)
self.embd = []

-if len(self.embd_inp) <= self.input_consumed:
+if len(self.embd_inp) <= self.input_consumed: #&& !is_interacting
# out of user input, sample next token
top_k = llama_cpp.llama_n_vocab(self.ctx) if self.params.top_k <= 0 else self.params.top_k
repeat_last_n = self.n_ctx if self.params.repeat_last_n < 0 else self.params.repeat_last_n
-if (self.params.ignore_eos):
-logits = llama_cpp.llama_get_logits(self.ctx)
-logits[llama_cpp.llama_token_eos()] = llama_cpp.c_float(0)
+# optionally save the session on first sample (for faster prompt loading next time)
+if len(self.params.path_session) > 0 and self.need_to_save_session:
+self.need_to_save_session = False
llama_cpp.llama_save_session_file(
self.ctx,
self.params.path_session.encode("utf8"),
(llama_cpp.llama_token * len(self.session_tokens))(*self.session_tokens),
len(self.session_tokens)
)
id = 0
logits = llama_cpp.llama_get_logits(self.ctx)
n_vocab = llama_cpp.llama_n_vocab(self.ctx)
# Apply params.logit_bias map
for key, value in self.params.logit_bias.items():
logits[key] += value
_arr = (llama_cpp.llama_token_data * n_vocab)(*[
llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
for token_id in range(n_vocab)
])
candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False))
# Apply penalties
nl_logit = logits[llama_cpp.llama_token_nl()]
last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)
_arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:])
llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p,
_arr,
last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty))
llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p,
_arr,
last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))
if not self.params.penalize_nl:
logits[llama_cpp.llama_token_nl()] = nl_logit
if self.params.temp <= 0:
# Greedy sampling
id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p)
else:
if self.params.mirostat == 1:
mirostat_mu = 2.0 * self.params.mirostat_tau
mirostat_m = 100
llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_int(mirostat_m), llama_cpp.c_float(mirostat_mu))
elif self.params.mirostat == 2:
mirostat_mu = 2.0 * self.params.mirostat_tau
llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu))
else:
# Temperature sampling
llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1))
llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z), min_keep=llama_cpp.c_size_t(1))
llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p), min_keep=llama_cpp.c_size_t(1))
llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p), min_keep=llama_cpp.c_size_t(1))
llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
id = llama_cpp.llama_sample_token(self.ctx, candidates_p)
# print("`{}`".format(candidates_p.size))
_arr = self.last_n_tokens[-min(self.params.repeat_last_n, self.n_past):]
id = llama_cpp.llama_sample_top_p_top_k(
self.ctx,
(llama_cpp.llama_token * len(_arr))(*_arr),
len(_arr),
self.params.top_k,
self.params.top_p,
self.params.temp,
self.params.repeat_penalty,
)
self.last_n_tokens.pop(0)
self.last_n_tokens.append(id)

# replace end of text token with newline token when in interactive mode
if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct):
id = self.llama_token_newline[0]
+self.embd.append(id)
if (self.use_antiprompt()):
# tokenize and inject first reverse prompt
self.embd_inp += self.first_antiprompt[0]
+for id in self.first_antiprompt[0]:
+self.embd.append(id)
+else:
-# add it to the context
-self.embd.append(id)
+# add it to the context
+self.embd.append(id)
# echo this to console # echo this to console
self.output_echo = True self.output_echo = True
@ -287,7 +413,7 @@ n_keep = {self.params.n_keep}
# display tokens
if self.output_echo:
for id in self.embd:
-if self.params.instruct:
+if self.antiecho != None:
for r in self.antiecho(id):
yield r
else:
@ -295,7 +421,7 @@ n_keep = {self.params.n_keep}
# reset color to default if we there is no pending user input
if (self.params.input_echo and len(self.embd_inp) == self.input_consumed):
-self.set_color(CONSOLE_COLOR_DEFAULT)
+self.set_color(util.CONSOLE_COLOR_DEFAULT)

if (self.params.interactive and len(self.embd_inp) <= self.input_consumed):
# if antiprompt is present, stop
@ -313,9 +439,9 @@ n_keep = {self.params.n_keep}
# end of text token
if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos():
if (not self.params.instruct):
-for i in " [end of text]\n":
+for i in self.llama_token_eot:
yield i
break

# respect n_predict even if antiprompt is present
if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1):
@ -336,12 +462,12 @@ n_keep = {self.params.n_keep}
def exit(self):
llama_cpp.llama_free(self.ctx)
-self.set_color(CONSOLE_COLOR_DEFAULT)
+self.set_color(util.CONSOLE_COLOR_DEFAULT)

# return past text
def past(self):
for id in self.last_n_tokens[-self.n_past:]:
-yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8")
+yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf8", errors="ignore")
# write input
def input(self, prompt: str):
@ -355,7 +481,29 @@ n_keep = {self.params.n_keep}
def output(self):
self.remaining_tokens = self.params.n_predict
for id in self.generate():
-yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8")
+cur_char = llama_cpp.llama_token_to_str(self.ctx, id)
# Add remainder of missing bytes
if None in self.multibyte_fix:
self.multibyte_fix[self.multibyte_fix.index(None)] = cur_char
# Return completed utf char
if len(self.multibyte_fix) > 0 and not None in self.multibyte_fix:
yield (b"".join(self.multibyte_fix)).decode("utf8")
self.multibyte_fix = []
continue
# Contains multi-byte UTF8
for num, pattern in [(2, 192), (3, 224), (4, 240)]:
# Bitwise AND check
if pattern & int.from_bytes(cur_char, 'little') == pattern:
self.multibyte_fix = [cur_char] + ([None] * (num-1))
# Stop incomplete bytes from passing
if len(self.multibyte_fix) > 0:
continue
yield cur_char.decode("utf8")
# read user input
def read_input(self):
@ -371,21 +519,21 @@ n_keep = {self.params.n_keep}
self.params.input_echo = False

while self.params.interactive:
-self.set_color(CONSOLE_COLOR_USER_INPUT)
+self.set_color(util.CONSOLE_COLOR_USER_INPUT)
if (self.params.instruct):
print('\n> ', end="")
self.input(self.read_input())
else:
print(self.params.input_prefix, end="")
-self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.output_postfix}")
-print(self.params.output_postfix,end="")
-self.set_color(CONSOLE_COLOR_DEFAULT)
+self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}")
+print(self.params.input_suffix,end="")
+self.set_color(util.CONSOLE_COLOR_DEFAULT)

try:
for i in self.output():
print(i,end="",flush=True)
except KeyboardInterrupt:
-self.set_color(CONSOLE_COLOR_DEFAULT)
+self.set_color(util.CONSOLE_COLOR_DEFAULT)
if not self.params.instruct:
print(self.params.fix_prefix,end="")
self.input(self.params.fix_prefix)
@ -414,8 +562,7 @@ The transcript only includes text, it does not include markup like HTML and Mark
{USER_NAME}: Name a color.
{AI_NAME}: Blue
{USER_NAME}:"""
-args = gpt_params_parse()
-params = GptParams(**vars(args))
+params = gpt_params_parse()

with LLaMAInteract(params) as m:
m.interact()
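The reworked `output()` above buffers incomplete UTF-8 sequences in `self.multibyte_fix`: a token whose first byte matches the mask 192, 224, or 240 starts a 2-, 3- or 4-byte character, so the generator withholds bytes until the character is complete. A small standalone sketch of that lead-byte check (the helper name is illustrative, not part of the example code):

```python
def expected_utf8_len(first_byte: bytes) -> int:
    # Masks mirror the (2, 192), (3, 224), (4, 240) pairs used in output():
    # a byte matching a wider mask also matches the narrower ones, so test
    # from the longest sequence down and take the first hit.
    value = int.from_bytes(first_byte, "little")
    for length, mask in [(4, 240), (3, 224), (2, 192)]:
        if value & mask == mask:
            return length
    return 1

euro = "€".encode("utf8")                # b'\xe2\x82\xac'
assert expected_utf8_len(euro[:1]) == 3  # wait for two more bytes before decoding
```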

View file

@ -37,6 +37,10 @@ embd = []
last_n_size = 64
last_n_tokens_data = [0] * last_n_size
n_batch = 24
last_n_repeat = 64
repeat_penalty = 1
frequency_penalty = 0.0
presence_penalty = 0.0
while remaining_tokens > 0:
if len(embd) > 0:
@ -47,15 +51,28 @@ while remaining_tokens > 0:
n_past += len(embd)
embd = []
if len(embd_inp) <= input_consumed:
-id = llama_cpp.llama_sample_top_p_top_k(
-ctx,
-(llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data),
-len(last_n_tokens_data),
-40,
-0.8,
-0.2,
-1.0 / 0.85,
-)
+logits = llama_cpp.llama_get_logits(ctx)
+n_vocab = llama_cpp.llama_n_vocab(ctx)
+_arr = (llama_cpp.llama_token_data * n_vocab)(*[
+llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
+for token_id in range(n_vocab)
+])
+candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False))
_arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p,
_arr,
last_n_repeat, repeat_penalty)
llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p,
_arr,
last_n_repeat, frequency_penalty, presence_penalty)
llama_cpp.llama_sample_top_k(ctx, candidates_p, 40)
llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8)
llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2)
id = llama_cpp.llama_sample_token(ctx, candidates_p)
last_n_tokens_data = last_n_tokens_data[1:] + [id]
embd.append(id)
input_noecho = False
@ -70,7 +87,7 @@ while remaining_tokens > 0:
if not input_noecho:
for id in embd:
print(
-llama_cpp.llama_token_to_str(ctx, id).decode("utf-8"),
+llama_cpp.llama_token_to_str(ctx, id).decode("utf-8", errors="ignore"),
end="",
flush=True,
)
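The diff above replaces the single `llama_sample_top_p_top_k` call with an explicit chain: build a candidate array from the raw logits, apply the repetition and frequency/presence penalties, then top-k, top-p and temperature before drawing the token. A condensed sketch of that sequence, assuming `ctx`, `logits`, and `last_n_tokens_data` are prepared as in the example (the helper name and the hard-coded 40 / 0.8 / 0.2 values simply mirror the snippet):

```python
import llama_cpp

def sample_next_token(ctx, logits, last_n_tokens_data, last_n_repeat=64,
                      repeat_penalty=1.0, frequency_penalty=0.0, presence_penalty=0.0):
    # Wrap every vocabulary entry with its logit so the samplers can reorder/prune it.
    n_vocab = llama_cpp.llama_n_vocab(ctx)
    _arr = (llama_cpp.llama_token_data * n_vocab)(*[
        llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
        for token_id in range(n_vocab)
    ])
    candidates_p = llama_cpp.ctypes.pointer(
        llama_cpp.llama_token_data_array(_arr, len(_arr), False))
    _last = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
    # Penalties first, then the truncation samplers, then temperature, then draw.
    llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p, _last, last_n_repeat, repeat_penalty)
    llama_cpp.llama_sample_frequency_and_presence_penalties(
        ctx, candidates_p, _last, last_n_repeat, frequency_penalty, presence_penalty)
    llama_cpp.llama_sample_top_k(ctx, candidates_p, 40)
    llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8)
    llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2)
    return llama_cpp.llama_sample_token(ctx, candidates_p)
```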

View file

@ -0,0 +1,95 @@
ANSI_COLOR_RESET = "\x1b[0m"
ANSI_COLOR_YELLOW = "\x1b[33m"
ANSI_BOLD = "\x1b[1m"
ANSI_COLOR_GREEN = "\x1b[32m"
CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET
CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW
CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN
# Iterative search
# Actively searches and prevents a pattern from being returned
class IterSearch:
def __init__(self, pattern):
self.pattern = list(pattern)
self.buffer = []
def __call__(self, char):
self.buffer += [char]
if (self.pattern[:len(self.buffer)] == self.buffer):
if (len(self.buffer) >= len(self.pattern)):
self.buffer.clear()
return []
_tmp = self.buffer[:]
self.buffer.clear()
return _tmp
class Circle:
def __init__(self, size, default=0):
self.list = [default] * size
self.maxsize = size
self.size = 0
self.offset = 0
def append(self, elem):
if self.size < self.maxsize:
self.list[self.size] = elem
self.size += 1
else:
self.list[self.offset] = elem
self.offset = (self.offset + 1) % self.maxsize
def __getitem__(self, val):
if isinstance(val, int):
if 0 > val or val >= self.size:
raise IndexError('Index out of range')
return self.list[val] if self.size < self.maxsize else self.list[(self.offset + val) % self.maxsize]
elif isinstance(val, slice):
start, stop, step = val.start, val.stop, val.step
if step is None:
step = 1
if start is None:
start = 0
if stop is None:
stop = self.size
if start < 0:
start = self.size + start
if stop < 0:
stop = self.size + stop
indices = range(start, stop, step)
return [self.list[(self.offset + i) % self.maxsize] for i in indices if i < self.size]
else:
raise TypeError('Invalid argument type')
if __name__ == "__main__":
c = Circle(5)
c.append(1)
print(c.list)
print(c[:])
assert c[0] == 1
assert c[:5] == [1]
for i in range(2,5+1):
c.append(i)
print(c.list)
print(c[:])
assert c[0] == 1
assert c[:5] == [1,2,3,4,5]
for i in range(5+1,9+1):
c.append(i)
print(c.list)
print(c[:])
assert c[0] == 5
assert c[:5] == [5,6,7,8,9]
#assert c[:-5] == [5,6,7,8,9]
assert c[:10] == [5,6,7,8,9]
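A short usage sketch for the two helpers above, assuming the file is importable as `util` (as the chat example does with `import util`):

```python
from util import IterSearch, Circle

# IterSearch withholds tokens while they could still be the start of the pattern
# and swallows the pattern entirely once it completes.
search = IterSearch([1, 2, 3])
out = []
for token in [9, 1, 2, 3, 4]:
    out += search(token)
print(out)  # [9, 4] -- the 1, 2, 3 run is suppressed

# Circle keeps only the most recent `size` items, similar to a last_n_tokens window.
ring = Circle(3)
for i in range(1, 6):
    ring.append(i)
print(ring[:])  # [3, 4, 5]
```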

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -1,4 +1,4 @@
-from typing import List, Optional, Dict, Union
+from typing import Any, List, Optional, Dict, Union
from typing_extensions import TypedDict, NotRequired, Literal
@ -22,9 +22,9 @@ class Embedding(TypedDict):
class CompletionLogprobs(TypedDict):
text_offset: List[int]
-token_logprobs: List[float]
+token_logprobs: List[Optional[float]]
tokens: List[str]
-top_logprobs: List[Dict[str, float]]
+top_logprobs: List[Optional[Dict[str, float]]]

class CompletionChoice(TypedDict):
@ -58,7 +58,7 @@ class Completion(TypedDict):
class ChatCompletionMessage(TypedDict):
-role: Union[Literal["assistant"], Literal["user"], Literal["system"]]
+role: Literal["assistant", "user", "system"]
content: str
user: NotRequired[str]
@ -77,6 +77,8 @@ class ChatCompletion(TypedDict):
choices: List[ChatCompletionChoice]
usage: CompletionUsage

+class ChatCompletionChunkDeltaEmpty(TypedDict):
+pass

class ChatCompletionChunkDelta(TypedDict):
role: NotRequired[Literal["assistant"]]
@ -85,7 +87,7 @@ class ChatCompletionChunkDelta(TypedDict):
class ChatCompletionChunkChoice(TypedDict):
index: int
-delta: ChatCompletionChunkDelta
+delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty]
finish_reason: Optional[str]

View file

View file

@ -3,267 +3,48 @@
To run this example:

```bash
-pip install fastapi uvicorn sse-starlette
+pip install fastapi uvicorn sse-starlette pydantic-settings
export MODEL=../models/7B/...
-uvicorn fastapi_server_chat:app --reload
+```
+Then run:
+```
+uvicorn llama_cpp.server.app:app --reload
+```
+or
+```
+python3 -m llama_cpp.server
```

Then visit http://localhost:8000/docs to see the interactive API docs.
"""
import os
-import json
-from typing import List, Optional, Literal, Union, Iterator, Dict
-from typing_extensions import TypedDict
-import llama_cpp
+import argparse
+import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict
from sse_starlette.sse import EventSourceResponse
class Settings(BaseSettings):
model: str
n_ctx: int = 2048
n_batch: int = 8
n_threads: int = ((os.cpu_count() or 2) // 2) or 1
f16_kv: bool = True
use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out...
embedding: bool = True
last_n_tokens_size: int = 64
app = FastAPI(
title="🦙 llama.cpp Python API",
version="0.0.1",
)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
settings = Settings()
llama = llama_cpp.Llama(
settings.model,
f16_kv=settings.f16_kv,
use_mlock=settings.use_mlock,
embedding=settings.embedding,
n_threads=settings.n_threads,
n_batch=settings.n_batch,
n_ctx=settings.n_ctx,
last_n_tokens_size=settings.last_n_tokens_size,
)
class CreateCompletionRequest(BaseModel):
prompt: Union[str, List[str]]
suffix: Optional[str] = Field(None)
max_tokens: int = 16
temperature: float = 0.8
top_p: float = 0.95
echo: bool = False
stop: List[str] = []
stream: bool = False
# ignored or currently unsupported
model: Optional[str] = Field(None)
n: Optional[int] = 1
logprobs: Optional[int] = Field(None)
presence_penalty: Optional[float] = 0
frequency_penalty: Optional[float] = 0
best_of: Optional[int] = 1
logit_bias: Optional[Dict[str, float]] = Field(None)
user: Optional[str] = Field(None)
# llama.cpp specific parameters
top_k: int = 40
repeat_penalty: float = 1.1
class Config:
schema_extra = {
"example": {
"prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
"stop": ["\n", "###"],
}
}
CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
@app.post(
"/v1/completions",
response_model=CreateCompletionResponse,
)
def create_completion(request: CreateCompletionRequest):
if isinstance(request.prompt, list):
request.prompt = "".join(request.prompt)
completion_or_chunks = llama(
**request.dict(
exclude={
"model",
"n",
"logprobs",
"frequency_penalty",
"presence_penalty",
"best_of",
"logit_bias",
"user",
}
)
)
if request.stream:
chunks: Iterator[llama_cpp.CompletionChunk] = completion_or_chunks # type: ignore
return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks)
completion: llama_cpp.Completion = completion_or_chunks # type: ignore
return completion
class CreateEmbeddingRequest(BaseModel):
model: Optional[str]
input: str
user: Optional[str]
class Config:
schema_extra = {
"example": {
"input": "The food was delicious and the waiter...",
}
}
CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)
@app.post(
"/v1/embeddings",
response_model=CreateEmbeddingResponse,
)
def create_embedding(request: CreateEmbeddingRequest):
return llama.create_embedding(**request.dict(exclude={"model", "user"}))
class ChatCompletionRequestMessage(BaseModel):
role: Union[Literal["system"], Literal["user"], Literal["assistant"]]
content: str
user: Optional[str] = None
class CreateChatCompletionRequest(BaseModel):
model: Optional[str]
messages: List[ChatCompletionRequestMessage]
temperature: float = 0.8
top_p: float = 0.95
stream: bool = False
stop: List[str] = []
max_tokens: int = 128
# ignored or currently unsupported
model: Optional[str] = Field(None)
n: Optional[int] = 1
presence_penalty: Optional[float] = 0
frequency_penalty: Optional[float] = 0
logit_bias: Optional[Dict[str, float]] = Field(None)
user: Optional[str] = Field(None)
# llama.cpp specific parameters
repeat_penalty: float = 1.1
class Config:
schema_extra = {
"example": {
"messages": [
ChatCompletionRequestMessage(
role="system", content="You are a helpful assistant."
),
ChatCompletionRequestMessage(
role="user", content="What is the capital of France?"
),
]
}
}
CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion)
@app.post(
"/v1/chat/completions",
response_model=CreateChatCompletionResponse,
)
async def create_chat_completion(
request: CreateChatCompletionRequest,
) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
completion_or_chunks = llama.create_chat_completion(
**request.dict(
exclude={
"model",
"n",
"presence_penalty",
"frequency_penalty",
"logit_bias",
"user",
}
),
)
if request.stream:
async def server_sent_events(
chat_chunks: Iterator[llama_cpp.ChatCompletionChunk],
):
for chat_chunk in chat_chunks:
yield dict(data=json.dumps(chat_chunk))
yield dict(data="[DONE]")
chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore
return EventSourceResponse(
server_sent_events(chunks),
)
completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore
return completion
class ModelData(TypedDict):
id: str
object: Literal["model"]
owned_by: str
permissions: List[str]
class ModelList(TypedDict):
object: Literal["list"]
data: List[ModelData]
GetModelResponse = create_model_from_typeddict(ModelList)
@app.get("/v1/models", response_model=GetModelResponse)
def get_models() -> ModelList:
return {
"object": "list",
"data": [
{
"id": llama.model_path,
"object": "model",
"owned_by": "me",
"permissions": [],
}
],
}
from llama_cpp.server.app import create_app, Settings
if __name__ == "__main__": if __name__ == "__main__":
import os parser = argparse.ArgumentParser()
import uvicorn for name, field in Settings.__model_fields__.items():
description = field.field_info.description
if field.default is not None and description is not None:
description += f" (default: {field.default})"
parser.add_argument(
f"--{name}",
dest=name,
type=field.type_,
help=description,
)
args = parser.parse_args()
settings = Settings(**{k: v for k, v in vars(args).items() if v is not None})
app = create_app(settings=settings)
uvicorn.run(
-app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000))
+app, host=os.getenv("HOST", settings.host), port=int(os.getenv("PORT", settings.port))
)
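The rewritten `__main__` builds its CLI from the `Settings` fields and then hands the resulting app to uvicorn. A minimal programmatic sketch of the same startup path, assuming the package is installed and that the model path below (hypothetical) exists:

```python
import uvicorn
from llama_cpp.server.app import create_app, Settings

# Roughly equivalent to: python3 -m llama_cpp.server --model ./models/7B/ggml-model.bin
settings = Settings(model="./models/7B/ggml-model.bin", n_ctx=2048)
app = create_app(settings=settings)
uvicorn.run(app, host=settings.host, port=settings.port)
```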

568
llama_cpp/server/app.py Normal file
View file

@ -0,0 +1,568 @@
import json
import multiprocessing
from threading import Lock
from functools import partial
from typing import Iterator, List, Optional, Union, Dict
from typing_extensions import TypedDict, Literal
import llama_cpp
import anyio
from anyio.streams.memory import MemoryObjectSendStream
from starlette.concurrency import run_in_threadpool, iterate_in_threadpool
from fastapi import Depends, FastAPI, APIRouter, Request
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings
from sse_starlette.sse import EventSourceResponse
class Settings(BaseSettings):
model: str = Field(
description="The path to the model to use for generating completions."
)
model_alias: Optional[str] = Field(
default=None,
description="The alias of the model to use for generating completions.",
)
n_ctx: int = Field(default=2048, ge=1, description="The context size.")
n_gpu_layers: int = Field(
default=0,
ge=0,
description="The number of layers to put on the GPU. The rest will be on the CPU.",
)
seed: int = Field(
default=1337, description="Random seed. -1 for random."
)
n_batch: int = Field(
default=512, ge=1, description="The batch size to use per eval."
)
n_threads: int = Field(
default=max(multiprocessing.cpu_count() // 2, 1),
ge=1,
description="The number of threads to use.",
)
f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.")
use_mlock: bool = Field(
default=llama_cpp.llama_mlock_supported(),
description="Use mlock.",
)
use_mmap: bool = Field(
default=llama_cpp.llama_mmap_supported(),
description="Use mmap.",
)
embedding: bool = Field(default=True, description="Whether to use embeddings.")
low_vram: bool = Field(
default=False,
description="Whether to use less VRAM. This will reduce performance.",
)
last_n_tokens_size: int = Field(
default=64,
ge=0,
description="Last n tokens to keep for repeat penalty calculation.",
)
logits_all: bool = Field(default=True, description="Whether to return logits.")
cache: bool = Field(
default=False,
description="Use a cache to reduce processing times for evaluated prompts.",
)
cache_type: Literal["ram", "disk"] = Field(
default="ram",
description="The type of cache to use. Only used if cache is True.",
)
cache_size: int = Field(
default=2 << 30,
description="The size of the cache in bytes. Only used if cache is True.",
)
vocab_only: bool = Field(
default=False, description="Whether to only return the vocabulary."
)
verbose: bool = Field(
default=True, description="Whether to print debug information."
)
host: str = Field(
default="localhost", description="Listen address"
)
port: int = Field(
default=8000, description="Listen port"
)
interrupt_requests: bool = Field(
default=True,
description="Whether to interrupt requests when a new request is received.",
)
router = APIRouter()
settings: Optional[Settings] = None
llama: Optional[llama_cpp.Llama] = None
def create_app(settings: Optional[Settings] = None):
if settings is None:
settings = Settings()
app = FastAPI(
title="🦙 llama.cpp Python API",
version="0.0.1",
)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
app.include_router(router)
global llama
llama = llama_cpp.Llama(
model_path=settings.model,
n_gpu_layers=settings.n_gpu_layers,
seed=settings.seed,
f16_kv=settings.f16_kv,
use_mlock=settings.use_mlock,
use_mmap=settings.use_mmap,
embedding=settings.embedding,
logits_all=settings.logits_all,
n_threads=settings.n_threads,
n_batch=settings.n_batch,
n_ctx=settings.n_ctx,
last_n_tokens_size=settings.last_n_tokens_size,
vocab_only=settings.vocab_only,
verbose=settings.verbose,
)
if settings.cache:
if settings.cache_type == "disk":
if settings.verbose:
print(f"Using disk cache with size {settings.cache_size}")
cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size)
else:
if settings.verbose:
print(f"Using ram cache with size {settings.cache_size}")
cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size)
cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size)
llama.set_cache(cache)
def set_settings(_settings: Settings):
global settings
settings = _settings
set_settings(settings)
return app
llama_outer_lock = Lock()
llama_inner_lock = Lock()
def get_llama():
# NOTE: This double lock allows the currently streaming llama model to
# check if any other requests are pending in the same thread and cancel
# the stream if so.
llama_outer_lock.acquire()
release_outer_lock = True
try:
llama_inner_lock.acquire()
try:
llama_outer_lock.release()
release_outer_lock = False
yield llama
finally:
llama_inner_lock.release()
finally:
if release_outer_lock:
llama_outer_lock.release()
def get_settings():
yield settings
model_field = Field(description="The model to use for generating completions.")
max_tokens_field = Field(
default=16, ge=1, le=2048, description="The maximum number of tokens to generate."
)
temperature_field = Field(
default=0.8,
ge=0.0,
le=2.0,
description="Adjust the randomness of the generated text.\n\n"
+ "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.",
)
top_p_field = Field(
default=0.95,
ge=0.0,
le=1.0,
description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n"
+ "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.",
)
stop_field = Field(
default=None,
description="A list of tokens at which to stop generation. If None, no stop tokens are used.",
)
stream_field = Field(
default=False,
description="Whether to stream the results as they are generated. Useful for chatbots.",
)
top_k_field = Field(
default=40,
ge=0,
description="Limit the next token selection to the K most probable tokens.\n\n"
+ "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.",
)
repeat_penalty_field = Field(
default=1.1,
ge=0.0,
description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n"
+ "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.",
)
presence_penalty_field = Field(
default=0.0,
ge=-2.0,
le=2.0,
description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
)
frequency_penalty_field = Field(
default=0.0,
ge=-2.0,
le=2.0,
description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
)
mirostat_mode_field = Field(
default=0,
ge=0,
le=2,
description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)"
)
mirostat_tau_field = Field(
default=5.0,
ge=0.0,
le=10.0,
description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text"
)
mirostat_eta_field = Field(
default=0.1,
ge=0.001,
le=1.0,
description="Mirostat learning rate"
)
class CreateCompletionRequest(BaseModel):
prompt: Union[str, List[str]] = Field(
default="", description="The prompt to generate completions for."
)
suffix: Optional[str] = Field(
default=None,
description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.",
)
max_tokens: int = max_tokens_field
temperature: float = temperature_field
top_p: float = top_p_field
mirostat_mode: int = mirostat_mode_field
mirostat_tau: float = mirostat_tau_field
mirostat_eta: float = mirostat_eta_field
echo: bool = Field(
default=False,
description="Whether to echo the prompt in the generated text. Useful for chatbots.",
)
stop: Optional[Union[str, List[str]]] = stop_field
stream: bool = stream_field
logprobs: Optional[int] = Field(
default=None,
ge=0,
description="The number of logprobs to generate. If None, no logprobs are generated.",
)
presence_penalty: Optional[float] = presence_penalty_field
frequency_penalty: Optional[float] = frequency_penalty_field
logit_bias: Optional[Dict[str, float]] = Field(None)
logprobs: Optional[int] = Field(None)
# ignored or currently unsupported
model: Optional[str] = model_field
n: Optional[int] = 1
best_of: Optional[int] = 1
user: Optional[str] = Field(None)
# llama.cpp specific parameters
top_k: int = top_k_field
repeat_penalty: float = repeat_penalty_field
logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
class Config:
schema_extra = {
"example": {
"prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
"stop": ["\n", "###"],
}
}
def make_logit_bias_processor(
llama: llama_cpp.Llama,
logit_bias: Dict[str, float],
logit_bias_type: Optional[Literal["input_ids", "tokens"]],
):
if logit_bias_type is None:
logit_bias_type = "input_ids"
to_bias: Dict[int, float] = {}
if logit_bias_type == "input_ids":
for input_id, score in logit_bias.items():
input_id = int(input_id)
to_bias[input_id] = score
elif logit_bias_type == "tokens":
for token, score in logit_bias.items():
token = token.encode('utf-8')
for input_id in llama.tokenize(token, add_bos=False):
to_bias[input_id] = score
def logit_bias_processor(
input_ids: List[int],
scores: List[float],
) -> List[float]:
new_scores = [None] * len(scores)
for input_id, score in enumerate(scores):
new_scores[input_id] = score + to_bias.get(input_id, 0.0)
return new_scores
return logit_bias_processor
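A brief usage sketch for `make_logit_bias_processor` as the endpoints below wire it up, assuming `llama` is an already loaded `llama_cpp.Llama` instance and using a made-up token id:

```python
# "15043" is a hypothetical token id; biases are keyed by id when
# logit_bias_type is "input_ids", or by text when it is "tokens".
processor = make_logit_bias_processor(llama, {"15043": 5.0}, "input_ids")
completion = llama(
    "\n\n### Instructions:\nSay hello.\n\n### Response:\n",
    max_tokens=16,
    logits_processor=llama_cpp.LogitsProcessorList([processor]),
)
```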


@router.post(
    "/v1/completions",
)
async def create_completion(
    request: Request,
    body: CreateCompletionRequest,
    llama: llama_cpp.Llama = Depends(get_llama),
    settings: Settings = Depends(get_settings),
):
    if isinstance(body.prompt, list):
        assert len(body.prompt) <= 1
        body.prompt = body.prompt[0] if len(body.prompt) > 0 else ""

    exclude = {
        "n",
        "best_of",
        "logit_bias",
        "logit_bias_type",
        "user",
    }
    kwargs = body.dict(exclude=exclude)

    if body.logit_bias is not None:
        kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([
            make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type),
        ])

    if body.stream:
        send_chan, recv_chan = anyio.create_memory_object_stream(10)

        async def event_publisher(inner_send_chan: MemoryObjectSendStream):
            async with inner_send_chan:
                try:
                    iterator: Iterator[llama_cpp.CompletionChunk] = await run_in_threadpool(llama, **kwargs)  # type: ignore
                    async for chunk in iterate_in_threadpool(iterator):
                        await inner_send_chan.send(dict(data=json.dumps(chunk)))
                        if await request.is_disconnected():
                            raise anyio.get_cancelled_exc_class()()
                        if settings.interrupt_requests and llama_outer_lock.locked():
                            await inner_send_chan.send(dict(data="[DONE]"))
                            raise anyio.get_cancelled_exc_class()()
                    await inner_send_chan.send(dict(data="[DONE]"))
                except anyio.get_cancelled_exc_class() as e:
                    print("disconnected")
                    with anyio.move_on_after(1, shield=True):
                        print(
                            f"Disconnected from client (via refresh/close) {request.client}"
                        )
                        raise e

        return EventSourceResponse(
            recv_chan, data_sender_callable=partial(event_publisher, send_chan)
        )
    else:
        completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs)  # type: ignore
        return completion
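

# Illustrative sketch (not part of the upstream file): a minimal client call against
# the /v1/completions route above. The host/port and prompt text are assumptions; it
# requires the httpx package and a running server.
def _example_completion_request() -> None:
    import httpx

    response = httpx.post(
        "http://localhost:8000/v1/completions",
        json={"prompt": "The capital of France is", "max_tokens": 16},
        timeout=60.0,
    )
    # Non-streaming responses follow the OpenAI completion schema.
    print(response.json()["choices"][0]["text"])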


class CreateEmbeddingRequest(BaseModel):
    model: Optional[str] = model_field
    input: Union[str, List[str]] = Field(description="The input to embed.")
    user: Optional[str]

    class Config:
        schema_extra = {
            "example": {
                "input": "The food was delicious and the waiter...",
            }
        }


@router.post(
    "/v1/embeddings",
)
async def create_embedding(
    request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
):
    return await run_in_threadpool(
        llama.create_embedding, **request.dict(exclude={"user"})
    )
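

# Illustrative sketch (not part of the upstream file): calling /v1/embeddings with
# httpx. The host/port and the input string are assumptions.
def _example_embedding_request() -> None:
    import httpx

    response = httpx.post(
        "http://localhost:8000/v1/embeddings",
        json={"input": "The food was delicious and the waiter..."},
        timeout=60.0,
    )
    # Each entry in "data" carries one embedding vector.
    print(len(response.json()["data"][0]["embedding"]))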


class ChatCompletionRequestMessage(BaseModel):
    role: Literal["system", "user", "assistant"] = Field(
        default="user", description="The role of the message."
    )
    content: str = Field(default="", description="The content of the message.")


class CreateChatCompletionRequest(BaseModel):
    messages: List[ChatCompletionRequestMessage] = Field(
        default=[], description="A list of messages to generate completions for."
    )
    max_tokens: int = max_tokens_field
    temperature: float = temperature_field
    top_p: float = top_p_field
    mirostat_mode: int = mirostat_mode_field
    mirostat_tau: float = mirostat_tau_field
    mirostat_eta: float = mirostat_eta_field
    stop: Optional[List[str]] = stop_field
    stream: bool = stream_field
    presence_penalty: Optional[float] = presence_penalty_field
    frequency_penalty: Optional[float] = frequency_penalty_field
    logit_bias: Optional[Dict[str, float]] = Field(None)

    # ignored or currently unsupported
    model: Optional[str] = model_field
    n: Optional[int] = 1
    user: Optional[str] = Field(None)

    # llama.cpp specific parameters
    top_k: int = top_k_field
    repeat_penalty: float = repeat_penalty_field
    logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)

    class Config:
        schema_extra = {
            "example": {
                "messages": [
                    ChatCompletionRequestMessage(
                        role="system", content="You are a helpful assistant."
                    ),
                    ChatCompletionRequestMessage(
                        role="user", content="What is the capital of France?"
                    ),
                ]
            }
        }


@router.post(
    "/v1/chat/completions",
)
async def create_chat_completion(
    request: Request,
    body: CreateChatCompletionRequest,
    llama: llama_cpp.Llama = Depends(get_llama),
    settings: Settings = Depends(get_settings),
) -> llama_cpp.ChatCompletion:  # type: ignore
    exclude = {
        "n",
        "logit_bias",
        "logit_bias_type",
        "user",
    }
    kwargs = body.dict(exclude=exclude)

    if body.logit_bias is not None:
        kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([
            make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type),
        ])

    if body.stream:
        send_chan, recv_chan = anyio.create_memory_object_stream(10)

        async def event_publisher(inner_send_chan: MemoryObjectSendStream):
            async with inner_send_chan:
                try:
                    iterator: Iterator[llama_cpp.ChatCompletionChunk] = await run_in_threadpool(llama.create_chat_completion, **kwargs)  # type: ignore
                    async for chat_chunk in iterate_in_threadpool(iterator):
                        await inner_send_chan.send(dict(data=json.dumps(chat_chunk)))
                        if await request.is_disconnected():
                            raise anyio.get_cancelled_exc_class()()
                        if settings.interrupt_requests and llama_outer_lock.locked():
                            await inner_send_chan.send(dict(data="[DONE]"))
                            raise anyio.get_cancelled_exc_class()()
                    await inner_send_chan.send(dict(data="[DONE]"))
                except anyio.get_cancelled_exc_class() as e:
                    print("disconnected")
                    with anyio.move_on_after(1, shield=True):
                        print(
                            f"Disconnected from client (via refresh/close) {request.client}"
                        )
                        raise e

        return EventSourceResponse(
            recv_chan,
            data_sender_callable=partial(event_publisher, send_chan),
        )
    else:
        completion: llama_cpp.ChatCompletion = await run_in_threadpool(
            llama.create_chat_completion, **kwargs  # type: ignore
        )
        return completion
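

# Illustrative sketch (not part of the upstream file): streaming from the
# /v1/chat/completions route above with httpx. The host/port and the messages are
# assumptions; each server-sent event line carries a JSON chunk until "[DONE]".
def _example_chat_completion_stream() -> None:
    import httpx

    payload = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is the capital of France?"},
        ],
        "stream": True,
    }
    with httpx.stream(
        "POST", "http://localhost:8000/v1/chat/completions", json=payload, timeout=60.0
    ) as response:
        for line in response.iter_lines():
            if line.startswith("data: ") and line != "data: [DONE]":
                print(json.loads(line[len("data: "):]))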


class ModelData(TypedDict):
    id: str
    object: Literal["model"]
    owned_by: str
    permissions: List[str]


class ModelList(TypedDict):
    object: Literal["list"]
    data: List[ModelData]


@router.get("/v1/models")
async def get_models(
    settings: Settings = Depends(get_settings),
) -> ModelList:
    assert llama is not None
    return {
        "object": "list",
        "data": [
            {
                "id": settings.model_alias
                if settings.model_alias is not None
                else llama.model_path,
                "object": "model",
                "owned_by": "me",
                "permissions": [],
            }
        ],
    }
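

# Illustrative sketch (not part of the upstream file): listing the served model via
# the /v1/models route above. The host/port are assumptions.
def _example_list_models() -> None:
    import httpx

    models = httpx.get("http://localhost:8000/v1/models", timeout=10.0).json()
    print([m["id"] for m in models["data"]])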

784
poetry.lock generated

File diff suppressed because it is too large

3
poetry.toml Normal file
View file

@ -0,0 +1,3 @@
[virtualenvs]
in-project = true
prefer-active-python = true

pyproject.toml
View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "llama_cpp_python"
-version = "0.1.30"
+version = "0.1.68"
description = "Python bindings for the llama.cpp library"
authors = ["Andrei Betlen <abetlen@gmail.com>"]
license = "MIT"
@ -14,16 +14,25 @@ include = [
[tool.poetry.dependencies]
python = "^3.8.1"
-typing-extensions = "^4.5.0"
+typing-extensions = "^4.7.1"
+numpy = "^1.24.4"
+diskcache = "^5.6.1"
+uvicorn = { version = "^0.22.0", optional = true }
+fastapi = { version = "^0.99.1", optional = true }
+sse-starlette = { version = "^1.6.1", optional = true }

[tool.poetry.group.dev.dependencies]
-black = "^23.1.0"
+black = "^23.3.0"
twine = "^4.0.2"
-mkdocs = "^1.4.2"
+mkdocs = "^1.4.3"
-mkdocstrings = {extras = ["python"], version = "^0.20.0"}
+mkdocstrings = {extras = ["python"], version = "^0.22.0"}
-mkdocs-material = "^9.1.4"
+mkdocs-material = "^9.1.18"
-pytest = "^7.2.2"
+pytest = "^7.4.0"
+httpx = "^0.24.1"
+scikit-build = "0.17.6"

+[tool.poetry.extras]
+server = ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"]

[build-system]
requires = [
setup.py
View file

@ -10,17 +10,15 @@ setup(
    description="A Python wrapper for llama.cpp",
    long_description=long_description,
    long_description_content_type="text/markdown",
-    version="0.1.30",
+    version="0.1.68",
    author="Andrei Betlen",
    author_email="abetlen@gmail.com",
    license="MIT",
    package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"},
    packages=["llama_cpp", "llama_cpp.server"],
-    install_requires=[
-        "typing-extensions>=4.5.0",
-    ],
+    install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"],
    extras_require={
-        "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"],
+        "server": ["uvicorn>=0.22.1", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"],
    },
    python_requires=">=3.7",
    classifiers=[

tests/test_llama.py
View file

@ -14,14 +14,22 @@ def test_llama():
    assert llama.detokenize(llama.tokenize(text)) == text


+# @pytest.mark.skip(reason="need to update sample mocking")
def test_llama_patch(monkeypatch):
    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
+    n_vocab = llama_cpp.llama_n_vocab(llama.ctx)

    ## Set up mock function
    def mock_eval(*args, **kwargs):
        return 0

+    def mock_get_logits(*args, **kwargs):
+        return (llama_cpp.c_float * n_vocab)(
+            *[llama_cpp.c_float(0) for _ in range(n_vocab)]
+        )
+
    monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval)
+    monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)

    output_text = " jumps over the lazy dog."
    output_tokens = llama.tokenize(output_text.encode("utf-8"))
@ -36,7 +44,7 @@ def test_llama_patch(monkeypatch):
        else:
            return token_eos

-    monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_top_p_top_k", mock_sample)
+    monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_token", mock_sample)

    text = "The quick brown fox"
@ -82,6 +90,7 @@ def test_llama_patch(monkeypatch):
def test_llama_pickle():
    import pickle
    import tempfile
+
    fp = tempfile.TemporaryFile()
    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
    pickle.dump(llama, fp)
@ -94,3 +103,69 @@ def test_llama_pickle():
    text = b"Hello World"

    assert llama.detokenize(llama.tokenize(text)) == text
+
+
+def test_utf8(monkeypatch):
+    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
+    n_vocab = llama_cpp.llama_n_vocab(llama.ctx)
+
+    ## Set up mock function
+    def mock_eval(*args, **kwargs):
+        return 0
+
+    def mock_get_logits(*args, **kwargs):
+        return (llama_cpp.c_float * n_vocab)(
+            *[llama_cpp.c_float(0) for _ in range(n_vocab)]
+        )
+
+    monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval)
+    monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)
+
+    output_text = "😀"
+    output_tokens = llama.tokenize(output_text.encode("utf-8"))
+    token_eos = llama.token_eos()
+    n = 0
+
+    def mock_sample(*args, **kwargs):
+        nonlocal n
+        if n < len(output_tokens):
+            n += 1
+            return output_tokens[n - 1]
+        else:
+            return token_eos
+
+    monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_token", mock_sample)
+
+    ## Test basic completion with utf8 multibyte
+    n = 0  # reset
+    completion = llama.create_completion("", max_tokens=4)
+    assert completion["choices"][0]["text"] == output_text
+
+    ## Test basic completion with incomplete utf8 multibyte
+    n = 0  # reset
+    completion = llama.create_completion("", max_tokens=1)
+    assert completion["choices"][0]["text"] == ""
+
+
+def test_llama_server():
+    from fastapi.testclient import TestClient
+    from llama_cpp.server.app import create_app, Settings
+
+    settings = Settings(
+        model=MODEL,
+        vocab_only=True,
+    )
+    app = create_app(settings)
+    client = TestClient(app)
+    response = client.get("/v1/models")
+    assert response.json() == {
+        "object": "list",
+        "data": [
+            {
+                "id": MODEL,
+                "object": "model",
+                "owned_by": "me",
+                "permissions": [],
+            }
+        ],
+    }

2
vendor/llama.cpp vendored

@ -1 +1 @@
-Subproject commit 180b693a47b6b825288ef9f2c39d24b6eea4eea6
+Subproject commit 061f5f8d2109bb7adcbd40f1b456d887c5a1df25