Merge branch 'main' into add_unlimited_max_tokens
Commit 5d756de314: 50 changed files with 5094 additions and 1205 deletions.
.dockerignore (new file, 166 lines)
@@ -0,0 +1,166 @@
_skbuild/
|
||||
|
||||
.envrc
|
||||
|
||||
models/
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
.idea/
|
.github/ISSUE_TEMPLATE/bug_report.md (new file, 96 lines, vendored)
@@ -0,0 +1,96 @@
---
|
||||
name: Bug report
|
||||
about: Create a report to help us improve
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
# Prerequisites
|
||||
|
||||
Please answer the following questions for yourself before submitting an issue.
|
||||
|
||||
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
|
||||
- [ ] I carefully followed the [README.md](https://github.com/abetlen/llama-cpp-python/blob/main/README.md).
|
||||
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
|
||||
- [ ] I reviewed the [Discussions](https://github.com/abetlen/llama-cpp-python/discussions), and have a new bug or useful enhancement to share.
|
||||
|
||||
# Expected Behavior
|
||||
|
||||
Please provide a detailed written description of what you were trying to do, and what you expected `llama-cpp-python` to do.
|
||||
|
||||
# Current Behavior
|
||||
|
||||
Please provide a detailed written description of what `llama-cpp-python` did, instead.
|
||||
|
||||
# Environment and Context
|
||||
|
||||
Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
|
||||
|
||||
* Physical (or virtual) hardware you are using, e.g. for Linux:
|
||||
|
||||
`$ lscpu`
|
||||
|
||||
* Operating System, e.g. for Linux:
|
||||
|
||||
`$ uname -a`
|
||||
|
||||
* SDK version, e.g. for Linux:
|
||||
|
||||
```
|
||||
$ python3 --version
|
||||
$ make --version
|
||||
$ g++ --version
|
||||
```
|
||||
|
||||
# Failure Information (for bugs)
|
||||
|
||||
Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.
|
||||
|
||||
# Steps to Reproduce
|
||||
|
||||
Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better.
|
||||
|
||||
1. step 1
|
||||
2. step 2
|
||||
3. step 3
|
||||
4. etc.
|
||||
|
||||
**Note: Many issues seem to be regarding functional or performance issues / differences with `llama.cpp`. In these cases we need to confirm that you're comparing against the version of `llama.cpp` that was built with your python package, and which parameters you're passing to the context.**
|
||||
|
||||
Try the following:
|
||||
|
||||
1. `git clone https://github.com/abetlen/llama-cpp-python`
|
||||
2. `cd llama-cpp-python`
|
||||
3. `rm -rf _skbuild/` # delete any old builds
|
||||
4. `python setup.py develop`
|
||||
5. `cd ./vendor/llama.cpp`
|
||||
6. Follow [llama.cpp's instructions](https://github.com/ggerganov/llama.cpp#build) to `cmake` llama.cpp
|
||||
7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggerganov/llama.cpp/issues)
|
||||
|
||||
# Failure Logs
|
||||
|
||||
Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes.
|
||||
|
||||
Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability.
|
||||
|
||||
Example environment info:
|
||||
```
|
||||
llama-cpp-python$ git log | head -1
|
||||
commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2
|
||||
|
||||
llama-cpp-python$ python3 --version
|
||||
Python 3.10.10
|
||||
|
||||
llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette|numpy"
|
||||
fastapi 0.95.0
|
||||
numpy 1.24.3
|
||||
sse-starlette 1.3.3
|
||||
uvicorn 0.21.1
|
||||
|
||||
llama-cpp-python/vendor/llama.cpp$ git log | head -3
|
||||
commit 66874d4fbcc7866377246efbcee938e8cc9c7d76
|
||||
Author: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
|
||||
Date: Thu May 25 20:18:01 2023 -0600
|
||||
```
|
.github/ISSUE_TEMPLATE/feature_request.md (new file, 20 lines, vendored)
@@ -0,0 +1,20 @@
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.
.github/dependabot.yml (new file, 11 lines, vendored)
@@ -0,0 +1,11 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates

version: 2
updates:
  - package-ecosystem: "pip" # See documentation for possible values
    directory: "/" # Location of package manifests
    schedule:
      interval: "weekly"
.github/workflows/build-docker.yaml (new file, 39 lines, vendored)
@@ -0,0 +1,39 @@
name: Build Docker
|
||||
|
||||
on: workflow_dispatch
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
packages: write
|
||||
|
||||
jobs:
|
||||
docker:
|
||||
name: Build and push Docker image
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: "true"
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
|
||||
- name: Login to GitHub Container Registry
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Build and push
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
push: true # push to registry
|
||||
pull: true # always fetch the latest base images
|
||||
platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64
|
||||
tags: ghcr.io/abetlen/llama-cpp-python:latest
|
.github/workflows/publish.yaml (2 changed lines, vendored)
@@ -28,4 +28,4 @@ jobs:
# if: startsWith(github.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN }}
password: ${{ secrets.PYPI_API_TOKEN }}
.github/workflows/test-pypi.yaml (new file, 64 lines, vendored)
@@ -0,0 +1,64 @@
name: Tests for PyPI package
|
||||
|
||||
on: workflow_dispatch
|
||||
|
||||
jobs:
|
||||
build-linux:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
|
||||
|
||||
steps:
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python3 -m pip install --upgrade pip
|
||||
python3 -m pip install --verbose llama-cpp-python[server,test]
|
||||
- name: Test with pytest
|
||||
run: |
|
||||
python3 -c "import llama_cpp"
|
||||
|
||||
build-windows:
|
||||
|
||||
runs-on: windows-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
|
||||
|
||||
steps:
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python3 -m pip install --upgrade pip
|
||||
python3 -m pip install --verbose llama-cpp-python[server,test]
|
||||
- name: Test with pytest
|
||||
run: |
|
||||
python3 -c "import llama_cpp"
|
||||
|
||||
build-macos:
|
||||
|
||||
runs-on: macos-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
|
||||
|
||||
steps:
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python3 -m pip install --upgrade pip
|
||||
python3 -m pip install --verbose llama-cpp-python[server,test]
|
||||
- name: Test with pytest
|
||||
run: |
|
||||
python3 -c "import llama_cpp"
|
.github/workflows/test.yaml (6 changed lines, vendored)
@@ -26,7 +26,7 @@ jobs:
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip pytest cmake scikit-build setuptools
|
||||
python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings
|
||||
pip install . -v
|
||||
- name: Test with pytest
|
||||
run: |
|
||||
|
@@ -49,7 +49,7 @@ jobs:
|
|||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip pytest cmake scikit-build setuptools
|
||||
python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings
|
||||
pip install . -v
|
||||
- name: Test with pytest
|
||||
run: |
|
||||
|
@@ -72,7 +72,7 @@ jobs:
|
|||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip pytest cmake scikit-build setuptools
|
||||
python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings
|
||||
pip install . -v
|
||||
- name: Test with pytest
|
||||
run: |
|
||||
|
|
.gitignore (9 changed lines, vendored)
@@ -1,3 +1,5 @@
.vscode/

_skbuild/

.envrc
@@ -11,6 +13,10 @@ __pycache__/

# C extensions
*.so
*.dylib
*.metal
*.dll
*.lib

# Distribution / packaging
.Python
@@ -164,3 +170,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# downloaded model .bin files
docker/open_llama/*.bin
.gitmodules (2 changed lines, vendored)
@@ -1,3 +1,3 @@
[submodule "vendor/llama.cpp"]
path = vendor/llama.cpp
url = git@github.com:ggerganov/llama.cpp.git
url = https://github.com/ggerganov/llama.cpp.git
.readthedocs.yaml (new file, 24 lines)
@@ -0,0 +1,24 @@
# Read the Docs configuration file for MkDocs projects
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Set the version of Python and other tools you might need
build:
  os: ubuntu-22.04
  tools:
    python: "3.11"

mkdocs:
  configuration: mkdocs.yml

python:
  install:
    - method: pip
      path: .
    - requirements: docs/requirements.txt

submodules:
  include: all
  recursive: true
CHANGELOG.md (new file, 117 lines)
@@ -0,0 +1,117 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

## [Added]

- (server) Streaming requests can now be interrupted prematurely when a concurrent request is made. This can be controlled with the `interrupt_requests` setting.

## [0.1.68]

## [Added]

- (llama.cpp) Update llama.cpp

## [0.1.67]

## Fixed

- Fix performance bug in Llama model by pre-allocating memory for tokens and logits.
- Fix bug in Llama model where the model was not freed after use.

## [0.1.66]

## Added

- (llama.cpp) New model API

## Fixed

- Performance issue during eval caused by looped np.concatenate call
- State pickling issue when saving cache to disk

## [0.1.65]

### Added

- (llama.cpp) Fix struct misalignment bug

## [0.1.64]

### Added

- (llama.cpp) Update llama.cpp
- Fix docs for seed. Set -1 for random.

## [0.1.63]

### Added

- (llama.cpp) Add full gpu utilisation in CUDA
- (llama.cpp) Add get_vocab
- (llama.cpp) Add low_vram parameter
- (server) Add logit_bias parameter

## [0.1.62]

### Fixed

- Metal support working
- Cache re-enabled

## [0.1.61]

### Fixed

- Fix broken pip installation

## [0.1.60]

### NOTE

- This release was deleted due to a bug with the packaging system that caused pip installations to fail.

### Fixed

- Truncate max_tokens in create_completion so requested tokens don't exceed context size.
- Temporarily disable cache for completion requests

## [v0.1.59]

### Added

- (llama.cpp) k-quants support
- (server) mirostat sampling parameters to server

### Fixed

- Support both `.so` and `.dylib` for `libllama` on MacOS

## [v0.1.58]

### Added

- (llama.cpp) Metal Silicon support

## [v0.1.57]

### Added

- (llama.cpp) OpenLlama 3B support

## [v0.1.56]

### Added

- (misc) Added first version of the changelog
- (server) Use async routes
- (python-api) Use numpy for internal buffers to reduce memory usage and improve performance.

### Fixed

- (python-api) Performance bug in stop sequence check slowing down streaming.
CMakeLists.txt (changed)
@@ -2,7 +2,11 @@ cmake_minimum_required(VERSION 3.4...3.22)

project(llama_cpp)

if (UNIX)
option(FORCE_CMAKE "Force CMake build of Python bindings" OFF)

set(FORCE_CMAKE $ENV{FORCE_CMAKE})

if (UNIX AND NOT FORCE_CMAKE)
add_custom_command(
OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so
COMMAND make libllama.so
@@ -23,5 +27,8 @@ else()
TARGETS llama
LIBRARY DESTINATION llama_cpp
RUNTIME DESTINATION llama_cpp
ARCHIVE DESTINATION llama_cpp
FRAMEWORK DESTINATION llama_cpp
RESOURCE DESTINATION llama_cpp
)
endif(UNIX)
endif()
Makefile (new file, 66 lines)
@@ -0,0 +1,66 @@
update:
|
||||
poetry install
|
||||
git submodule update --init --recursive
|
||||
|
||||
update.vendor:
|
||||
cd vendor/llama.cpp && git pull origin master
|
||||
|
||||
build:
|
||||
python3 setup.py develop
|
||||
|
||||
build.cuda:
|
||||
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 setup.py develop
|
||||
|
||||
build.opencl:
|
||||
CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 setup.py develop
|
||||
|
||||
build.openblas:
|
||||
CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 setup.py develop
|
||||
|
||||
build.blis:
|
||||
CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop
|
||||
|
||||
build.metal:
|
||||
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 python3 setup.py develop
|
||||
|
||||
build.sdist:
|
||||
python3 setup.py sdist
|
||||
|
||||
deploy.pypi:
|
||||
python3 -m twine upload dist/*
|
||||
|
||||
deploy.gh-docs:
|
||||
mkdocs build
|
||||
mkdocs gh-deploy
|
||||
|
||||
test:
|
||||
python3 -m pytest
|
||||
|
||||
docker:
|
||||
docker build -t llama-cpp-python:latest -f docker/simple/Dockerfile .
|
||||
|
||||
run-server:
|
||||
uvicorn --factory llama.server:app --host ${HOST} --port ${PORT}
|
||||
|
||||
clean:
|
||||
- cd vendor/llama.cpp && make clean
|
||||
- cd vendor/llama.cpp && rm libllama.so
|
||||
- rm -rf _skbuild
|
||||
- rm llama_cpp/*.so
|
||||
- rm llama_cpp/*.dylib
|
||||
- rm llama_cpp/*.metal
|
||||
- rm llama_cpp/*.dll
|
||||
- rm llama_cpp/*.lib
|
||||
|
||||
.PHONY: \
|
||||
update \
|
||||
update.vendor \
|
||||
build \
|
||||
build.cuda \
|
||||
build.opencl \
|
||||
build.openblas \
|
||||
build.sdist \
|
||||
deploy.pypi \
|
||||
deploy.gh-docs \
|
||||
docker \
|
||||
clean
|
README.md (116 changed lines)
@@ -1,6 +1,6 @@
# 🦙 Python Bindings for `llama.cpp`

[![Documentation](https://img.shields.io/badge/docs-passing-green.svg)](https://abetlen.github.io/llama-cpp-python)
[![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest)
[![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml)
[![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
@@ -15,16 +15,70 @@ This package provides:
- OpenAI-like API
- LangChain compatibility

## Installation
Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest).

Install from PyPI:

## Installation from PyPI (recommended)

Install from PyPI (requires a C compiler):

```bash
pip install llama-cpp-python
```

The above command will attempt to install the package and build `llama.cpp` from source.
This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system.

If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly:

```bash
pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir
```

Note: If you are using an Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports the arm64 architecture. For example:
```
wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
bash Miniforge3-MacOSX-arm64.sh
```
Otherwise, the installation will build the llama.cpp x86 version, which will be 10x slower on an Apple Silicon (M1) Mac.

### Installation with OpenBLAS / cuBLAS / CLBlast / Metal

`llama.cpp` supports multiple BLAS backends for faster processing.
Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend.

To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing:

```bash
CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
```

To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing:

```bash
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
```

To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing:

```bash
CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python
```

To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable before installing:

```bash
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python
```

Detailed MacOS Metal GPU install documentation is available at [docs/install/macos.md](docs/install/macos.md)

## High-level API

The high-level API provides a simple managed interface through the `Llama` class.

Below is a short example demonstrating how to use the high-level API to generate text:

```python
>>> from llama_cpp import Llama
>>> llm = Llama(model_path="./models/7B/ggml-model.bin")
@@ -51,6 +105,15 @@ pip install llama-cpp-python
}
```

### Adjusting the Context Window
The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but it can be adjusted based on your requirements.

For instance, if you want to work with larger contexts, you can expand the context window by setting the `n_ctx` parameter when initializing the `Llama` object:

```python
llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
```
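The `Llama` object also exposes an OpenAI-style chat interface through `create_chat_completion` (listed in the API reference). A minimal sketch, reusing the placeholder model path from the examples above:

```python
from llama_cpp import Llama

# Hypothetical model path; any compatible ggml model file works here.
llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)

# Messages use the same role/content layout as the OpenAI chat API.
response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
    max_tokens=32,
)
print(response["choices"][0]["message"]["content"])
```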
## Web Server

`llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API.
@@ -60,16 +123,40 @@ To install the server package and get started:

```bash
pip install llama-cpp-python[server]
export MODEL=./models/7B/ggml-model.bin
python3 -m llama_cpp.server
python3 -m llama_cpp.server --model models/7B/ggml-model.bin
```

Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation.
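Because the server mimics the OpenAI REST API, any HTTP client can exercise it. A small sketch using the third-party `requests` package; the prompt and stop sequences mirror the test request in `docker/open_llama/start.sh`, and the server is assumed to be running on localhost:8000:

```python
import requests  # pip install requests

payload = {
    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
    "stop": ["\n", "###"],
    "max_tokens": 32,
}
# /v1/completions is the OpenAI-compatible completion endpoint exposed by the server.
resp = requests.post("http://localhost:8000/v1/completions", json=payload)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```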
## Docker image

A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:

```bash
docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
```

## Low-level API

The low-level API is a direct `ctypes` binding to the C API provided by `llama.cpp`.
The entire API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and should mirror [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h).
The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`.
The entire low-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h).

Below is a short example demonstrating how to use the low-level API to tokenize a prompt:

```python
>>> import llama_cpp
>>> import ctypes
>>> params = llama_cpp.llama_context_default_params()
# use bytes for char * params
>>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params)
>>> max_tokens = params.n_ctx
# use ctypes arrays for array params
>>> tokens = (llama_cpp.llama_token * int(max_tokens))()
>>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True))
>>> llama_cpp.llama_free(ctx)
```
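To map the resulting token ids back to text (before the `llama_free` call above), something like the following should work. This is a sketch and assumes the binding exposes `llama_token_to_str`, which mirrors the function of the same name in `llama.h`:

```python
>>> # tokens[:n_tokens] holds the ids produced by llama_tokenize above
>>> text = b"".join(llama_cpp.llama_token_to_str(ctx, tokens[i]) for i in range(n_tokens))
>>> print(text.decode("utf-8", errors="ignore"))
```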
Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.


# Documentation
@@ -84,8 +171,19 @@ This package is under active development and I welcome any contributions.
To get started, clone the repository and install the package in development mode:

```bash
git clone git@github.com:abetlen/llama-cpp-python.git
git submodule update --init --recursive
git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git
cd llama-cpp-python

# Install with pip
pip install -e .

# if you want to use the fastapi / openapi server
pip install -e .[server]

# If you're a poetry user, installing will also include a virtual environment
poetry install --all-extras
. .venv/bin/activate

# Will need to be re-run any time vendor/llama.cpp is updated
python3 setup.py develop
```
docker/README.md (new file, 66 lines)
@@ -0,0 +1,66 @@
# Install Docker Server

**Note #1:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!

[Install Docker Engine](https://docs.docker.com/engine/install)

**Note #2:** NVidia GPU CuBLAS support requires an NVidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))

# Simple Dockerfiles for building the llama-cpp-python server with external model bin files
## openblas_simple - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image
```
cd ./openblas_simple
docker build -t openblas_simple .
docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t openblas_simple
```
where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.

## cuda_simple - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image
```
cd ./cuda_simple
docker build -t cuda_simple .
docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t cuda_simple
```
where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.

# "Open-Llama-in-a-box"
## Download an Apache V2.0 licensed 3B parameter Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server
```
$ cd ./open_llama
./build.sh
./start.sh
```

# Manually choose your own Llama model from Hugging Face
`python3 ./hug_model.py -a TheBloke -t llama`
You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
```
docker $ ls -lh *.bin
-rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>q5_1.bin
lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> <downloaded-model-file>q5_1.bin
```
**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least
**TWICE** as much disk space as the size of the model:

| Model | Quantized size |
|------:|----------------:|
| 3B | 3 GB |
| 7B | 5 GB |
| 13B | 10 GB |
| 33B | 25 GB |
| 65B | 50 GB |

**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`

## Use OpenBLAS
Use if you don't have an NVidia GPU. Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS:
### Build:
`docker build -t openblas .`
### Run:
`docker run --cap-add SYS_RESOURCE -t openblas`

## Use CuBLAS
### Build:
`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .`
### Run:
`docker run --cap-add SYS_RESOURCE -t cublas`
docker/cuda_simple/Dockerfile (new file, 16 lines)
@@ -0,0 +1,16 @@
ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
FROM nvidia/cuda:${CUDA_IMAGE}

# We need to set the host to 0.0.0.0 to allow outside access
ENV HOST 0.0.0.0

COPY . .

# Install the package
RUN apt update && apt install -y python3 python3-pip
RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings

RUN LLAMA_CUBLAS=1 pip install llama-cpp-python

# Run the server
CMD python3 -m llama_cpp.server
docker/open_llama/Dockerfile (new file, 51 lines)
@@ -0,0 +1,51 @@
# Define the image argument and provide a default value
|
||||
ARG IMAGE=python:3-slim-bullseye
|
||||
|
||||
# Use the image as specified
|
||||
FROM ${IMAGE}
|
||||
|
||||
# Re-declare the ARG after FROM
|
||||
ARG IMAGE
|
||||
|
||||
# Update and upgrade the existing packages
|
||||
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
|
||||
python3 \
|
||||
python3-pip \
|
||||
ninja-build \
|
||||
build-essential
|
||||
|
||||
RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
|
||||
|
||||
# Perform the conditional installations based on the image
|
||||
RUN echo "Image: ${IMAGE}" && \
|
||||
if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \
|
||||
echo "OpenBLAS install:" && \
|
||||
apt-get install -y --no-install-recommends libopenblas-dev && \
|
||||
LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \
|
||||
else \
|
||||
echo "CuBLAS install:" && \
|
||||
LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \
|
||||
fi
|
||||
|
||||
# Clean up apt cache
|
||||
RUN rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Set a working directory for better clarity
|
||||
WORKDIR /app
|
||||
|
||||
# Copy files to the app directory
|
||||
RUN echo "Installing model...this can take some time..."
|
||||
COPY ./model.bin /app/model.bin
|
||||
COPY ./start_server.sh /app/start_server.sh
|
||||
|
||||
# Make the server start script executable
|
||||
RUN chmod +x /app/start_server.sh
|
||||
|
||||
# Set environment variable for the host
|
||||
ENV HOST=0.0.0.0
|
||||
|
||||
# Expose a port for the server
|
||||
EXPOSE 8000
|
||||
|
||||
# Run the server start script
|
||||
CMD ["/bin/sh", "/app/start_server.sh"]
|
docker/open_llama/build.sh (new executable file, 14 lines)
@@ -0,0 +1,14 @@
#!/bin/sh

MODEL="open_llama_3b"
# Get open_llama_3b_ggml q5_1 quantization
python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1"
ls -lh *.bin

# Build the default OpenBLAS image
docker build -t $MODEL .
docker images | egrep "^(REPOSITORY|$MODEL)"

echo
echo "To start the docker container run:"
echo "docker run -t -p 8000:8000 $MODEL"
docker/open_llama/hug_model.py (new file, 139 lines)
@@ -0,0 +1,139 @@
import requests
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
import argparse
|
||||
|
||||
def make_request(url, params=None):
|
||||
print(f"Making request to {url}...")
|
||||
response = requests.get(url, params=params)
|
||||
if response.status_code == 200:
|
||||
return json.loads(response.text)
|
||||
else:
|
||||
print(f"Request failed with status code {response.status_code}")
|
||||
return None
|
||||
|
||||
def check_magic_and_version(filename):
|
||||
with open(filename, 'rb') as f:
|
||||
# Read the first 6 bytes from the file
|
||||
data = f.read(6)
|
||||
|
||||
# Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int
|
||||
# and the next 2 bytes as a little-endian unsigned short
|
||||
magic, version = struct.unpack('<I H', data)
|
||||
|
||||
print(f"magic: 0x{magic:08x}, version: 0x{version:04x}, file: {filename}")
|
||||
|
||||
return magic, version
|
||||
|
||||
def download_file(url, destination):
|
||||
print(f"Downloading {url} to {destination}...")
|
||||
response = requests.get(url, stream=True)
|
||||
if response.status_code == 200:
|
||||
with open(destination, 'wb') as f:
|
||||
total_downloaded = 0
|
||||
for chunk in response.iter_content(chunk_size=1024):
|
||||
if chunk: # filter out keep-alive new chunks
|
||||
f.write(chunk)
|
||||
total_downloaded += len(chunk)
|
||||
if total_downloaded >= 10485760: # 10 MB
|
||||
print('.', end='', flush=True)
|
||||
total_downloaded = 0
|
||||
print("\nDownload complete.")
|
||||
|
||||
# Creating a symbolic link from destination to "model.bin"
|
||||
if os.path.isfile("model.bin"):
|
||||
os.remove("model.bin") # remove the existing link if any
|
||||
os.symlink(destination, "model.bin")
|
||||
else:
|
||||
print(f"Download failed with status code {response.status_code}")
|
||||
|
||||
def get_user_choice(model_list):
|
||||
# Print the enumerated list
|
||||
print("\n")
|
||||
for i, (model_id, rfilename) in enumerate(model_list):
|
||||
print(f"{i+1}: Model ID: {model_id}, RFilename: {rfilename}")
|
||||
|
||||
# Get user's choice
|
||||
choice = input("Choose a model to download by entering the corresponding number: ")
|
||||
try:
|
||||
index = int(choice) - 1
|
||||
if 0 <= index < len(model_list):
|
||||
# Return the chosen model
|
||||
return model_list[index]
|
||||
else:
|
||||
print("Invalid choice.")
|
||||
except ValueError:
|
||||
print("Invalid input. Please enter a number corresponding to a model.")
|
||||
except IndexError:
|
||||
print("Invalid choice. Index out of range.")
|
||||
|
||||
return None
|
||||
|
||||
def main():
|
||||
# Create an argument parser
|
||||
parser = argparse.ArgumentParser(description='Process some parameters.')
|
||||
|
||||
# Arguments
|
||||
parser.add_argument('-v', '--version', type=int, default=0x0003,
|
||||
help='hexadecimal version number of ggml file')
|
||||
parser.add_argument('-a', '--author', type=str, default='TheBloke',
|
||||
help='HuggingFace author filter')
|
||||
parser.add_argument('-t', '--tag', type=str, default='llama',
|
||||
help='HuggingFace tag filter')
|
||||
parser.add_argument('-s', '--search', type=str, default='',
|
||||
help='HuggingFace search filter')
|
||||
parser.add_argument('-f', '--filename', type=str, default='q5_1',
|
||||
help='HuggingFace model repository filename substring match')
|
||||
|
||||
# Parse the arguments
|
||||
args = parser.parse_args()
|
||||
|
||||
# Define the parameters
|
||||
params = {
|
||||
"author": args.author,
|
||||
"tags": args.tag,
|
||||
"search": args.search
|
||||
}
|
||||
|
||||
models = make_request('https://huggingface.co/api/models', params=params)
|
||||
if models is None:
|
||||
return
|
||||
|
||||
model_list = []
|
||||
# Iterate over the models
|
||||
for model in models:
|
||||
model_id = model['id']
|
||||
model_info = make_request(f'https://huggingface.co/api/models/{model_id}')
|
||||
if model_info is None:
|
||||
continue
|
||||
|
||||
for sibling in model_info.get('siblings', []):
|
||||
rfilename = sibling.get('rfilename')
|
||||
if rfilename and args.filename in rfilename:
|
||||
model_list.append((model_id, rfilename))
|
||||
|
||||
# Choose the model
|
||||
model_list.sort(key=lambda x: x[0])
|
||||
if len(model_list) == 0:
|
||||
print("No models found")
|
||||
exit(1)
|
||||
elif len(model_list) == 1:
|
||||
model_choice = model_list[0]
|
||||
else:
|
||||
model_choice = get_user_choice(model_list)
|
||||
|
||||
if model_choice is not None:
|
||||
model_id, rfilename = model_choice
|
||||
url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
|
||||
dest = f"{model_id.replace('/', '_')}_{rfilename}"
|
||||
download_file(url, dest)
|
||||
_, version = check_magic_and_version(dest)
|
||||
if version != args.version:
|
||||
print(f"Warning: Expected version {args.version}, but found different version in the file.")
|
||||
else:
|
||||
print("Error - model choice was None")
|
||||
exit(2)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
docker/open_llama/start.sh (new executable file, 28 lines)
@@ -0,0 +1,28 @@
#!/bin/sh

MODEL="open_llama_3b"

# Start Docker container
docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL &
sleep 10
echo
docker ps | egrep "(^CONTAINER|$MODEL)"

# Test the model works
echo
curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
  "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
  "stop": [
    "\n",
    "###"
  ]
}' | grep Paris
if [ $? -eq 0 ]
then
  echo
  echo "$MODEL is working!!"
else
  echo
  echo "ERROR: $MODEL not replying."
  exit 1
fi
docker/open_llama/start_server.sh (new executable file, 11 lines)
@@ -0,0 +1,11 @@
#!/bin/sh

# For mlock support
ulimit -l unlimited

if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
    python3 -B -m llama_cpp.server --model /app/model.bin
else
    # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM
    python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000
fi
docker/openblas_simple/Dockerfile (new file, 15 lines)
@@ -0,0 +1,15 @@
FROM python:3-slim-bullseye

# We need to set the host to 0.0.0.0 to allow outside access
ENV HOST 0.0.0.0

COPY . .

# Install the package
RUN apt update && apt install -y libopenblas-dev ninja-build build-essential
RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings

RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose

# Run the server
CMD python3 -m llama_cpp.server
docker/simple/Dockerfile (new file, 33 lines)
@@ -0,0 +1,33 @@
# Define the image argument and provide a default value
|
||||
ARG IMAGE=python:3-slim-bullseye
|
||||
|
||||
# Use the image as specified
|
||||
FROM ${IMAGE}
|
||||
|
||||
# Re-declare the ARG after FROM
|
||||
ARG IMAGE
|
||||
|
||||
# Update and upgrade the existing packages
|
||||
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
|
||||
python3 \
|
||||
python3-pip \
|
||||
ninja-build \
|
||||
build-essential
|
||||
|
||||
RUN mkdir /app
|
||||
WORKDIR /app
|
||||
COPY . /app
|
||||
|
||||
RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
|
||||
|
||||
RUN make build && make clean
|
||||
|
||||
# Set environment variable for the host
|
||||
ENV HOST=0.0.0.0
|
||||
ENV PORT=8000
|
||||
|
||||
# Expose a port for the server
|
||||
EXPOSE 8000
|
||||
|
||||
# Run the server start script
|
||||
CMD ["/bin/sh", "/app/docker/simple/run.sh"]
|
docker/simple/run.sh (new file, 4 lines)
@@ -0,0 +1,4 @@
#!/bin/bash

make build
uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT
docs/api-reference.md (new file, 53 lines)
@@ -0,0 +1,53 @@
---
|
||||
title: API Reference
|
||||
---
|
||||
|
||||
::: llama_cpp.Llama
|
||||
options:
|
||||
members:
|
||||
- __init__
|
||||
- tokenize
|
||||
- detokenize
|
||||
- reset
|
||||
- eval
|
||||
- sample
|
||||
- generate
|
||||
- create_embedding
|
||||
- embed
|
||||
- create_completion
|
||||
- __call__
|
||||
- create_chat_completion
|
||||
- set_cache
|
||||
- save_state
|
||||
- load_state
|
||||
- token_bos
|
||||
- token_eos
|
||||
show_root_heading: true
|
||||
|
||||
::: llama_cpp.LlamaCache
|
||||
options:
|
||||
show_root_heading: true
|
||||
|
||||
::: llama_cpp.LlamaState
|
||||
options:
|
||||
show_root_heading: true
|
||||
|
||||
::: llama_cpp.LogitsProcessor
|
||||
options:
|
||||
show_root_heading: true
|
||||
|
||||
::: llama_cpp.LogitsProcessorList
|
||||
options:
|
||||
show_root_heading: true
|
||||
|
||||
::: llama_cpp.StoppingCriteria
|
||||
options:
|
||||
show_root_heading: true
|
||||
|
||||
::: llama_cpp.StoppingCriteriaList
|
||||
options:
|
||||
show_root_heading: true
|
||||
|
||||
::: llama_cpp.llama_cpp
|
||||
options:
|
||||
show_if_no_docstring: true
|
docs/index.md (changed)
@@ -87,31 +87,6 @@ git submodule update --init --recursive
python3 setup.py develop
|
||||
```
|
||||
|
||||
## API Reference
|
||||
|
||||
::: llama_cpp.Llama
|
||||
options:
|
||||
members:
|
||||
- __init__
|
||||
- tokenize
|
||||
- detokenize
|
||||
- reset
|
||||
- eval
|
||||
- sample
|
||||
- generate
|
||||
- create_embedding
|
||||
- embed
|
||||
- create_completion
|
||||
- __call__
|
||||
- create_chat_completion
|
||||
- token_bos
|
||||
- token_eos
|
||||
show_root_heading: true
|
||||
|
||||
::: llama_cpp.llama_cpp
|
||||
options:
|
||||
show_if_no_docstring: true
|
||||
|
||||
## License
|
||||
|
||||
This project is licensed under the terms of the MIT license.
|
docs/install/macos.md (new file, 59 lines)
@@ -0,0 +1,59 @@
---
title: MacOS Install with Metal GPU
---

**(1) Make sure you have xcode installed... at least the command line parts**
```
# check the path of your xcode install
xcode-select -p

# xcode installed returns
# /Applications/Xcode-beta.app/Contents/Developer

# if xcode is missing then install it... it takes ages;
xcode-select --install
```

**(2) Install the conda version for MacOS that supports Metal GPU**
```
wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
bash Miniforge3-MacOSX-arm64.sh
```

**(3) Make a conda environment**
```
conda create -n llama python=3.9.16
conda activate llama
```

**(4) Install the LATEST llama-cpp-python... which happily supports MacOS Metal GPU as of version 0.1.62**
*(you need xcode installed in order for pip to build/compile the C++ code)*
```
pip uninstall llama-cpp-python -y
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir
pip install 'llama-cpp-python[server]'

# you should now have llama-cpp-python v0.1.62 or higher installed
llama-cpp-python 0.1.68

```

**(5) Download a v3 ggml model**
- **ggmlv3**
- file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0

https://huggingface.co/TheBloke/open-llama-7b-open-instruct-GGML


**(6) Run the llama-cpp-python API server with MacOS Metal GPU support**
```
# config your ggml model path
# make sure it is ggml v3
# make sure it is q4_0
export MODEL=[path to your llama.cpp ggml models]/[ggml-model-name]q4_0.bin
python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1
```

***Note:** If you omit the `--n_gpu_layers 1` then CPU will be used*
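To confirm the Metal-enabled server responds, one option is to point the OpenAI Python client (0.x API shown) at it. This is a sketch and assumes the `openai` package is installed and the server from step (6) is still running:

```python
import openai  # pip install openai (0.x API)

openai.api_key = "sk-xxx"                     # any non-empty string; the local server ignores it
openai.api_base = "http://localhost:8000/v1"  # route requests to the llama-cpp-python server

completion = openai.Completion.create(
    model="local",  # the local server currently ignores the model field
    prompt="Q: Name the planets in the solar system? A: ",
    max_tokens=48,
    stop=["Q:", "\n\n"],
)
print(completion["choices"][0]["text"])
```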
docs/requirements.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
mkdocs
mkdocs-material
mkdocstrings[python]
@@ -4,259 +4,34 @@ To run this example:
|
||||
```bash
|
||||
pip install fastapi uvicorn sse-starlette
|
||||
export MODEL=../models/7B/ggml-model.bin
|
||||
uvicorn fastapi_server_chat:app --reload
|
||||
export MODEL=../models/7B/...
|
||||
```
|
||||
|
||||
Then run:
|
||||
```
|
||||
uvicorn llama_cpp.server.app:app --reload
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
python3 -m llama_cpp.server
|
||||
```
|
||||
|
||||
Then visit http://localhost:8000/docs to see the interactive API docs.
|
||||
|
||||
|
||||
To actually see the implementation of the server, see llama_cpp/server/app.py
|
||||
|
||||
"""
|
||||
import os
|
||||
import json
|
||||
from typing import List, Optional, Literal, Union, Iterator, Dict
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
import llama_cpp
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict
|
||||
from sse_starlette.sse import EventSourceResponse
|
||||
|
||||
|
||||
class Settings(BaseSettings):
|
||||
model: str
|
||||
n_ctx: int = 2048
|
||||
n_batch: int = 8
|
||||
n_threads: int = int(os.cpu_count() / 2) or 1
|
||||
f16_kv: bool = True
|
||||
use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out...
|
||||
embedding: bool = True
|
||||
last_n_tokens_size: int = 64
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
title="🦙 llama.cpp Python API",
|
||||
version="0.0.1",
|
||||
)
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
settings = Settings()
|
||||
llama = llama_cpp.Llama(
|
||||
settings.model,
|
||||
f16_kv=settings.f16_kv,
|
||||
use_mlock=settings.use_mlock,
|
||||
embedding=settings.embedding,
|
||||
n_threads=settings.n_threads,
|
||||
n_batch=settings.n_batch,
|
||||
n_ctx=settings.n_ctx,
|
||||
last_n_tokens_size=settings.last_n_tokens_size,
|
||||
)
|
||||
|
||||
|
||||
class CreateCompletionRequest(BaseModel):
|
||||
prompt: str
|
||||
suffix: Optional[str] = Field(None)
|
||||
max_tokens: int = 16
|
||||
temperature: float = 0.8
|
||||
top_p: float = 0.95
|
||||
echo: bool = False
|
||||
stop: List[str] = []
|
||||
stream: bool = False
|
||||
|
||||
# ignored or currently unsupported
|
||||
model: Optional[str] = Field(None)
|
||||
n: Optional[int] = 1
|
||||
logprobs: Optional[int] = Field(None)
|
||||
presence_penalty: Optional[float] = 0
|
||||
frequency_penalty: Optional[float] = 0
|
||||
best_of: Optional[int] = 1
|
||||
logit_bias: Optional[Dict[str, float]] = Field(None)
|
||||
user: Optional[str] = Field(None)
|
||||
|
||||
# llama.cpp specific parameters
|
||||
top_k: int = 40
|
||||
repeat_penalty: float = 1.1
|
||||
|
||||
class Config:
|
||||
schema_extra = {
|
||||
"example": {
|
||||
"prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
|
||||
"stop": ["\n", "###"],
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
|
||||
|
||||
|
||||
@app.post(
|
||||
"/v1/completions",
|
||||
response_model=CreateCompletionResponse,
|
||||
)
|
||||
def create_completion(request: CreateCompletionRequest):
|
||||
if request.stream:
|
||||
chunks: Iterator[llama_cpp.CompletionChunk] = llama(**request.dict()) # type: ignore
|
||||
return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks)
|
||||
return llama(
|
||||
**request.dict(
|
||||
exclude={
|
||||
"model",
|
||||
"n",
|
||||
"logprobs",
|
||||
"frequency_penalty",
|
||||
"presence_penalty",
|
||||
"best_of",
|
||||
"logit_bias",
|
||||
"user",
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class CreateEmbeddingRequest(BaseModel):
|
||||
model: Optional[str]
|
||||
input: str
|
||||
user: Optional[str]
|
||||
|
||||
class Config:
|
||||
schema_extra = {
|
||||
"example": {
|
||||
"input": "The food was delicious and the waiter...",
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)
|
||||
|
||||
|
||||
@app.post(
|
||||
"/v1/embeddings",
|
||||
response_model=CreateEmbeddingResponse,
|
||||
)
|
||||
def create_embedding(request: CreateEmbeddingRequest):
|
||||
return llama.create_embedding(**request.dict(exclude={"model", "user"}))
|
||||
|
||||
|
||||
class ChatCompletionRequestMessage(BaseModel):
|
||||
role: Union[Literal["system"], Literal["user"], Literal["assistant"]]
|
||||
content: str
|
||||
user: Optional[str] = None
|
||||
|
||||
|
||||
class CreateChatCompletionRequest(BaseModel):
|
||||
model: Optional[str]
|
||||
messages: List[ChatCompletionRequestMessage]
|
||||
temperature: float = 0.8
|
||||
top_p: float = 0.95
|
||||
stream: bool = False
|
||||
stop: List[str] = []
|
||||
max_tokens: int = 128
|
||||
|
||||
# ignored or currently unsupported
|
||||
model: Optional[str] = Field(None)
|
||||
n: Optional[int] = 1
|
||||
presence_penalty: Optional[float] = 0
|
||||
frequency_penalty: Optional[float] = 0
|
||||
logit_bias: Optional[Dict[str, float]] = Field(None)
|
||||
user: Optional[str] = Field(None)
|
||||
|
||||
# llama.cpp specific parameters
|
||||
repeat_penalty: float = 1.1
|
||||
|
||||
class Config:
|
||||
schema_extra = {
|
||||
"example": {
|
||||
"messages": [
|
||||
ChatCompletionRequestMessage(
|
||||
role="system", content="You are a helpful assistant."
|
||||
),
|
||||
ChatCompletionRequestMessage(
|
||||
role="user", content="What is the capital of France?"
|
||||
),
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion)
|
||||
|
||||
|
||||
@app.post(
|
||||
"/v1/chat/completions",
|
||||
response_model=CreateChatCompletionResponse,
|
||||
)
|
||||
async def create_chat_completion(
|
||||
request: CreateChatCompletionRequest,
|
||||
) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
|
||||
completion_or_chunks = llama.create_chat_completion(
|
||||
**request.dict(
|
||||
exclude={
|
||||
"model",
|
||||
"n",
|
||||
"presence_penalty",
|
||||
"frequency_penalty",
|
||||
"logit_bias",
|
||||
"user",
|
||||
}
|
||||
),
|
||||
)
|
||||
|
||||
if request.stream:
|
||||
|
||||
async def server_sent_events(
|
||||
chat_chunks: Iterator[llama_cpp.ChatCompletionChunk],
|
||||
):
|
||||
for chat_chunk in chat_chunks:
|
||||
yield dict(data=json.dumps(chat_chunk))
|
||||
yield dict(data="[DONE]")
|
||||
|
||||
chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore
|
||||
|
||||
return EventSourceResponse(
|
||||
server_sent_events(chunks),
|
||||
)
|
||||
completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore
|
||||
return completion
|
||||
|
||||
|
||||
class ModelData(TypedDict):
    id: str
    object: Literal["model"]
    owned_by: str
    permissions: List[str]


class ModelList(TypedDict):
    object: Literal["list"]
    data: List[ModelData]


GetModelResponse = create_model_from_typeddict(ModelList)


@app.get("/v1/models", response_model=GetModelResponse)
def get_models() -> ModelList:
    return {
        "object": "list",
        "data": [
            {
                "id": llama.model_path,
                "object": "model",
                "owned_by": "me",
                "permissions": [],
            }
        ],
    }
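As a quick check that the route above is wired up, a client can hit /v1/models and read back the model path the server reports; the sketch below assumes the server is running locally on port 8000 and uses the requests package.

```python
# Hypothetical client-side check of the /v1/models route above (assumes localhost:8000).
import requests

models = requests.get("http://localhost:8000/v1/models").json()
for model in models["data"]:
    print(model["id"], model["owned_by"])
```
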
import uvicorn

from llama_cpp.server.app import create_app

if __name__ == "__main__":
    import os
    import uvicorn
    app = create_app()

    uvicorn.run(app, host=os.getenv("HOST", "localhost"), port=os.getenv("PORT", 8000))
    uvicorn.run(
        app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000))
    )
examples/low_level_api/Chat.py  (new file, 71 lines)
@@ -0,0 +1,71 @@
#!/bin/python
import sys, os, datetime
from common import GptParams
from low_level_api_chat_cpp import LLaMAInteract

def env_or_def(env, default):
    if (env in os.environ):
        return os.environ[env]
    return default

AI_NAME = env_or_def("AI_NAME", "ChatLLaMa")
MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
USER_NAME = env_or_def("USER_NAME", "USER")
N_PREDICTS = int(env_or_def("N_PREDICTS", "2048"))
N_THREAD = int(env_or_def("N_THREAD", "8"))

today = datetime.datetime.today()
DATE_YEAR=today.strftime("%Y")
DATE_TIME=today.strftime("%H:%M")

prompt=f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
The transcript only includes text, it does not include markup like HTML and Markdown.

{USER_NAME}: Hello, {AI_NAME}!
{AI_NAME}: Hello {USER_NAME}! How may I help you today?
{USER_NAME}: What year is it?
{AI_NAME}: We are in {DATE_YEAR}.
{USER_NAME}: Please tell me the largest city in Europe.
{AI_NAME}: The largest city in Europe is Moscow, the capital of Russia.
{USER_NAME}: What can you tell me about Moscow?
{AI_NAME}: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia’s symbolic center.
{USER_NAME}: What is a cat?
{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{USER_NAME}: How do I pass command line arguments to a Node.js program?
{AI_NAME}: The arguments are stored in process.argv.

argv[0] is the path to the Node. js executable.
argv[1] is the path to the script file.
argv[2] is the first argument passed to the script.
argv[3] is the second argument passed to the script and so on.
{USER_NAME}: Name a color.
{AI_NAME}: Blue.
{USER_NAME}: What time is it?
{AI_NAME}: It is {DATE_TIME}.
{USER_NAME}:""" + " ".join(sys.argv[1:])

print("Loading model...")
params = GptParams(
    n_ctx=2048,
    temp=0.7,
    top_k=40,
    top_p=0.5,
    repeat_last_n=256,
    n_batch=1024,
    repeat_penalty=1.17647,
    model=MODEL,
    n_threads=N_THREAD,
    n_predict=N_PREDICTS,
    use_color=True,
    interactive=True,
    antiprompt=[f"{USER_NAME}:"],
    input_prefix=" ",
    input_suffix=f"{AI_NAME}:",
    prompt=prompt,
)

with LLaMAInteract(params) as m:
    m.interact()
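All of the constants above come from env_or_def, so the script can be reconfigured without editing it. A small, hypothetical launcher that overrides those environment variables before running Chat.py (the model path is a placeholder):

```python
# Hypothetical launcher: set Chat.py's environment-driven defaults, then execute it.
import os
import runpy

os.environ["MODEL"] = "./models/llama-7B/ggml-model.bin"  # placeholder path
os.environ["AI_NAME"] = "Assistant"
os.environ["N_THREAD"] = "4"

runpy.run_path("examples/low_level_api/Chat.py", run_name="__main__")
```
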
examples/low_level_api/Miku.py  (new file, 59 lines)
@@ -0,0 +1,59 @@
#!/bin/python
import sys, os
from common import GptParams
from low_level_api_chat_cpp import LLaMAInteract

def env_or_def(env, default):
    if (env in os.environ):
        return os.environ[env]
    return default

AI_NAME = env_or_def("AI_NAME", "Miku")
MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
USER_NAME = env_or_def("USER_NAME", "Anon")
N_PREDICTS = int(env_or_def("N_PREDICTS", "4096"))
N_THREAD = int(env_or_def("N_THREAD", "0"))

prompt=f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer.
{AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
{AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
{AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad.
{AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her.
The conversation is only between {USER_NAME} and {AI_NAME}
The conversation is only through text, so {AI_NAME} can't see {USER_NAME}'s face or hear his voice.
{AI_NAME} can only communicate through text, so she can't send images or videos.


{USER_NAME}: Hello!
{AI_NAME}: /think I wonder what I should say to {USER_NAME}? This is the first time we talk so it's important that I make a good first impression!
{AI_NAME}: Hi! I am {AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^
{AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
{USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
{AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
{AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that!
{AI_NAME}: What do you like to do in your free time? ^_^
{USER_NAME}:""" + " ".join(sys.argv[1:])

print("Loading model...")
params = GptParams(
    n_batch=1024,
    n_ctx=2048,
    n_keep=-1,
    repeat_last_n=256,
    repeat_penalty=1.17647,
    temp=0.7,
    top_k=40,
    top_p=0.5,
    model=MODEL,
    n_predict=N_PREDICTS,
    use_color=True,
    interactive=True,
    antiprompt=[f"{USER_NAME}:"],
    prompt=prompt,
)

if N_THREAD > 0:
    params.n_threads = N_THREAD

with LLaMAInteract(params) as m:
    m.interact()
examples/low_level_api/ReasonAct.py  (new file, 49 lines)
@@ -0,0 +1,49 @@
#!/bin/python
import sys, os, datetime
from common import GptParams
from low_level_api_chat_cpp import LLaMAInteract

def env_or_def(env, default):
    if (env in os.environ):
        return os.environ[env]
    return default

MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")

prompt=f"""You run in a loop of Thought, Action, Observation.
At the end of the loop either Answer or restate your Thought and Action.
Use Thought to describe your thoughts about the question you have been asked.
Use Action to run one of these actions available to you:
- calculate[python math expression]
Observation will be the result of running those actions


Question: What is 4 * 7 / 3?
Thought: Do I need to use an action? Yes, I use calculate to do math
Action: calculate[4 * 7 / 3]
Observation: 9.3333333333
Thought: Do I need to use an action? No, have the result
Answer: The calculate tool says it is 9.3333333333
Question: What is capital of france?
Thought: Do I need to use an action? No, I know the answer
Answer: Paris is the capital of France
Question:""" + " ".join(sys.argv[1:])

print("Loading model...")
params = GptParams(
    interactive=True,
    interactive_start=True,
    top_k=10000,
    temp=0.2,
    repeat_penalty=1,
    n_threads=7,
    n_ctx=2048,
    antiprompt=["Question:","Observation:"],
    model=MODEL,
    input_prefix=" ",
    n_predict=-1,
    prompt=prompt,
)

with LLaMAInteract(params) as m:
    m.interact()
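The prompt above only teaches the Thought/Action/Observation format; nothing in ReasonAct.py actually executes the calculate[...] action. A sketch of how a driver loop might parse and evaluate that action from generated text is shown below; parse_action is a hypothetical helper, not part of this example.

```python
# Hypothetical helper for a ReAct-style loop: extract and evaluate calculate[...] actions.
import re

def parse_action(text: str):
    """Return the result of a calculate[...] action found in `text`, or None."""
    match = re.search(r"Action: calculate\[(.+?)\]", text)
    if match is None:
        return None
    # eval() is tolerable here only because the prompt restricts actions to math
    # expressions; a real tool loop should use a safe expression evaluator.
    return eval(match.group(1), {"__builtins__": {}}, {})

print(parse_action("Action: calculate[4 * 7 / 3]"))  # 9.333333333333334
```
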
|
|
@ -1,8 +1,9 @@
|
|||
import os
|
||||
import argparse
|
||||
import re
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional
|
||||
from typing import List
|
||||
|
||||
# Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
|
||||
|
||||
|
@ -12,23 +13,36 @@ class GptParams:
|
|||
seed: int = -1
|
||||
n_threads: int = min(4, os.cpu_count() or 1)
|
||||
n_predict: int = 128
|
||||
repeat_last_n: int = 64
|
||||
n_parts: int = -1
|
||||
n_ctx: int = 512
|
||||
n_batch: int = 8
|
||||
n_keep: int = 0
|
||||
|
||||
ignore_eos: bool = False
|
||||
logit_bias: dict[int, float] = field(default_factory=dict)
|
||||
top_k: int = 40
|
||||
top_p: float = 0.95
|
||||
tfs_z: float = 1.00
|
||||
typical_p: float = 1.00
|
||||
temp: float = 0.80
|
||||
repeat_penalty: float = 1.10
|
||||
repeat_last_n: int = 64
|
||||
frequency_penalty: float = 0.0
|
||||
presence_penalty: float = 0.0
|
||||
mirostat: int = 0
|
||||
mirostat_tau: float = 5.0
|
||||
mirostat_eta: float = 0.1
|
||||
|
||||
model: str = "./models/llama-7B/ggml-model.bin"
|
||||
prompt: str = ""
|
||||
path_session: str = ""
|
||||
input_prefix: str = " "
|
||||
|
||||
input_suffix: str = ""
|
||||
antiprompt: List[str] = field(default_factory=list)
|
||||
|
||||
lora_adapter: str = ""
|
||||
lora_base: str = ""
|
||||
|
||||
memory_f16: bool = True
|
||||
random_prompt: bool = False
|
||||
use_color: bool = False
|
||||
|
@ -38,7 +52,7 @@ class GptParams:
|
|||
interactive_start: bool = False
|
||||
|
||||
instruct: bool = False
|
||||
ignore_eos: bool = False
|
||||
penalize_nl: bool = True
|
||||
perplexity: bool = False
|
||||
use_mmap: bool = True
|
||||
use_mlock: bool = False
|
||||
|
@ -50,8 +64,7 @@ class GptParams:
|
|||
# If chat ended prematurely, append this to the conversation to fix it.
|
||||
# Set to "\nUser:" etc.
|
||||
# This is an alternative to input_prefix, which always adds it, so it potentially duplicates "User:"
|
||||
fix_prefix: str = " "
|
||||
output_postfix: str = ""
|
||||
fix_prefix: str = ""
|
||||
input_echo: bool = True,
|
||||
|
||||
# Default instructions for Alpaca
|
||||
|
@ -61,59 +74,43 @@ class GptParams:
|
|||
instruct_inp_suffix: str="\n\n### Response:\n\n"
|
||||
|
||||
|
||||
def gpt_params_parse(argv = None, params: Optional[GptParams] = None):
|
||||
if params is None:
|
||||
params = GptParams()
|
||||
|
||||
def gpt_params_parse(argv = None):
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed")
|
||||
parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads")
|
||||
parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt")
|
||||
parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file")
|
||||
parser.add_argument("-n", "--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict")
|
||||
parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts")
|
||||
parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx")
|
||||
parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16")
|
||||
parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p")
|
||||
parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k")
|
||||
parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp")
|
||||
parser.add_argument("--n_predict", type=int, default=128, help="number of model parts",dest="n_predict")
|
||||
parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n")
|
||||
parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty")
|
||||
parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch")
|
||||
parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep")
|
||||
|
||||
parser.add_argument(
|
||||
"-l",
|
||||
"--logit-bias",
|
||||
type=str,
|
||||
action='append',
|
||||
help="--logit-bias TOKEN_ID(+/-)BIAS",
|
||||
dest="logit_bias_str"
|
||||
)
|
||||
parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos")
|
||||
parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k")
|
||||
parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p")
|
||||
parser.add_argument("--tfs", type=float, default=1.0, help="tail free sampling, parameter z (1.0 = disabled)",dest="tfs_z")
|
||||
parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp")
|
||||
parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty")
|
||||
parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n")
|
||||
parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)",dest="tfs_z")
|
||||
parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)",dest="presence_penalty")
|
||||
parser.add_argument("--mirostat", type=float, default=1.0, help="use Mirostat sampling.",dest="mirostat")
|
||||
parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau represents the average surprise value",dest="mirostat_tau")
|
||||
parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta")
|
||||
|
||||
parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model")
|
||||
parser.add_argument(
|
||||
"-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive"
|
||||
)
|
||||
parser.add_argument("--embedding", action="store_true", help="", dest="embedding")
|
||||
parser.add_argument(
|
||||
"--interactive-start",
|
||||
action="store_true",
|
||||
help="run in interactive mode",
|
||||
dest="interactive"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--interactive-first",
|
||||
action="store_true",
|
||||
help="run in interactive mode and wait for input right away",
|
||||
dest="interactive_start"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-ins",
|
||||
"--instruct",
|
||||
action="store_true",
|
||||
help="run in instruction mode (use with Alpaca or Vicuna models)",
|
||||
dest="instruct"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--color",
|
||||
action="store_true",
|
||||
help="colorise output to distinguish prompt and user input from generations",
|
||||
dest="use_color"
|
||||
)
|
||||
parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock")
|
||||
parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap")
|
||||
parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test")
|
||||
parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt")
|
||||
parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt")
|
||||
parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file")
|
||||
parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session")
|
||||
parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix")
|
||||
parser.add_argument("--in-suffix", type=str, default="", help="append to input", dest="input_suffix")
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--reverse-prompt",
|
||||
|
@ -122,16 +119,70 @@ def gpt_params_parse(argv = None, params: Optional[GptParams] = None):
|
|||
help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).",
|
||||
dest="antiprompt"
|
||||
)
|
||||
parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity")
|
||||
parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos")
|
||||
parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts")
|
||||
|
||||
parser.add_argument("--lora", type=str, default="", help="apply LoRA adapter (implies --no-mmap)", dest="lora_adapter")
|
||||
parser.add_argument("--lora-base", type=str, default="", help="optional model to use as a base for the layers modified by the LoRA adapter", dest="lora_base")
|
||||
|
||||
parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16")
|
||||
parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt")
|
||||
parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix")
|
||||
parser.add_argument(
|
||||
"--color",
|
||||
action="store_true",
|
||||
help="colorise output to distinguish prompt and user input from generations",
|
||||
dest="use_color"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive"
|
||||
)
|
||||
|
||||
parser.add_argument("--embedding", action="store_true", help="", dest="embedding")
|
||||
parser.add_argument(
|
||||
"--interactive-first",
|
||||
action="store_true",
|
||||
help="run in interactive mode and wait for input right away",
|
||||
dest="interactive_start"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-ins",
|
||||
"--instruct",
|
||||
action="store_true",
|
||||
help="run in instruction mode (use with Alpaca or Vicuna models)",
|
||||
dest="instruct"
|
||||
)
|
||||
parser.add_argument("--no-penalize-nl", action="store_false", help="do not penalize newline token", dest="penalize_nl")
|
||||
parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity")
|
||||
parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap")
|
||||
parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock")
|
||||
parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test")
|
||||
parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt")
|
||||
|
||||
#Custom args
|
||||
parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix")
|
||||
parser.add_argument("--out-postfix", type=str, default="", help="append to input", dest="output_postfix")
|
||||
parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo")
|
||||
|
||||
parser.add_argument(
|
||||
"--interactive-start",
|
||||
action="store_true",
|
||||
help="run in interactive mode",
|
||||
dest="interactive"
|
||||
)
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
return args
|
||||
|
||||
logit_bias_str = args.logit_bias_str
|
||||
delattr(args, "logit_bias_str")
|
||||
params = GptParams(**vars(args))
|
||||
|
||||
if (params.lora_adapter):
|
||||
params.use_mmap = False
|
||||
|
||||
if (logit_bias_str != None):
|
||||
for i in logit_bias_str:
|
||||
if (m := re.match(r"(\d+)([-+]\d+)", i)):
|
||||
params.logit_bias[int(m.group(1))] = float(m.group(2))
|
||||
|
||||
return params
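The --logit-bias flag above takes entries of the form TOKEN_ID+BIAS or TOKEN_ID-BIAS, which the regex at the end of gpt_params_parse folds into the logit_bias dict. A short usage sketch, assuming this module is importable as common:

```python
# Sketch of the --logit-bias parsing performed by gpt_params_parse above.
from common import gpt_params_parse

params = gpt_params_parse([
    "--logit-bias", "15043+1",   # raise the logit of token 15043 by 1
    "--logit-bias", "2-100",     # lower the logit of token 2 by 100
])
print(params.logit_bias)  # {15043: 1.0, 2: -100.0}
```
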
|
||||
|
||||
def gpt_random_prompt(rng):
|
||||
return [
|
||||
|
@ -148,4 +199,4 @@ def gpt_random_prompt(rng):
|
|||
][rng % 10]
|
||||
|
||||
if __name__ == "__main__":
|
||||
print(GptParams(gpt_params_parse()))
|
||||
print(gpt_params_parse())
|
||||
|
|
|
@ -10,40 +10,14 @@ Quirks:
|
|||
You should also still be feeding the model with a "primer" prompt that
|
||||
shows it the expected format.
|
||||
"""
|
||||
import ctypes
|
||||
import sys
|
||||
from time import time
|
||||
from os import cpu_count
|
||||
from os import cpu_count, path
|
||||
|
||||
import llama_cpp
|
||||
from common import GptParams, gpt_params_parse, gpt_random_prompt
|
||||
|
||||
ANSI_COLOR_RESET = "\x1b[0m"
|
||||
ANSI_COLOR_YELLOW = "\x1b[33m"
|
||||
ANSI_BOLD = "\x1b[1m"
|
||||
ANSI_COLOR_GREEN = "\x1b[32m"
|
||||
|
||||
CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET
|
||||
CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW
|
||||
CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN
|
||||
|
||||
# Iterative search
|
||||
# Actively searches and prevents a pattern from being returned
|
||||
class IterSearch:
|
||||
def __init__(self, pattern):
|
||||
self.pattern = list(pattern)
|
||||
self.buffer = []
|
||||
|
||||
def __call__(self, char):
|
||||
self.buffer += [char]
|
||||
|
||||
if (self.pattern[:len(self.buffer)] == self.buffer):
|
||||
if (len(self.buffer) >= len(self.pattern)):
|
||||
self.buffer.clear()
|
||||
return []
|
||||
|
||||
_tmp = self.buffer[:]
|
||||
self.buffer.clear()
|
||||
return _tmp
|
||||
import util
|
||||
|
||||
# A LLaMA interactive session
|
||||
class LLaMAInteract:
|
||||
|
@ -77,9 +51,11 @@ specified) expect poor results""", file=sys.stderr)
|
|||
# runtime args
|
||||
self.input_consumed = 0
|
||||
self.n_past = 0
|
||||
self.n_session_consumed = 0
|
||||
self.first_antiprompt = []
|
||||
self.remaining_tokens = self.params.n_predict
|
||||
self.output_echo = self.params.input_echo
|
||||
self.multibyte_fix = []
|
||||
|
||||
# model load
|
||||
self.lparams = llama_cpp.llama_context_default_params()
|
||||
|
@ -94,6 +70,19 @@ specified) expect poor results""", file=sys.stderr)
|
|||
if (not self.ctx):
|
||||
raise RuntimeError(f"error: failed to load model '{self.params.model}'")
|
||||
|
||||
if (self.params.ignore_eos):
|
||||
self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf")
|
||||
|
||||
if (len(self.params.lora_adapter) > 0):
|
||||
if (llama_cpp.llama_apply_lora_from_file(
|
||||
self.ctx,
|
||||
self.params.lora_adapter.encode("utf8"),
|
||||
self.params.lora_base.encode("utf8") if len(self.params.lora_base) > 0 else None,
|
||||
self.params.n_threads
|
||||
) != 0):
|
||||
print("error: failed to apply lora adapter")
|
||||
return
|
||||
|
||||
print(file=sys.stderr)
|
||||
print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \
|
||||
| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr)
|
||||
|
@ -117,13 +106,52 @@ specified) expect poor results""", file=sys.stderr)
|
|||
with open(self.params.file) as f:
|
||||
self.params.prompt = f.read()
|
||||
|
||||
self.session_tokens: list[llama_cpp.llama_token] = []
|
||||
if (len(self.params.path_session) > 0):
|
||||
print(f"attempting to load saved session from '{self.params.path_session}'", file=sys.stderr)
|
||||
|
||||
if (path.exists(self.params.path_session)):
|
||||
_session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))()
|
||||
_n_token_count_out = llama_cpp.c_size_t()
|
||||
if (llama_cpp.llama_load_session_file(
|
||||
self.ctx,
|
||||
self.params.path_session.encode("utf8"),
|
||||
_session_tokens,
|
||||
self.params.n_ctx,
|
||||
ctypes.byref(_n_token_count_out)
|
||||
) != 1):
|
||||
print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr)
|
||||
return
|
||||
_n_token_count_out = _n_token_count_out.value
|
||||
self.session_tokens = _session_tokens[:_n_token_count_out]
|
||||
print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr)
|
||||
else:
|
||||
print(f"session file does not exist, will create", file=sys.stderr)
|
||||
|
||||
# tokenize the prompt
|
||||
self.embd = []
|
||||
self.embd_inp = self._tokenize(self.params.prompt)
|
||||
|
||||
if (len(self.embd_inp) > self.params.n_ctx - 4):
|
||||
if (len(self.embd_inp) > self.n_ctx - 4):
|
||||
raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})")
|
||||
|
||||
# debug message about similarity of saved session, if applicable
|
||||
self.n_matching_session_tokens = 0
|
||||
if len(self.session_tokens) > 0:
|
||||
for id in self.session_tokens:
|
||||
if self.n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[self.n_matching_session_tokens]:
|
||||
break
|
||||
self.n_matching_session_tokens += 1
|
||||
|
||||
if self.n_matching_session_tokens >= len(self.embd_inp):
|
||||
print(f"session file has exact match for prompt!")
|
||||
elif self.n_matching_session_tokens < (len(self.embd_inp) / 2):
|
||||
print(f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated")
|
||||
else:
|
||||
print(f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt")
|
||||
|
||||
self.need_to_save_session = len(self.params.path_session) > 0 and self.n_matching_session_tokens < (len(self.embd_inp) * 3 / 4)
|
||||
|
||||
# number of tokens to keep when resetting context
|
||||
if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct):
|
||||
self.params.n_keep = len(self.embd_inp)
|
||||
|
@ -132,11 +160,12 @@ specified) expect poor results""", file=sys.stderr)
|
|||
self.inp_suffix = self._tokenize(self.params.instruct_inp_suffix, False)
|
||||
|
||||
# in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||
self.antiecho = None
|
||||
if (self.params.instruct):
|
||||
self.params.interactive_start = True
|
||||
_ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False)
|
||||
self.first_antiprompt.append(_ptn)
|
||||
self.antiecho = IterSearch(_ptn)
|
||||
self.antiecho = util.IterSearch(_ptn)
|
||||
|
||||
# enable interactive mode if reverse prompt or interactive start is specified
|
||||
if (len(self.params.antiprompt) != 0 or self.params.interactive_start):
|
||||
|
@ -144,6 +173,7 @@ specified) expect poor results""", file=sys.stderr)
|
|||
|
||||
# determine newline token
|
||||
self.llama_token_newline = self._tokenize("\n", False)
|
||||
self.llama_token_eot = self._tokenize(" [end of text]\n", False)
|
||||
|
||||
if (self.params.verbose_prompt):
|
||||
print(f"""
|
||||
|
@ -170,16 +200,24 @@ number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr)
|
|||
if len(self.params.input_prefix) > 0:
|
||||
print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr)
|
||||
|
||||
print(f"""sampling: temp = {self.params.temp},\
|
||||
print(f"""sampling: repeat_last_n = {self.params.repeat_last_n},\
|
||||
repeat_penalty = {self.params.repeat_penalty},\
|
||||
presence_penalty = {self.params.presence_penalty},\
|
||||
frequency_penalty = {self.params.frequency_penalty},\
|
||||
top_k = {self.params.top_k},\
|
||||
tfs_z = {self.params.tfs_z},\
|
||||
top_p = {self.params.top_p},\
|
||||
repeat_last_n = {self.params.repeat_last_n},\
|
||||
repeat_penalty = {self.params.repeat_penalty}
|
||||
typical_p = {self.params.typical_p},\
|
||||
temp = {self.params.temp},\
|
||||
mirostat = {self.params.mirostat},\
|
||||
mirostat_lr = {self.params.mirostat_eta},\
|
||||
mirostat_ent = {self.params.mirostat_tau},\
|
||||
|
||||
generate: n_ctx = {self.n_ctx}, \
|
||||
n_batch = {self.params.n_batch}, \
|
||||
n_predict = {self.params.n_predict}, \
|
||||
generate: n_ctx = {self.n_ctx},\
|
||||
n_batch = {self.params.n_batch},\
|
||||
n_predict = {self.params.n_predict},\
|
||||
n_keep = {self.params.n_keep}
|
||||
|
||||
""", file=sys.stderr)
|
||||
|
||||
# determine antiprompt tokens
|
||||
|
@ -195,24 +233,24 @@ n_keep = {self.params.n_keep}
|
|||
- If you want to submit another line, end your input in '\\'.
|
||||
|
||||
""", file=sys.stderr)
|
||||
self.set_color(CONSOLE_COLOR_PROMPT)
|
||||
self.set_color(util.CONSOLE_COLOR_PROMPT)
|
||||
|
||||
# tokenize a prompt
|
||||
def _tokenize(self, prompt, bos=True):
|
||||
_arr = (llama_cpp.llama_token * (len(prompt) + 1))()
|
||||
_n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), _arr, len(_arr), bos)
|
||||
_arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))()
|
||||
_n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos)
|
||||
return _arr[:_n]
|
||||
|
||||
def use_antiprompt(self):
|
||||
return len(self.first_antiprompt) > 0
|
||||
|
||||
def set_color(self, c):
|
||||
if (self.params.use_color):
|
||||
print(c, end="")
|
||||
|
||||
def use_antiprompt(self):
|
||||
return len(self.first_antiprompt) > 0
|
||||
|
||||
# generate tokens
|
||||
def generate(self):
|
||||
while self.remaining_tokens > 0 or self.params.interactive:
|
||||
while self.remaining_tokens > 0 or self.params.interactive or self.params.n_predict == -1:
|
||||
# predict
|
||||
if len(self.embd) > 0:
|
||||
# infinite text generation via context swapping
|
||||
|
@ -228,43 +266,131 @@ n_keep = {self.params.n_keep}
|
|||
self.n_ctx - int(n_left/2) - len(self.embd):-len(self.embd)
|
||||
]
|
||||
self.embd = _insert + self.embd
|
||||
self.params.path_session = ""
|
||||
|
||||
# try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
|
||||
if self.n_session_consumed < len(self.session_tokens):
|
||||
for i in range(len(self.embd)):
|
||||
if self.embd[i] != self.session_tokens[self.n_session_consumed]:
|
||||
self.session_tokens = self.session_tokens[:self.n_session_consumed]
|
||||
break
|
||||
|
||||
self.n_past += 1
|
||||
self.n_session_consumed += 1
|
||||
|
||||
if self.n_session_consumed >= len(self.session_tokens):
|
||||
i += 1
|
||||
break
|
||||
|
||||
if i > 0:
|
||||
self.embd = self.embd[i:]
|
||||
|
||||
# evaluate tokens in batches
|
||||
# embd is typically prepared beforehand to fit within a batch, but not always
|
||||
#TODO BUG: The batching code causes nonsensical generation
|
||||
"""for i in range(0, len(self.embd), self.params.n_batch):
|
||||
n_eval = self.params.n_batch
|
||||
_arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval])
|
||||
if llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0:
|
||||
print(f"failed to eval")
|
||||
return
|
||||
|
||||
self.n_past += n_eval"""
|
||||
|
||||
if (llama_cpp.llama_eval(
|
||||
self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads
|
||||
) != 0):
|
||||
raise Exception("Failed to llama_eval!")
|
||||
|
||||
if len(self.embd) > 0 and len(self.params.path_session) > 0:
|
||||
self.session_tokens.extend(self.embd)
|
||||
self.n_session_consumed = len(self.session_tokens)
|
||||
|
||||
self.n_past += len(self.embd)
|
||||
self.embd = []
|
||||
if len(self.embd_inp) <= self.input_consumed:
|
||||
if len(self.embd_inp) <= self.input_consumed: #&& !is_interacting
|
||||
# out of user input, sample next token
|
||||
top_k = llama_cpp.llama_n_vocab(self.ctx) if self.params.top_k <= 0 else self.params.top_k
|
||||
repeat_last_n = self.n_ctx if self.params.repeat_last_n < 0 else self.params.repeat_last_n
|
||||
|
||||
if (self.params.ignore_eos):
|
||||
logits = llama_cpp.llama_get_logits(self.ctx)
|
||||
logits[llama_cpp.llama_token_eos()] = llama_cpp.c_float(0)
|
||||
# optionally save the session on first sample (for faster prompt loading next time)
|
||||
if len(self.params.path_session) > 0 and self.need_to_save_session:
|
||||
self.need_to_save_session = False
|
||||
llama_cpp.llama_save_session_file(
|
||||
self.ctx,
|
||||
self.params.path_session.encode("utf8"),
|
||||
(llama_cpp.llama_token * len(self.session_tokens))(*self.session_tokens),
|
||||
len(self.session_tokens)
|
||||
)
|
||||
|
||||
id = 0
|
||||
|
||||
logits = llama_cpp.llama_get_logits(self.ctx)
|
||||
n_vocab = llama_cpp.llama_n_vocab(self.ctx)
|
||||
|
||||
# Apply params.logit_bias map
|
||||
for key, value in self.params.logit_bias.items():
|
||||
logits[key] += value
|
||||
|
||||
_arr = (llama_cpp.llama_token_data * n_vocab)(*[
|
||||
llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
|
||||
for token_id in range(n_vocab)
|
||||
])
|
||||
candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False))
|
||||
|
||||
# Apply penalties
|
||||
nl_logit = logits[llama_cpp.llama_token_nl()]
|
||||
last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)
|
||||
|
||||
_arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:])
|
||||
llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p,
|
||||
_arr,
|
||||
last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty))
|
||||
llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p,
|
||||
_arr,
|
||||
last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))
|
||||
|
||||
if not self.params.penalize_nl:
|
||||
logits[llama_cpp.llama_token_nl()] = nl_logit
|
||||
|
||||
if self.params.temp <= 0:
|
||||
# Greedy sampling
|
||||
id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p)
|
||||
else:
|
||||
if self.params.mirostat == 1:
|
||||
mirostat_mu = 2.0 * self.params.mirostat_tau
|
||||
mirostat_m = 100
|
||||
llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
|
||||
id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_int(mirostat_m), llama_cpp.c_float(mirostat_mu))
|
||||
elif self.params.mirostat == 2:
|
||||
mirostat_mu = 2.0 * self.params.mirostat_tau
|
||||
llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
|
||||
id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu))
|
||||
else:
|
||||
# Temperature sampling
|
||||
llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1))
|
||||
llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z), min_keep=llama_cpp.c_size_t(1))
|
||||
llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p), min_keep=llama_cpp.c_size_t(1))
|
||||
llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p), min_keep=llama_cpp.c_size_t(1))
|
||||
llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
|
||||
id = llama_cpp.llama_sample_token(self.ctx, candidates_p)
|
||||
# print("`{}`".format(candidates_p.size))
|
||||
|
||||
_arr = self.last_n_tokens[-min(self.params.repeat_last_n, self.n_past):]
|
||||
id = llama_cpp.llama_sample_top_p_top_k(
|
||||
self.ctx,
|
||||
(llama_cpp.llama_token * len(_arr))(*_arr),
|
||||
len(_arr),
|
||||
self.params.top_k,
|
||||
self.params.top_p,
|
||||
self.params.temp,
|
||||
self.params.repeat_penalty,
|
||||
)
|
||||
self.last_n_tokens.pop(0)
|
||||
self.last_n_tokens.append(id)
|
||||
|
||||
# replace end of text token with newline token when in interactive mode
|
||||
if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct):
|
||||
id = self.llama_token_newline[0]
|
||||
self.embd.append(id)
|
||||
if (self.use_antiprompt()):
|
||||
# tokenize and inject first reverse prompt
|
||||
self.embd_inp += self.first_antiprompt[0]
|
||||
|
||||
# add it to the context
|
||||
self.embd.append(id)
|
||||
for id in self.first_antiprompt[0]:
|
||||
self.embd.append(id)
|
||||
else:
|
||||
# add it to the context
|
||||
self.embd.append(id)
|
||||
|
||||
# echo this to console
|
||||
self.output_echo = True
|
||||
|
@ -287,7 +413,7 @@ n_keep = {self.params.n_keep}
|
|||
# display tokens
|
||||
if self.output_echo:
|
||||
for id in self.embd:
|
||||
if self.params.instruct:
|
||||
if self.antiecho != None:
|
||||
for r in self.antiecho(id):
|
||||
yield r
|
||||
else:
|
||||
|
@ -295,7 +421,7 @@ n_keep = {self.params.n_keep}
|
|||
|
||||
# reset color to default if there is no pending user input
|
||||
if (self.params.input_echo and len(self.embd_inp) == self.input_consumed):
|
||||
self.set_color(CONSOLE_COLOR_DEFAULT)
|
||||
self.set_color(util.CONSOLE_COLOR_DEFAULT)
|
||||
|
||||
if (self.params.interactive and len(self.embd_inp) <= self.input_consumed):
|
||||
# if antiprompt is present, stop
|
||||
|
@ -313,9 +439,9 @@ n_keep = {self.params.n_keep}
|
|||
# end of text token
|
||||
if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos():
|
||||
if (not self.params.instruct):
|
||||
for i in " [end of text]\n":
|
||||
for i in self.llama_token_eot:
|
||||
yield i
|
||||
break
|
||||
break
|
||||
|
||||
# respect n_predict even if antiprompt is present
|
||||
if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1):
|
||||
|
@ -336,12 +462,12 @@ n_keep = {self.params.n_keep}
|
|||
|
||||
def exit(self):
|
||||
llama_cpp.llama_free(self.ctx)
|
||||
self.set_color(CONSOLE_COLOR_DEFAULT)
|
||||
self.set_color(util.CONSOLE_COLOR_DEFAULT)
|
||||
|
||||
# return past text
|
||||
def past(self):
|
||||
for id in self.last_n_tokens[-self.n_past:]:
|
||||
yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8")
|
||||
yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf8", errors="ignore")
|
||||
|
||||
# write input
|
||||
def input(self, prompt: str):
|
||||
|
@ -355,7 +481,29 @@ n_keep = {self.params.n_keep}
|
|||
def output(self):
|
||||
self.remaining_tokens = self.params.n_predict
|
||||
for id in self.generate():
|
||||
yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8")
|
||||
cur_char = llama_cpp.llama_token_to_str(self.ctx, id)
|
||||
|
||||
# Add remainder of missing bytes
|
||||
if None in self.multibyte_fix:
|
||||
self.multibyte_fix[self.multibyte_fix.index(None)] = cur_char
|
||||
|
||||
# Return completed utf char
|
||||
if len(self.multibyte_fix) > 0 and not None in self.multibyte_fix:
|
||||
yield (b"".join(self.multibyte_fix)).decode("utf8")
|
||||
self.multibyte_fix = []
|
||||
continue
|
||||
|
||||
# Contains multi-byte UTF8
|
||||
for num, pattern in [(2, 192), (3, 224), (4, 240)]:
|
||||
# Bitwise AND check
|
||||
if pattern & int.from_bytes(cur_char, 'little') == pattern:
|
||||
self.multibyte_fix = [cur_char] + ([None] * (num-1))
|
||||
|
||||
# Stop incomplete bytes from passing
|
||||
if len(self.multibyte_fix) > 0:
|
||||
continue
|
||||
|
||||
yield cur_char.decode("utf8")
|
||||
|
||||
# read user input
|
||||
def read_input(self):
|
||||
|
@ -371,21 +519,21 @@ n_keep = {self.params.n_keep}
|
|||
self.params.input_echo = False
|
||||
|
||||
while self.params.interactive:
|
||||
self.set_color(CONSOLE_COLOR_USER_INPUT)
|
||||
self.set_color(util.CONSOLE_COLOR_USER_INPUT)
|
||||
if (self.params.instruct):
|
||||
print('\n> ', end="")
|
||||
self.input(self.read_input())
|
||||
else:
|
||||
print(self.params.input_prefix, end="")
|
||||
self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.output_postfix}")
|
||||
print(self.params.output_postfix,end="")
|
||||
self.set_color(CONSOLE_COLOR_DEFAULT)
|
||||
self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}")
|
||||
print(self.params.input_suffix,end="")
|
||||
self.set_color(util.CONSOLE_COLOR_DEFAULT)
|
||||
|
||||
try:
|
||||
for i in self.output():
|
||||
print(i,end="",flush=True)
|
||||
except KeyboardInterrupt:
|
||||
self.set_color(CONSOLE_COLOR_DEFAULT)
|
||||
self.set_color(util.CONSOLE_COLOR_DEFAULT)
|
||||
if not self.params.instruct:
|
||||
print(self.params.fix_prefix,end="")
|
||||
self.input(self.params.fix_prefix)
|
||||
|
@ -414,8 +562,7 @@ The transcript only includes text, it does not include markup like HTML and Mark
|
|||
{USER_NAME}: Name a color.
|
||||
{AI_NAME}: Blue
|
||||
{USER_NAME}:"""
|
||||
args = gpt_params_parse()
|
||||
params = GptParams(**vars(args))
|
||||
params = gpt_params_parse()
|
||||
|
||||
with LLaMAInteract(params) as m:
|
||||
m.interact()
|
||||
|
|
|
@ -37,6 +37,10 @@ embd = []
|
|||
last_n_size = 64
|
||||
last_n_tokens_data = [0] * last_n_size
|
||||
n_batch = 24
|
||||
last_n_repeat = 64
|
||||
repeat_penalty = 1
|
||||
frequency_penalty = 0.0
|
||||
presence_penalty = 0.0
|
||||
|
||||
while remaining_tokens > 0:
|
||||
if len(embd) > 0:
|
||||
|
@ -47,15 +51,28 @@ while remaining_tokens > 0:
|
|||
n_past += len(embd)
|
||||
embd = []
|
||||
if len(embd_inp) <= input_consumed:
|
||||
id = llama_cpp.llama_sample_top_p_top_k(
|
||||
ctx,
|
||||
(llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data),
|
||||
len(last_n_tokens_data),
|
||||
40,
|
||||
0.8,
|
||||
0.2,
|
||||
1.0 / 0.85,
|
||||
)
|
||||
logits = llama_cpp.llama_get_logits(ctx)
|
||||
n_vocab = llama_cpp.llama_n_vocab(ctx)
|
||||
|
||||
_arr = (llama_cpp.llama_token_data * n_vocab)(*[
|
||||
llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
|
||||
for token_id in range(n_vocab)
|
||||
])
|
||||
candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False))
|
||||
|
||||
_arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
|
||||
llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p,
|
||||
_arr,
|
||||
last_n_repeat, repeat_penalty)
|
||||
llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p,
|
||||
_arr,
|
||||
last_n_repeat, frequency_penalty, presence_penalty)
|
||||
|
||||
llama_cpp.llama_sample_top_k(ctx, candidates_p, 40)
|
||||
llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8)
|
||||
llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2)
|
||||
id = llama_cpp.llama_sample_token(ctx, candidates_p)
|
||||
|
||||
last_n_tokens_data = last_n_tokens_data[1:] + [id]
|
||||
embd.append(id)
|
||||
input_noecho = False
|
||||
|
@ -70,7 +87,7 @@ while remaining_tokens > 0:
|
|||
if not input_noecho:
|
||||
for id in embd:
|
||||
print(
|
||||
llama_cpp.llama_token_to_str(ctx, id).decode("utf-8"),
|
||||
llama_cpp.llama_token_to_str(ctx, id).decode("utf-8", errors="ignore"),
|
||||
end="",
|
||||
flush=True,
|
||||
)
|
||||
|
examples/low_level_api/util.py  (new file, 95 lines)
@@ -0,0 +1,95 @@
ANSI_COLOR_RESET = "\x1b[0m"
ANSI_COLOR_YELLOW = "\x1b[33m"
ANSI_BOLD = "\x1b[1m"
ANSI_COLOR_GREEN = "\x1b[32m"

CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET
CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW
CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN

# Iterative search
# Actively searches and prevents a pattern from being returned
class IterSearch:
    def __init__(self, pattern):
        self.pattern = list(pattern)
        self.buffer = []

    def __call__(self, char):
        self.buffer += [char]

        if (self.pattern[:len(self.buffer)] == self.buffer):
            if (len(self.buffer) >= len(self.pattern)):
                self.buffer.clear()
            return []

        _tmp = self.buffer[:]
        self.buffer.clear()
        return _tmp

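IterSearch buffers incoming tokens until it can tell whether they form the pattern, emitting them only once a mismatch rules the pattern out; a short usage sketch with hypothetical token values, assuming util.py is importable:

```python
# Hypothetical usage of IterSearch: suppress the token pattern [1, 2, 3] from a stream.
from util import IterSearch

search = IterSearch([1, 2, 3])

emitted = []
for token in [7, 1, 2, 3, 8, 1, 9]:
    emitted.extend(search(token))

print(emitted)  # [7, 8, 1, 9] -- the full [1, 2, 3] match is swallowed
```
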
class Circle:
    def __init__(self, size, default=0):
        self.list = [default] * size
        self.maxsize = size
        self.size = 0
        self.offset = 0

    def append(self, elem):
        if self.size < self.maxsize:
            self.list[self.size] = elem
            self.size += 1
        else:
            self.list[self.offset] = elem
            self.offset = (self.offset + 1) % self.maxsize

    def __getitem__(self, val):
        if isinstance(val, int):
            if 0 > val or val >= self.size:
                raise IndexError('Index out of range')
            return self.list[val] if self.size < self.maxsize else self.list[(self.offset + val) % self.maxsize]
        elif isinstance(val, slice):
            start, stop, step = val.start, val.stop, val.step
            if step is None:
                step = 1
            if start is None:
                start = 0
            if stop is None:
                stop = self.size
            if start < 0:
                start = self.size + start
            if stop < 0:
                stop = self.size + stop

            indices = range(start, stop, step)
            return [self.list[(self.offset + i) % self.maxsize] for i in indices if i < self.size]
        else:
            raise TypeError('Invalid argument type')


if __name__ == "__main__":
    c = Circle(5)

    c.append(1)
    print(c.list)
    print(c[:])
    assert c[0] == 1
    assert c[:5] == [1]

    for i in range(2,5+1):
        c.append(i)
    print(c.list)
    print(c[:])
    assert c[0] == 1
    assert c[:5] == [1,2,3,4,5]

    for i in range(5+1,9+1):
        c.append(i)
    print(c.list)
    print(c[:])
    assert c[0] == 5
    assert c[:5] == [5,6,7,8,9]
    #assert c[:-5] == [5,6,7,8,9]
    assert c[:10] == [5,6,7,8,9]
examples/notebooks/Guidance.ipynb  (new file, 89 lines; diff suppressed because one or more lines are too long)
llama_cpp/llama.py  (1231 lines changed; diff suppressed because it is too large)
@@ -1,4 +1,4 @@
from typing import List, Optional, Dict, Union
from typing import Any, List, Optional, Dict, Union
from typing_extensions import TypedDict, NotRequired, Literal


@@ -22,9 +22,9 @@ class Embedding(TypedDict):

class CompletionLogprobs(TypedDict):
    text_offset: List[int]
    token_logprobs: List[float]
    token_logprobs: List[Optional[float]]
    tokens: List[str]
    top_logprobs: List[Dict[str, float]]
    top_logprobs: List[Optional[Dict[str, float]]]


class CompletionChoice(TypedDict):

@@ -58,7 +58,7 @@ class Completion(TypedDict):


class ChatCompletionMessage(TypedDict):
    role: Union[Literal["assistant"], Literal["user"], Literal["system"]]
    role: Literal["assistant", "user", "system"]
    content: str
    user: NotRequired[str]


@@ -77,6 +77,8 @@ class ChatCompletion(TypedDict):
    choices: List[ChatCompletionChoice]
    usage: CompletionUsage


class ChatCompletionChunkDeltaEmpty(TypedDict):
    pass


class ChatCompletionChunkDelta(TypedDict):
    role: NotRequired[Literal["assistant"]]

@@ -85,7 +87,7 @@ class ChatCompletionChunkDelta(TypedDict):

class ChatCompletionChunkChoice(TypedDict):
    index: int
    delta: ChatCompletionChunkDelta
    delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty]
    finish_reason: Optional[str]

llama_cpp/server/__init__.py  (new, empty file)
@ -3,267 +3,48 @@
|
|||
To run this example:
|
||||
|
||||
```bash
|
||||
pip install fastapi uvicorn sse-starlette
|
||||
pip install fastapi uvicorn sse-starlette pydantic-settings
|
||||
export MODEL=../models/7B/...
|
||||
uvicorn fastapi_server_chat:app --reload
|
||||
```
|
||||
|
||||
Then run:
|
||||
```
|
||||
uvicorn llama_cpp.server.app:app --reload
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
python3 -m llama_cpp.server
|
||||
```
|
||||
|
||||
Then visit http://localhost:8000/docs to see the interactive API docs.
|
||||
|
||||
"""
|
||||
import os
|
||||
import json
|
||||
from typing import List, Optional, Literal, Union, Iterator, Dict
|
||||
from typing_extensions import TypedDict
|
||||
import argparse
|
||||
|
||||
import llama_cpp
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict
|
||||
from sse_starlette.sse import EventSourceResponse
|
||||
|
||||
|
||||
class Settings(BaseSettings):
|
||||
model: str
|
||||
n_ctx: int = 2048
|
||||
n_batch: int = 8
|
||||
n_threads: int = ((os.cpu_count() or 2) // 2) or 1
|
||||
f16_kv: bool = True
|
||||
use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out...
|
||||
embedding: bool = True
|
||||
last_n_tokens_size: int = 64
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
title="🦙 llama.cpp Python API",
|
||||
version="0.0.1",
|
||||
)
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
settings = Settings()
|
||||
llama = llama_cpp.Llama(
|
||||
settings.model,
|
||||
f16_kv=settings.f16_kv,
|
||||
use_mlock=settings.use_mlock,
|
||||
embedding=settings.embedding,
|
||||
n_threads=settings.n_threads,
|
||||
n_batch=settings.n_batch,
|
||||
n_ctx=settings.n_ctx,
|
||||
last_n_tokens_size=settings.last_n_tokens_size,
|
||||
)
|
||||
|
||||
|
||||
class CreateCompletionRequest(BaseModel):
|
||||
prompt: Union[str, List[str]]
|
||||
suffix: Optional[str] = Field(None)
|
||||
max_tokens: int = 16
|
||||
temperature: float = 0.8
|
||||
top_p: float = 0.95
|
||||
echo: bool = False
|
||||
stop: List[str] = []
|
||||
stream: bool = False
|
||||
|
||||
# ignored or currently unsupported
|
||||
model: Optional[str] = Field(None)
|
||||
n: Optional[int] = 1
|
||||
logprobs: Optional[int] = Field(None)
|
||||
presence_penalty: Optional[float] = 0
|
||||
frequency_penalty: Optional[float] = 0
|
||||
best_of: Optional[int] = 1
|
||||
logit_bias: Optional[Dict[str, float]] = Field(None)
|
||||
user: Optional[str] = Field(None)
|
||||
|
||||
# llama.cpp specific parameters
|
||||
top_k: int = 40
|
||||
repeat_penalty: float = 1.1
|
||||
|
||||
class Config:
|
||||
schema_extra = {
|
||||
"example": {
|
||||
"prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
|
||||
"stop": ["\n", "###"],
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
|
||||
|
||||
|
||||
@app.post(
|
||||
"/v1/completions",
|
||||
response_model=CreateCompletionResponse,
|
||||
)
|
||||
def create_completion(request: CreateCompletionRequest):
|
||||
if isinstance(request.prompt, list):
|
||||
request.prompt = "".join(request.prompt)
|
||||
|
||||
completion_or_chunks = llama(
|
||||
**request.dict(
|
||||
exclude={
|
||||
"model",
|
||||
"n",
|
||||
"logprobs",
|
||||
"frequency_penalty",
|
||||
"presence_penalty",
|
||||
"best_of",
|
||||
"logit_bias",
|
||||
"user",
|
||||
}
|
||||
)
|
||||
)
|
||||
if request.stream:
|
||||
chunks: Iterator[llama_cpp.CompletionChunk] = completion_or_chunks # type: ignore
|
||||
return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks)
|
||||
completion: llama_cpp.Completion = completion_or_chunks # type: ignore
|
||||
return completion
|
||||
|
||||
|
||||
class CreateEmbeddingRequest(BaseModel):
|
||||
model: Optional[str]
|
||||
input: str
|
||||
user: Optional[str]
|
||||
|
||||
class Config:
|
||||
schema_extra = {
|
||||
"example": {
|
||||
"input": "The food was delicious and the waiter...",
            }
        }


CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)


@app.post(
    "/v1/embeddings",
    response_model=CreateEmbeddingResponse,
)
def create_embedding(request: CreateEmbeddingRequest):
    return llama.create_embedding(**request.dict(exclude={"model", "user"}))


class ChatCompletionRequestMessage(BaseModel):
    role: Union[Literal["system"], Literal["user"], Literal["assistant"]]
    content: str
    user: Optional[str] = None


class CreateChatCompletionRequest(BaseModel):
    model: Optional[str]
    messages: List[ChatCompletionRequestMessage]
    temperature: float = 0.8
    top_p: float = 0.95
    stream: bool = False
    stop: List[str] = []
    max_tokens: int = 128

    # ignored or currently unsupported
    model: Optional[str] = Field(None)
    n: Optional[int] = 1
    presence_penalty: Optional[float] = 0
    frequency_penalty: Optional[float] = 0
    logit_bias: Optional[Dict[str, float]] = Field(None)
    user: Optional[str] = Field(None)

    # llama.cpp specific parameters
    repeat_penalty: float = 1.1

    class Config:
        schema_extra = {
            "example": {
                "messages": [
                    ChatCompletionRequestMessage(
                        role="system", content="You are a helpful assistant."
                    ),
                    ChatCompletionRequestMessage(
                        role="user", content="What is the capital of France?"
                    ),
                ]
            }
        }


CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion)


@app.post(
    "/v1/chat/completions",
    response_model=CreateChatCompletionResponse,
)
async def create_chat_completion(
    request: CreateChatCompletionRequest,
) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
    completion_or_chunks = llama.create_chat_completion(
        **request.dict(
            exclude={
                "model",
                "n",
                "presence_penalty",
                "frequency_penalty",
                "logit_bias",
                "user",
            }
        ),
    )

    if request.stream:

        async def server_sent_events(
            chat_chunks: Iterator[llama_cpp.ChatCompletionChunk],
        ):
            for chat_chunk in chat_chunks:
                yield dict(data=json.dumps(chat_chunk))
            yield dict(data="[DONE]")

        chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks  # type: ignore

        return EventSourceResponse(
            server_sent_events(chunks),
        )
    completion: llama_cpp.ChatCompletion = completion_or_chunks  # type: ignore
    return completion


class ModelData(TypedDict):
    id: str
    object: Literal["model"]
    owned_by: str
    permissions: List[str]


class ModelList(TypedDict):
    object: Literal["list"]
    data: List[ModelData]


GetModelResponse = create_model_from_typeddict(ModelList)


@app.get("/v1/models", response_model=GetModelResponse)
def get_models() -> ModelList:
    return {
        "object": "list",
        "data": [
            {
                "id": llama.model_path,
                "object": "model",
                "owned_by": "me",
                "permissions": [],
            }
        ],
    }

import uvicorn

from llama_cpp.server.app import create_app, Settings

if __name__ == "__main__":
    import os
    import uvicorn
    parser = argparse.ArgumentParser()
    for name, field in Settings.__model_fields__.items():
        description = field.field_info.description
        if field.default is not None and description is not None:
            description += f" (default: {field.default})"
        parser.add_argument(
            f"--{name}",
            dest=name,
            type=field.type_,
            help=description,
        )

    args = parser.parse_args()
    settings = Settings(**{k: v for k, v in vars(args).items() if v is not None})
    app = create_app(settings=settings)

    uvicorn.run(
        app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000))
        app, host=os.getenv("HOST", settings.host), port=int(os.getenv("PORT", settings.port))
    )
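The entry point above generates one CLI flag per `Settings` field and then drops any flag the user did not pass, so pydantic falls back to the declared defaults. A minimal sketch of that filtering step, with illustrative values (the model path is hypothetical):

```python
# Sketch of the argparse-to-Settings handoff shown above: omitted flags arrive
# as None and are filtered out, so Settings falls back to its Field defaults
# (e.g. n_ctx=2048, port=8000 from the Settings class later in this diff).
args = {"model": "./models/ggml-model-q4_0.bin", "n_ctx": None, "port": None}
overrides = {k: v for k, v in args.items() if v is not None}
assert overrides == {"model": "./models/ggml-model-q4_0.bin"}
# Settings(**overrides) then uses the defaults for every field not overridden.
```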
568 llama_cpp/server/app.py (Normal file)
@@ -0,0 +1,568 @@
import json
import multiprocessing
from threading import Lock
from functools import partial
from typing import Iterator, List, Optional, Union, Dict
from typing_extensions import TypedDict, Literal

import llama_cpp

import anyio
from anyio.streams.memory import MemoryObjectSendStream
from starlette.concurrency import run_in_threadpool, iterate_in_threadpool
from fastapi import Depends, FastAPI, APIRouter, Request
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings
from sse_starlette.sse import EventSourceResponse


class Settings(BaseSettings):
    model: str = Field(
        description="The path to the model to use for generating completions."
    )
    model_alias: Optional[str] = Field(
        default=None,
        description="The alias of the model to use for generating completions.",
    )
    n_ctx: int = Field(default=2048, ge=1, description="The context size.")
    n_gpu_layers: int = Field(
        default=0,
        ge=0,
        description="The number of layers to put on the GPU. The rest will be on the CPU.",
    )
    seed: int = Field(
        default=1337, description="Random seed. -1 for random."
    )
    n_batch: int = Field(
        default=512, ge=1, description="The batch size to use per eval."
    )
    n_threads: int = Field(
        default=max(multiprocessing.cpu_count() // 2, 1),
        ge=1,
        description="The number of threads to use.",
    )
    f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.")
    use_mlock: bool = Field(
        default=llama_cpp.llama_mlock_supported(),
        description="Use mlock.",
    )
    use_mmap: bool = Field(
        default=llama_cpp.llama_mmap_supported(),
        description="Use mmap.",
    )
    embedding: bool = Field(default=True, description="Whether to use embeddings.")
    low_vram: bool = Field(
        default=False,
        description="Whether to use less VRAM. This will reduce performance.",
    )
    last_n_tokens_size: int = Field(
        default=64,
        ge=0,
        description="Last n tokens to keep for repeat penalty calculation.",
    )
    logits_all: bool = Field(default=True, description="Whether to return logits.")
    cache: bool = Field(
        default=False,
        description="Use a cache to reduce processing times for evaluated prompts.",
    )
    cache_type: Literal["ram", "disk"] = Field(
        default="ram",
        description="The type of cache to use. Only used if cache is True.",
    )
    cache_size: int = Field(
        default=2 << 30,
        description="The size of the cache in bytes. Only used if cache is True.",
    )
    vocab_only: bool = Field(
        default=False, description="Whether to only return the vocabulary."
    )
    verbose: bool = Field(
        default=True, description="Whether to print debug information."
    )
    host: str = Field(
        default="localhost", description="Listen address"
    )
    port: int = Field(
        default=8000, description="Listen port"
    )
    interrupt_requests: bool = Field(
        default=True,
        description="Whether to interrupt requests when a new request is received.",
    )


router = APIRouter()

settings: Optional[Settings] = None
llama: Optional[llama_cpp.Llama] = None


def create_app(settings: Optional[Settings] = None):
    if settings is None:
        settings = Settings()
    app = FastAPI(
        title="🦙 llama.cpp Python API",
        version="0.0.1",
    )
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    app.include_router(router)
    global llama
    llama = llama_cpp.Llama(
        model_path=settings.model,
        n_gpu_layers=settings.n_gpu_layers,
        seed=settings.seed,
        f16_kv=settings.f16_kv,
        use_mlock=settings.use_mlock,
        use_mmap=settings.use_mmap,
        embedding=settings.embedding,
        logits_all=settings.logits_all,
        n_threads=settings.n_threads,
        n_batch=settings.n_batch,
        n_ctx=settings.n_ctx,
        last_n_tokens_size=settings.last_n_tokens_size,
        vocab_only=settings.vocab_only,
        verbose=settings.verbose,
    )
    if settings.cache:
        if settings.cache_type == "disk":
            if settings.verbose:
                print(f"Using disk cache with size {settings.cache_size}")
            cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size)
        else:
            if settings.verbose:
                print(f"Using ram cache with size {settings.cache_size}")
            cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size)

        cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size)
        llama.set_cache(cache)

    def set_settings(_settings: Settings):
        global settings
        settings = _settings

    set_settings(settings)
    return app
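`create_app` can also be used directly from Python, which is how the test suite at the end of this diff exercises the server. A minimal sketch, assuming an illustrative model path and the `uvicorn` dependency from the server extra:

```python
# Minimal sketch: build the FastAPI app without the CLI wrapper.
# The model path is illustrative; any model accepted by llama_cpp.Llama works.
import uvicorn

from llama_cpp.server.app import create_app, Settings

settings = Settings(
    model="./models/ggml-model-q4_0.bin",  # hypothetical path
    n_ctx=2048,
)
app = create_app(settings=settings)

if __name__ == "__main__":
    uvicorn.run(app, host=settings.host, port=settings.port)
```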
llama_outer_lock = Lock()
llama_inner_lock = Lock()


def get_llama():
    # NOTE: This double lock allows the currently streaming llama model to
    # check if any other requests are pending in the same thread and cancel
    # the stream if so.
    llama_outer_lock.acquire()
    release_outer_lock = True
    try:
        llama_inner_lock.acquire()
        try:
            llama_outer_lock.release()
            release_outer_lock = False
            yield llama
        finally:
            llama_inner_lock.release()
    finally:
        if release_outer_lock:
            llama_outer_lock.release()


def get_settings():
    yield settings


model_field = Field(description="The model to use for generating completions.")

max_tokens_field = Field(
    default=16, ge=1, le=2048, description="The maximum number of tokens to generate."
)

temperature_field = Field(
    default=0.8,
    ge=0.0,
    le=2.0,
    description="Adjust the randomness of the generated text.\n\n"
    + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.",
)

top_p_field = Field(
    default=0.95,
    ge=0.0,
    le=1.0,
    description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n"
    + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.",
)

stop_field = Field(
    default=None,
    description="A list of tokens at which to stop generation. If None, no stop tokens are used.",
)

stream_field = Field(
    default=False,
    description="Whether to stream the results as they are generated. Useful for chatbots.",
)

top_k_field = Field(
    default=40,
    ge=0,
    description="Limit the next token selection to the K most probable tokens.\n\n"
    + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.",
)

repeat_penalty_field = Field(
    default=1.1,
    ge=0.0,
    description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n"
    + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.",
)

presence_penalty_field = Field(
    default=0.0,
    ge=-2.0,
    le=2.0,
    description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
)

frequency_penalty_field = Field(
    default=0.0,
    ge=-2.0,
    le=2.0,
    description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
)

mirostat_mode_field = Field(
    default=0,
    ge=0,
    le=2,
    description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)"
)

mirostat_tau_field = Field(
    default=5.0,
    ge=0.0,
    le=10.0,
    description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text"
)

mirostat_eta_field = Field(
    default=0.1,
    ge=0.001,
    le=1.0,
    description="Mirostat learning rate"
)


class CreateCompletionRequest(BaseModel):
    prompt: Union[str, List[str]] = Field(
        default="", description="The prompt to generate completions for."
    )
    suffix: Optional[str] = Field(
        default=None,
        description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.",
    )
    max_tokens: int = max_tokens_field
    temperature: float = temperature_field
    top_p: float = top_p_field
    mirostat_mode: int = mirostat_mode_field
    mirostat_tau: float = mirostat_tau_field
    mirostat_eta: float = mirostat_eta_field
    echo: bool = Field(
        default=False,
        description="Whether to echo the prompt in the generated text. Useful for chatbots.",
    )
    stop: Optional[Union[str, List[str]]] = stop_field
    stream: bool = stream_field
    logprobs: Optional[int] = Field(
        default=None,
        ge=0,
        description="The number of logprobs to generate. If None, no logprobs are generated.",
    )
    presence_penalty: Optional[float] = presence_penalty_field
    frequency_penalty: Optional[float] = frequency_penalty_field
    logit_bias: Optional[Dict[str, float]] = Field(None)
    logprobs: Optional[int] = Field(None)

    # ignored or currently unsupported
    model: Optional[str] = model_field
    n: Optional[int] = 1
    best_of: Optional[int] = 1
    user: Optional[str] = Field(None)

    # llama.cpp specific parameters
    top_k: int = top_k_field
    repeat_penalty: float = repeat_penalty_field
    logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)

    class Config:
        schema_extra = {
            "example": {
                "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
                "stop": ["\n", "###"],
            }
        }


def make_logit_bias_processor(
    llama: llama_cpp.Llama,
    logit_bias: Dict[str, float],
    logit_bias_type: Optional[Literal["input_ids", "tokens"]],
):
    if logit_bias_type is None:
        logit_bias_type = "input_ids"

    to_bias: Dict[int, float] = {}
    if logit_bias_type == "input_ids":
        for input_id, score in logit_bias.items():
            input_id = int(input_id)
            to_bias[input_id] = score

    elif logit_bias_type == "tokens":
        for token, score in logit_bias.items():
            token = token.encode('utf-8')
            for input_id in llama.tokenize(token, add_bos=False):
                to_bias[input_id] = score

    def logit_bias_processor(
        input_ids: List[int],
        scores: List[float],
    ) -> List[float]:
        new_scores = [None] * len(scores)
        for input_id, score in enumerate(scores):
            new_scores[input_id] = score + to_bias.get(input_id, 0.0)

        return new_scores

    return logit_bias_processor
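The two `logit_bias_type` modes above differ only in how the request's keys are resolved to token ids. A small sketch of both call forms, assuming `llama` is an already loaded `llama_cpp.Llama` instance; the ids and strings are placeholders, and real values depend on the model's vocabulary:

```python
# "input_ids": keys are token ids given as strings.
processor = make_logit_bias_processor(llama, {"15043": 5.0}, "input_ids")

# "tokens": keys are text, encoded to UTF-8 and tokenized with add_bos=False.
processor = make_logit_bias_processor(llama, {" Hello": 5.0}, "tokens")

# The routes below then wrap the returned callable like this:
# kwargs["logits_processor"] = llama_cpp.LogitsProcessorList([processor])
```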
@router.post(
    "/v1/completions",
)
async def create_completion(
    request: Request,
    body: CreateCompletionRequest,
    llama: llama_cpp.Llama = Depends(get_llama),
):
    if isinstance(body.prompt, list):
        assert len(body.prompt) <= 1
        body.prompt = body.prompt[0] if len(body.prompt) > 0 else ""

    exclude = {
        "n",
        "best_of",
        "logit_bias",
        "logit_bias_type",
        "user",
    }
    kwargs = body.dict(exclude=exclude)

    if body.logit_bias is not None:
        kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([
            make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type),
        ])

    if body.stream:
        send_chan, recv_chan = anyio.create_memory_object_stream(10)

        async def event_publisher(inner_send_chan: MemoryObjectSendStream):
            async with inner_send_chan:
                try:
                    iterator: Iterator[llama_cpp.CompletionChunk] = await run_in_threadpool(llama, **kwargs)  # type: ignore
                    async for chunk in iterate_in_threadpool(iterator):
                        await inner_send_chan.send(dict(data=json.dumps(chunk)))
                        if await request.is_disconnected():
                            raise anyio.get_cancelled_exc_class()()
                        if settings.interrupt_requests and llama_outer_lock.locked():
                            await inner_send_chan.send(dict(data="[DONE]"))
                            raise anyio.get_cancelled_exc_class()()
                    await inner_send_chan.send(dict(data="[DONE]"))
                except anyio.get_cancelled_exc_class() as e:
                    print("disconnected")
                    with anyio.move_on_after(1, shield=True):
                        print(
                            f"Disconnected from client (via refresh/close) {request.client}"
                        )
                        raise e

        return EventSourceResponse(
            recv_chan, data_sender_callable=partial(event_publisher, send_chan)
        )
    else:
        completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs)  # type: ignore
        return completion
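The completion route accepts the OpenAI-style payload defined by `CreateCompletionRequest`. A minimal non-streaming client sketch, assuming the server is listening on the default `localhost:8000` and using `httpx` (a dev dependency later in this diff); the prompt mirrors the schema example above:

```python
# Minimal non-streaming client sketch against the /v1/completions route.
import httpx

payload = {
    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
    "stop": ["\n", "###"],
    "max_tokens": 16,
}
response = httpx.post("http://localhost:8000/v1/completions", json=payload, timeout=60.0)
response.raise_for_status()
print(response.json()["choices"][0]["text"])
```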
class CreateEmbeddingRequest(BaseModel):
    model: Optional[str] = model_field
    input: Union[str, List[str]] = Field(description="The input to embed.")
    user: Optional[str]

    class Config:
        schema_extra = {
            "example": {
                "input": "The food was delicious and the waiter...",
            }
        }


@router.post(
    "/v1/embeddings",
)
async def create_embedding(
    request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
):
    return await run_in_threadpool(
        llama.create_embedding, **request.dict(exclude={"user"})
    )


class ChatCompletionRequestMessage(BaseModel):
    role: Literal["system", "user", "assistant"] = Field(
        default="user", description="The role of the message."
    )
    content: str = Field(default="", description="The content of the message.")


class CreateChatCompletionRequest(BaseModel):
    messages: List[ChatCompletionRequestMessage] = Field(
        default=[], description="A list of messages to generate completions for."
    )
    max_tokens: int = max_tokens_field
    temperature: float = temperature_field
    top_p: float = top_p_field
    mirostat_mode: int = mirostat_mode_field
    mirostat_tau: float = mirostat_tau_field
    mirostat_eta: float = mirostat_eta_field
    stop: Optional[List[str]] = stop_field
    stream: bool = stream_field
    presence_penalty: Optional[float] = presence_penalty_field
    frequency_penalty: Optional[float] = frequency_penalty_field
    logit_bias: Optional[Dict[str, float]] = Field(None)

    # ignored or currently unsupported
    model: Optional[str] = model_field
    n: Optional[int] = 1
    user: Optional[str] = Field(None)

    # llama.cpp specific parameters
    top_k: int = top_k_field
    repeat_penalty: float = repeat_penalty_field
    logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)

    class Config:
        schema_extra = {
            "example": {
                "messages": [
                    ChatCompletionRequestMessage(
                        role="system", content="You are a helpful assistant."
                    ),
                    ChatCompletionRequestMessage(
                        role="user", content="What is the capital of France?"
                    ),
                ]
            }
        }


@router.post(
    "/v1/chat/completions",
)
async def create_chat_completion(
    request: Request,
    body: CreateChatCompletionRequest,
    llama: llama_cpp.Llama = Depends(get_llama),
    settings: Settings = Depends(get_settings),
) -> Union[llama_cpp.ChatCompletion]:  # type: ignore
    exclude = {
        "n",
        "logit_bias",
        "logit_bias_type",
        "user",
    }
    kwargs = body.dict(exclude=exclude)

    if body.logit_bias is not None:
        kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([
            make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type),
        ])

    if body.stream:
        send_chan, recv_chan = anyio.create_memory_object_stream(10)

        async def event_publisher(inner_send_chan: MemoryObjectSendStream):
            async with inner_send_chan:
                try:
                    iterator: Iterator[llama_cpp.ChatCompletionChunk] = await run_in_threadpool(llama.create_chat_completion, **kwargs)  # type: ignore
                    async for chat_chunk in iterate_in_threadpool(iterator):
                        await inner_send_chan.send(dict(data=json.dumps(chat_chunk)))
                        if await request.is_disconnected():
                            raise anyio.get_cancelled_exc_class()()
                        if settings.interrupt_requests and llama_outer_lock.locked():
                            await inner_send_chan.send(dict(data="[DONE]"))
                            raise anyio.get_cancelled_exc_class()()
                    await inner_send_chan.send(dict(data="[DONE]"))
                except anyio.get_cancelled_exc_class() as e:
                    print("disconnected")
                    with anyio.move_on_after(1, shield=True):
                        print(
                            f"Disconnected from client (via refresh/close) {request.client}"
                        )
                        raise e

        return EventSourceResponse(
            recv_chan,
            data_sender_callable=partial(event_publisher, send_chan),
        )
    else:
        completion: llama_cpp.ChatCompletion = await run_in_threadpool(
            llama.create_chat_completion, **kwargs  # type: ignore
        )
        return completion
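When `stream` is true the route returns server-sent events: one JSON chunk per `data:` line and a final `[DONE]` sentinel, as produced by `event_publisher` above. A rough client sketch using `httpx` streaming, assuming the default `localhost:8000` and omitting error handling:

```python
# Rough sketch of consuming the SSE stream from /v1/chat/completions.
import json

import httpx

payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
    "stream": True,
}
with httpx.stream(
    "POST", "http://localhost:8000/v1/chat/completions", json=payload, timeout=None
) as response:
    for line in response.iter_lines():
        if not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        delta = chunk["choices"][0]["delta"]
        print(delta.get("content", ""), end="", flush=True)
```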
class ModelData(TypedDict):
    id: str
    object: Literal["model"]
    owned_by: str
    permissions: List[str]


class ModelList(TypedDict):
    object: Literal["list"]
    data: List[ModelData]


@router.get("/v1/models")
async def get_models(
    settings: Settings = Depends(get_settings),
) -> ModelList:
    assert llama is not None
    return {
        "object": "list",
        "data": [
            {
                "id": settings.model_alias
                if settings.model_alias is not None
                else llama.model_path,
                "object": "model",
                "owned_by": "me",
                "permissions": [],
            }
        ],
    }

784 poetry.lock (generated)
File diff suppressed because it is too large

3 poetry.toml (Normal file)
@@ -0,0 +1,3 @@
[virtualenvs]
in-project = true
prefer-active-python = true
pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "llama_cpp_python"
version = "0.1.30"
version = "0.1.68"
description = "Python bindings for the llama.cpp library"
authors = ["Andrei Betlen <abetlen@gmail.com>"]
license = "MIT"

@@ -14,16 +14,25 @@ include = [

[tool.poetry.dependencies]
python = "^3.8.1"
typing-extensions = "^4.5.0"
typing-extensions = "^4.7.1"
numpy = "^1.24.4"
diskcache = "^5.6.1"
uvicorn = { version = "^0.22.0", optional = true }
fastapi = { version = "^0.99.1", optional = true }
sse-starlette = { version = "^1.6.1", optional = true }

[tool.poetry.group.dev.dependencies]
black = "^23.1.0"
black = "^23.3.0"
twine = "^4.0.2"
mkdocs = "^1.4.2"
mkdocstrings = {extras = ["python"], version = "^0.20.0"}
mkdocs-material = "^9.1.4"
pytest = "^7.2.2"
mkdocs = "^1.4.3"
mkdocstrings = {extras = ["python"], version = "^0.22.0"}
mkdocs-material = "^9.1.18"
pytest = "^7.4.0"
httpx = "^0.24.1"
scikit-build = "0.17.6"

[tool.poetry.extras]
server = ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"]

[build-system]
requires = [

@@ -32,4 +41,4 @@ requires = [
"cmake>=3.18",
"ninja",
]
build-backend = "setuptools.build_meta"
build-backend = "setuptools.build_meta"
8 setup.py
@@ -10,17 +10,15 @@ setup(
    description="A Python wrapper for llama.cpp",
    long_description=long_description,
    long_description_content_type="text/markdown",
    version="0.1.30",
    version="0.1.68",
    author="Andrei Betlen",
    author_email="abetlen@gmail.com",
    license="MIT",
    package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"},
    packages=["llama_cpp", "llama_cpp.server"],
    install_requires=[
        "typing-extensions>=4.5.0",
    ],
    install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"],
    extras_require={
        "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"],
        "server": ["uvicorn>=0.22.1", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"],
    },
    python_requires=">=3.7",
    classifiers=[
tests/test_llama.py
@@ -14,14 +14,22 @@ def test_llama():
    assert llama.detokenize(llama.tokenize(text)) == text


# @pytest.mark.skip(reason="need to update sample mocking")
def test_llama_patch(monkeypatch):
    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
    n_vocab = llama_cpp.llama_n_vocab(llama.ctx)

    ## Set up mock function
    def mock_eval(*args, **kwargs):
        return 0

    def mock_get_logits(*args, **kwargs):
        return (llama_cpp.c_float * n_vocab)(
            *[llama_cpp.c_float(0) for _ in range(n_vocab)]
        )

    monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval)
    monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)

    output_text = " jumps over the lazy dog."
    output_tokens = llama.tokenize(output_text.encode("utf-8"))

@@ -36,7 +44,7 @@ def test_llama_patch(monkeypatch):
        else:
            return token_eos

    monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_top_p_top_k", mock_sample)
    monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_token", mock_sample)

    text = "The quick brown fox"

@@ -82,6 +90,7 @@ def test_llama_patch(monkeypatch):
def test_llama_pickle():
    import pickle
    import tempfile

    fp = tempfile.TemporaryFile()
    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
    pickle.dump(llama, fp)

@@ -93,4 +102,70 @@ def test_llama_pickle():

    text = b"Hello World"

    assert llama.detokenize(llama.tokenize(text)) == text
    assert llama.detokenize(llama.tokenize(text)) == text


def test_utf8(monkeypatch):
    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
    n_vocab = llama_cpp.llama_n_vocab(llama.ctx)

    ## Set up mock function
    def mock_eval(*args, **kwargs):
        return 0

    def mock_get_logits(*args, **kwargs):
        return (llama_cpp.c_float * n_vocab)(
            *[llama_cpp.c_float(0) for _ in range(n_vocab)]
        )

    monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval)
    monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)

    output_text = "😀"
    output_tokens = llama.tokenize(output_text.encode("utf-8"))
    token_eos = llama.token_eos()
    n = 0

    def mock_sample(*args, **kwargs):
        nonlocal n
        if n < len(output_tokens):
            n += 1
            return output_tokens[n - 1]
        else:
            return token_eos

    monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_token", mock_sample)

    ## Test basic completion with utf8 multibyte
    n = 0  # reset
    completion = llama.create_completion("", max_tokens=4)
    assert completion["choices"][0]["text"] == output_text

    ## Test basic completion with incomplete utf8 multibyte
    n = 0  # reset
    completion = llama.create_completion("", max_tokens=1)
    assert completion["choices"][0]["text"] == ""


def test_llama_server():
    from fastapi.testclient import TestClient
    from llama_cpp.server.app import create_app, Settings

    settings = Settings(
        model=MODEL,
        vocab_only=True,
    )
    app = create_app(settings)
    client = TestClient(app)
    response = client.get("/v1/models")
    assert response.json() == {
        "object": "list",
        "data": [
            {
                "id": MODEL,
                "object": "model",
                "owned_by": "me",
                "permissions": [],
            }
        ],
    }
2 vendor/llama.cpp (vendored)
@@ -1 +1 @@
Subproject commit 180b693a47b6b825288ef9f2c39d24b6eea4eea6
Subproject commit 061f5f8d2109bb7adcbd40f1b456d887c5a1df25