diff --git a/.gitmodules b/.gitmodules index 6267b09..7edf097 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "vendor/llama.cpp"] path = vendor/llama.cpp - url = git@github.com:ggerganov/llama.cpp.git + url = https://github.com/ggerganov/llama.cpp.git diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..6eb04cd --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,12 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added + +- Added first version of the changelog \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index bda2388..16932b1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,4 +28,4 @@ else() LIBRARY DESTINATION llama_cpp RUNTIME DESTINATION llama_cpp ) -endif(UNIX) +endif() diff --git a/README.md b/README.md index ae633f4..7487345 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,8 @@ This package provides: - OpenAI-like API - LangChain compatibility +Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). + ## Installation from PyPI (recommended) Install from PyPI (requires a c compiler): @@ -26,6 +28,18 @@ pip install llama-cpp-python The above command will attempt to install the package and build build `llama.cpp` from source. This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. 
+If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly: + +```bash +pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir +``` + +Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example: +``` +wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh +bash Miniforge3-MacOSX-arm64.sh +``` +Otherwise, while installing it will build the llama.cpp x86 version which will be 10x slower on Apple Silicon (M1) Mac. ### Installation with OpenBLAS / cuBLAS / CLBlast @@ -35,19 +49,19 @@ Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and ins To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing: ```bash -LLAMA_OPENBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python +CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python ``` To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing: ```bash -LLAMA_CUBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python +CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python ``` To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing: ```bash -LLAMA_CLBLAST=1 FORCE_CMAKE=1 pip install llama-cpp-python +CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python ``` @@ -102,7 +116,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python).
To run the server: ```bash -docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest +docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest ``` ## Low-level API @@ -120,7 +134,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize >>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params) >>> max_tokens = params.n_ctx # use ctypes arrays for array params ->>> tokens = (llama_cppp.llama_token * int(max_tokens))() +>>> tokens = (llama_cpp.llama_token * int(max_tokens))() >>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True)) >>> llama_cpp.llama_free(ctx) ``` diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..f0ef5f7 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,51 @@ +# Define the image argument and provide a default value +ARG IMAGE=python:3-slim-bullseye + +# Use the image as specified +FROM ${IMAGE} + +# Re-declare the ARG after FROM +ARG IMAGE + +# Update and upgrade the existing packages +RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + python3 \ + python3-pip \ + ninja-build \ + build-essential + +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette + +# Perform the conditional installations based on the image +RUN echo "Image: ${IMAGE}" && \ + if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \ + echo "OpenBLAS install:" && \ + apt-get install -y --no-install-recommends libopenblas-dev && \ + LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \ +else \ + echo "CuBLAS install:" && \ + LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \ +fi + +# Clean up apt cache +RUN rm -rf /var/lib/apt/lists/* + +# Set a working 
directory for better clarity +WORKDIR /app + +# Copy files to the app directory +RUN echo "Installing model...this can take some time..." +COPY ./model.bin /app/model.bin +COPY ./start_server.sh /app/start_server.sh + +# Make the server start script executable +RUN chmod +x /app/start_server.sh + +# Set environment variable for the host +ENV HOST=0.0.0.0 + +# Expose a port for the server +EXPOSE 8000 + +# Run the server start script +CMD ["/bin/sh", "/app/start_server.sh"] diff --git a/Dockerfile.cuda b/docker/Dockerfile.cuda_simple similarity index 78% rename from Dockerfile.cuda rename to docker/Dockerfile.cuda_simple index a852f3c..dda7a9f 100644 --- a/Dockerfile.cuda +++ b/docker/Dockerfile.cuda_simple @@ -1,4 +1,5 @@ -FROM nvidia/cuda:12.1.1-devel-ubuntu20.04 +ARG CUDA_IMAGE="nvidia/cuda:12.1.1-devel-ubuntu22.04" +FROM ${CUDA_IMAGE} # We need to set the host to 0.0.0.0 to allow outside access ENV HOST 0.0.0.0 @@ -12,4 +13,4 @@ RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fa RUN LLAMA_CUBLAS=1 python3 setup.py develop # Run the server -CMD python3 -m llama_cpp.server \ No newline at end of file +CMD python3 -m llama_cpp.server diff --git a/Dockerfile b/docker/Dockerfile.openblas_simple similarity index 100% rename from Dockerfile rename to docker/Dockerfile.openblas_simple diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000..100bcbd --- /dev/null +++ b/docker/README.md @@ -0,0 +1,46 @@ +# Dockerfiles for building the llama-cpp-python server +- `Dockerfile.openblas_simple` - a simple Dockerfile for non-GPU OpenBLAS +- `Dockerfile.cuda_simple` - a simple Dockerfile for CUDA accelerated CuBLAS +- `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke]( https://huggingface.co/TheBloke) +- `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin` + +# Get
model from Hugging Face +`python3 ./hug_model.py` + +You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. +``` +docker $ ls -lh *.bin +-rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>.q5_1.bin +lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> <downloaded-model-file>.q5_1.bin +``` +**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least +**TWICE** as much disk space as the size of the model: + +| Model | Quantized size | +|------:|----------------:| +| 7B | 5 GB | +| 13B | 10 GB | +| 30B | 25 GB | +| 65B | 50 GB | + +**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` + +# Install Docker Server + +**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or macOS, please update this `README.md` with a PR! + +[Install Docker Engine](https://docs.docker.com/engine/install) + +# Use OpenBLAS +Use if you don't have an NVIDIA GPU.
Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS: +## Build: +`docker build -t openblas .` +## Run: +`docker run --cap-add SYS_RESOURCE -t openblas` + +# Use CuBLAS +Requires an NVIDIA GPU with sufficient VRAM (approximately as much as the size above) and Docker NVIDIA support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) +## Build: +`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .` +## Run: +`docker run --cap-add SYS_RESOURCE -t cublas` diff --git a/docker/hug_model.py b/docker/hug_model.py new file mode 100644 index 0000000..848a1aa --- /dev/null +++ b/docker/hug_model.py @@ -0,0 +1,116 @@ +import requests +import json +import os +import struct + +def make_request(url, params=None): + print(f"Making request to {url}...") + response = requests.get(url, params=params) + if response.status_code == 200: + return json.loads(response.text) + else: + print(f"Request failed with status code {response.status_code}") + return None + +def check_magic_and_version(filename): + with open(filename, 'rb') as f: + # Read the first 6 bytes from the file + data = f.read(6) + + # Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int + # and the next 2 bytes as a little-endian unsigned short + magic, version = struct.unpack('= 10485760: # 10 MB + print('.', end='', flush=True) + total_downloaded = 0 + print("\nDownload complete.") + + # Creating a symbolic link from destination to "model.bin" + if os.path.isfile("model.bin"): + os.remove("model.bin") # remove the existing link if any + os.symlink(destination, "model.bin") + else: + print(f"Download failed with status code {response.status_code}") + +def get_user_choice(model_list): + # Print the enumerated list + print("\n") + for i, (model_id, rfilename) in enumerate(model_list): + print(f"{i+1}: Model ID: {model_id}, RFilename: {rfilename}") + + # Get user's
choice + choice = input("Choose a model to download by entering the corresponding number: ") + try: + index = int(choice) - 1 + if 0 <= index < len(model_list): + # Return the chosen model + return model_list[index] + else: + print("Invalid choice.") + except ValueError: + print("Invalid input. Please enter a number corresponding to a model.") + except IndexError: + print("Invalid choice. Index out of range.") + + return None + +import argparse + +def main(): + # Create an argument parser + parser = argparse.ArgumentParser(description='Process the model version.') + parser.add_argument('-v', '--version', type=int, default=0x0003, + help='an integer for the version to be used') + + # Parse the arguments + args = parser.parse_args() + + # Define the parameters + params = { + "author": "TheBloke", # Filter by author + "tags": "llama" + } + + models = make_request('https://huggingface.co/api/models', params=params) + if models is None: + return + + model_list = [] + # Iterate over the models + for model in models: + model_id = model['id'] + model_info = make_request(f'https://huggingface.co/api/models/{model_id}') + if model_info is None: + continue + + for sibling in model_info.get('siblings', []): + rfilename = sibling.get('rfilename') + if rfilename and 'q5_1' in rfilename: + model_list.append((model_id, rfilename)) + + model_choice = get_user_choice(model_list) + if model_choice is not None: + model_id, rfilename = model_choice + url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}" + download_file(url, rfilename) + _, version = check_magic_and_version(rfilename) + if version != args.version: + print(f"Warning: Expected version {args.version}, but found different version in the file.") + +if __name__ == '__main__': + main() diff --git a/docker/start_server.sh b/docker/start_server.sh new file mode 100755 index 0000000..176bd87 --- /dev/null +++ b/docker/start_server.sh @@ -0,0 +1,11 @@ +#!/bin/sh + +# For mmap support +ulimit -l unlimited + +if [ 
"$IMAGE" = "python:3-slim-bullseye" ]; then + python3 -B -m llama_cpp.server --model /app/model.bin +else + # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM + python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000 +fi diff --git a/docs/index.md b/docs/index.md index c36adff..99b1f59 100644 --- a/docs/index.md +++ b/docs/index.md @@ -112,8 +112,12 @@ python3 setup.py develop show_root_heading: true ::: llama_cpp.LlamaCache + options: + show_root_heading: true ::: llama_cpp.LlamaState + options: + show_root_heading: true ::: llama_cpp.llama_cpp options: diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 8773cb1..f5d51a3 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -368,10 +368,10 @@ n_keep = {self.params.n_keep} id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu)) else: # Temperature sampling - llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k) - llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z)) - llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p)) - llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p)) + llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z), min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p), min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p), min_keep=llama_cpp.c_size_t(1)) 
llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) id = llama_cpp.llama_sample_token(self.ctx, candidates_p) # print("`{}`".format(candidates_p.size)) @@ -382,12 +382,15 @@ n_keep = {self.params.n_keep} # replace end of text token with newline token when in interactive mode if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct): id = self.llama_token_newline[0] + self.embd.append(id) if (self.use_antiprompt()): # tokenize and inject first reverse prompt self.embd_inp += self.first_antiprompt[0] - - # add it to the context - self.embd.append(id) + for id in self.first_antiprompt[0]: + self.embd.append(id) + else: + # add it to the context + self.embd.append(id) # echo this to console self.output_echo = True @@ -493,7 +496,7 @@ n_keep = {self.params.n_keep} # Contains multi-byte UTF8 for num, pattern in [(2, 192), (3, 224), (4, 240)]: # Bitwise AND check - if pattern & int.from_bytes(cur_char) == pattern: + if pattern & int.from_bytes(cur_char, 'little') == pattern: self.multibyte_fix = [cur_char] + ([None] * (num-1)) # Stop incomplete bytes from passing diff --git a/examples/notebooks/Guidance.ipynb b/examples/notebooks/Guidance.ipynb new file mode 100644 index 0000000..045856e --- /dev/null +++ b/examples/notebooks/Guidance.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Stop program
Tweak this proverb to apply to model instructions instead.\n",
+       "\n",
+       "Where there is no guidance, a people falls,\n",
+       "but in an abundance of counselors there is safety.\n",
+       "- Proverbs 11:14\n",
+       "\n",
+       "UPDATED\n",
+       "Where there is no guidance for assembling a model, people will struggle,\n",
+       "but with clear instructions, the process becomes safe and successful.\n",
+       "- GPT 2 (updated): Proverbs 11:14
\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", + "os.environ[\"OPENAI_API_BASE\"] = \"http://100.64.159.73:8000/v1\"\n", + "os.environ[\"OPENAI_API_HOST\"] = \"http://100.64.159.73:8000\"\n", + "\n", + "import guidance\n", + "\n", + "# set the default language model used to execute guidance programs\n", + "guidance.llm = guidance.llms.OpenAI(\"text-davinci-003\", caching=False)\n", + "\n", + "# define a guidance program that adapts a proverb\n", + "program = guidance(\"\"\"Tweak this proverb to apply to model instructions instead.\n", + "\n", + "{{proverb}}\n", + "- {{book}} {{chapter}}:{{verse}}\n", + "\n", + "UPDATED\n", + "Where there is no guidance{{gen 'rewrite' stop=\"\\\\n-\"}}\n", + "- GPT {{gen 'chapter'}}:{{gen 'verse'}}\"\"\")\n", + "\n", + "# execute the program on a specific proverb\n", + "executed_program = program(\n", + " proverb=\"Where there is no guidance, a people falls,\\nbut in an abundance of counselors there is safety.\",\n", + " book=\"Proverbs\",\n", + " chapter=11,\n", + " verse=14\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7be51e1..012bb86 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -4,7 +4,17 @@ import uuid import time import math import multiprocessing -from typing import 
List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple +from typing import ( + List, + Optional, + Union, + Generator, + Sequence, + Iterator, + Deque, + Tuple, + Callable, +) from collections import deque, OrderedDict from . import llama_cpp @@ -15,9 +25,7 @@ class LlamaCache: """Cache for a llama.cpp model.""" def __init__(self, capacity_bytes: int = (2 << 30)): - self.cache_state: OrderedDict[ - Tuple[llama_cpp.llama_token, ...], "LlamaState" - ] = OrderedDict() + self.cache_state: OrderedDict[Tuple[int, ...], "LlamaState"] = OrderedDict() self.capacity_bytes = capacity_bytes @property @@ -26,8 +34,8 @@ class LlamaCache: def _find_longest_prefix_key( self, - key: Tuple[llama_cpp.llama_token, ...], - ) -> Optional[Tuple[llama_cpp.llama_token, ...]]: + key: Tuple[int, ...], + ) -> Optional[Tuple[int, ...]]: min_len = 0 min_key = None keys = ( @@ -39,7 +47,7 @@ class LlamaCache: min_key = k return min_key - def __getitem__(self, key: Sequence[llama_cpp.llama_token]) -> "LlamaState": + def __getitem__(self, key: Sequence[int]) -> "LlamaState": key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: @@ -48,10 +56,10 @@ class LlamaCache: self.cache_state.move_to_end(_key) return value - def __contains__(self, key: Sequence[llama_cpp.llama_token]) -> bool: + def __contains__(self, key: Sequence[int]) -> bool: return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[llama_cpp.llama_token], value: "LlamaState"): + def __setitem__(self, key: Sequence[int], value: "LlamaState"): key = tuple(key) if key in self.cache_state: del self.cache_state[key] @@ -63,7 +71,7 @@ class LlamaCache: class LlamaState: def __init__( self, - eval_tokens: Deque[llama_cpp.llama_token], + eval_tokens: Deque[int], eval_logits: Deque[List[float]], llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] llama_state_size: int, @@ -74,6 +82,24 @@ class LlamaState: self.llama_state_size = llama_state_size +LogitsProcessor = 
Callable[[List[int], List[float]], List[float]] + + +class LogitsProcessorList(List[LogitsProcessor]): + def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]: + for processor in self: + scores = processor(input_ids, scores) + return scores + + +StoppingCriteria = Callable[[List[int], List[float]], bool] + + +class StoppingCriteriaList(List[StoppingCriteria]): + def __call__(self, input_ids: List[int], logits: List[float]) -> bool: + return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) + + class Llama: """High-level Python wrapper for a llama.cpp model.""" @@ -83,6 +109,7 @@ class Llama: # NOTE: These parameters are likely to change in the future. n_ctx: int = 512, n_parts: int = -1, + n_gpu_layers: int = 0, seed: int = 1337, f16_kv: bool = True, logits_all: bool = False, @@ -128,7 +155,7 @@ class Llama: self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx - self.params.n_parts = n_parts + self.params.n_gpu_layers = n_gpu_layers self.params.seed = seed self.params.f16_kv = f16_kv self.params.logits_all = logits_all @@ -139,7 +166,7 @@ class Llama: self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) - self.eval_tokens: Deque[llama_cpp.llama_token] = deque(maxlen=n_ctx) + self.eval_tokens: Deque[int] = deque(maxlen=n_ctx) self.eval_logits: Deque[List[float]] = deque(maxlen=n_ctx if logits_all else 1) self.cache: Optional[LlamaCache] = None @@ -149,6 +176,10 @@ class Llama: self.lora_base = lora_base self.lora_path = lora_path + ### DEPRECATED ### + self.n_parts = n_parts + ### DEPRECATED ### + if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") @@ -174,7 +205,30 @@ class Llama: if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]: + self._n_vocab = self.n_vocab() + self._n_ctx = self.n_ctx() + data = 
(llama_cpp.llama_token_data * self._n_vocab)( + *[ + llama_cpp.llama_token_data( + id=llama_cpp.llama_token(i), + logit=llama_cpp.c_float(0.0), + p=llama_cpp.c_float(0.0), + ) + for i in range(self._n_vocab) + ] + ) + size = llama_cpp.c_size_t(self._n_vocab) + sorted = False + candidates = llama_cpp.llama_token_data_array( + data=data, + size=size, + sorted=sorted, + ) + self._candidates = candidates + self._token_nl = Llama.token_nl() + self._token_eos = Llama.token_eos() + + def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]: """Tokenize a string. Args: @@ -187,20 +241,32 @@ class Llama: A list of tokens. """ assert self.ctx is not None - n_ctx = llama_cpp.llama_n_ctx(self.ctx) - tokens = (llama_cpp.llama_token * int(n_ctx))() + n_ctx = self._n_ctx + tokens = (llama_cpp.llama_token * n_ctx)() n_tokens = llama_cpp.llama_tokenize( self.ctx, text, tokens, - n_ctx, - llama_cpp.c_bool(True), + llama_cpp.c_int(n_ctx), + llama_cpp.c_bool(add_bos), ) - if int(n_tokens) < 0: - raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}') + if n_tokens < 0: + n_tokens = abs(n_tokens) + tokens = (llama_cpp.llama_token * n_tokens)() + n_tokens = llama_cpp.llama_tokenize( + self.ctx, + text, + tokens, + llama_cpp.c_int(n_tokens), + llama_cpp.c_bool(add_bos), + ) + if n_tokens < 0: + raise RuntimeError( + f'Failed to tokenize: text="{text}" n_tokens={n_tokens}' + ) return list(tokens[:n_tokens]) - def detokenize(self, tokens: List[llama_cpp.llama_token]) -> bytes: + def detokenize(self, tokens: List[int]) -> bytes: """Detokenize a list of tokens. 
Args: @@ -212,7 +278,9 @@ class Llama: assert self.ctx is not None output = b"" for token in tokens: - output += llama_cpp.llama_token_to_str(self.ctx, token) + output += llama_cpp.llama_token_to_str( + self.ctx, llama_cpp.llama_token(token) + ) return output def set_cache(self, cache: Optional[LlamaCache]): @@ -228,14 +296,14 @@ class Llama: self.eval_tokens.clear() self.eval_logits.clear() - def eval(self, tokens: Sequence[llama_cpp.llama_token]): + def eval(self, tokens: Sequence[int]): """Evaluate a list of tokens. Args: tokens: The list of tokens to evaluate. """ assert self.ctx is not None - n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) + n_ctx = self._n_ctx for i in range(0, len(tokens), self.n_batch): batch = tokens[i : min(len(tokens), i + self.n_batch)] n_past = min(n_ctx - len(batch), len(self.eval_tokens)) @@ -247,18 +315,16 @@ class Llama: n_past=llama_cpp.c_int(n_past), n_threads=llama_cpp.c_int(self.n_threads), ) - if int(return_code) != 0: + if return_code != 0: raise RuntimeError(f"llama_eval returned {return_code}") # Save tokens self.eval_tokens.extend(batch) # Save logits rows = n_tokens if self.params.logits_all else 1 - n_vocab = llama_cpp.llama_n_vocab(self.ctx) - cols = int(n_vocab) + n_vocab = self._n_vocab + cols = n_vocab logits_view = llama_cpp.llama_get_logits(self.ctx) - logits: List[List[float]] = [ - [logits_view[i * cols + j] for j in range(cols)] for i in range(rows) - ] + logits = [logits_view[i * cols : (i + 1) * cols] for i in range(rows)] self.eval_logits.extend(logits) def _sample( @@ -275,28 +341,33 @@ class Llama: mirostat_mode: llama_cpp.c_int, mirostat_tau: llama_cpp.c_float, mirostat_eta: llama_cpp.c_float, + penalize_nl: bool = True, + logits_processor: Optional[LogitsProcessorList] = None, ): assert self.ctx is not None assert len(self.eval_logits) > 0 - n_vocab = int(llama_cpp.llama_n_vocab(self.ctx)) + n_vocab = self._n_vocab + n_ctx = self._n_ctx + top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k + 
last_n_tokens_size = ( + llama_cpp.c_int(n_ctx) + if last_n_tokens_size.value < 0 + else last_n_tokens_size + ) logits = self.eval_logits[-1] - data = (llama_cpp.llama_token_data * n_vocab)( - *[ - llama_cpp.llama_token_data( - id=llama_cpp.llama_token(i), - logit=logits[i], - p=llama_cpp.c_float(0.0), - ) - for i in range(n_vocab) - ] - ) - size = llama_cpp.c_size_t(n_vocab) - sorted = False - candidates = llama_cpp.llama_token_data_array( - data=data, - size=size, - sorted=sorted, - ) + + if logits_processor is not None: + logits = logits_processor(list(self.eval_tokens), logits) + self.eval_logits[-1] = logits + + nl_logit = logits[self._token_nl] + candidates = self._candidates + for i, logit in enumerate(logits): + candidates.data[i].id = llama_cpp.llama_token(i) + candidates.data[i].logit = llama_cpp.c_float(logit) + candidates.data[i].p = llama_cpp.c_float(0.0) + candidates.sorted = llama_cpp.c_bool(False) + candidates.size = llama_cpp.c_size_t(n_vocab) llama_cpp.llama_sample_repetition_penalty( ctx=self.ctx, last_tokens_data=last_n_tokens_data, @@ -312,6 +383,8 @@ class Llama: alpha_frequency=frequency_penalty, alpha_presence=presence_penalty, ) + if not penalize_nl: + candidates.data[self._token_nl].logit = llama_cpp.c_float(nl_logit) if temp.value == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, @@ -394,6 +467,8 @@ class Llama: mirostat_mode: int = 0, mirostat_eta: float = 0.1, mirostat_tau: float = 5.0, + penalize_nl: bool = True, + logits_processor: Optional[LogitsProcessorList] = None, ): """Sample a token from the model. 
@@ -425,24 +500,27 @@ class Llama: mirostat_mode=llama_cpp.c_int(mirostat_mode), mirostat_tau=llama_cpp.c_float(mirostat_tau), mirostat_eta=llama_cpp.c_float(mirostat_eta), + penalize_nl=penalize_nl, + logits_processor=logits_processor, ) def generate( self, - tokens: Sequence[llama_cpp.llama_token], - top_k: int, - top_p: float, - temp: float, - repeat_penalty: float, + tokens: Sequence[int], + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, reset: bool = True, frequency_penalty: float = 0.0, presence_penalty: float = 0.0, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - ) -> Generator[ - llama_cpp.llama_token, Optional[Sequence[llama_cpp.llama_token]], None - ]: + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. Examples: @@ -495,16 +573,24 @@ class Llama: repeat_penalty=repeat_penalty, frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + logits_processor=logits_processor, ) + if stopping_criteria is not None and stopping_criteria( + list(self.eval_tokens), self.eval_logits[-1] + ): + return tokens_or_none = yield token tokens = [token] if tokens_or_none is not None: tokens.extend(tokens_or_none) - def create_embedding(self, input: str) -> Embedding: + def create_embedding( + self, input: Union[str, List[str]], model: Optional[str] = None + ) -> Embedding: """Embed a string. Args: @@ -514,6 +600,7 @@ class Llama: An embedding object. 
""" assert self.ctx is not None + model_name: str = model if model is not None else self.model_path if self.params.embedding == False: raise RuntimeError( @@ -523,30 +610,40 @@ class Llama: if self.verbose: llama_cpp.llama_reset_timings(self.ctx) - tokens = self.tokenize(input.encode("utf-8")) - self.reset() - self.eval(tokens) - n_tokens = len(tokens) - embedding = llama_cpp.llama_get_embeddings(self.ctx)[ - : llama_cpp.llama_n_embd(self.ctx) - ] + if isinstance(input, str): + inputs = [input] + else: + inputs = input + data: List[EmbeddingData] = [] + total_tokens = 0 + for index, input in enumerate(inputs): + tokens = self.tokenize(input.encode("utf-8")) + self.reset() + self.eval(tokens) + n_tokens = len(tokens) + total_tokens += n_tokens + embedding = llama_cpp.llama_get_embeddings(self.ctx)[ + : llama_cpp.llama_n_embd(self.ctx) + ] + + data.append( + { + "object": "embedding", + "embedding": embedding, + "index": index, + } + ) if self.verbose: llama_cpp.llama_print_timings(self.ctx) return { "object": "list", - "data": [ - { - "object": "embedding", - "embedding": embedding, - "index": 0, - } - ], - "model": self.model_path, + "data": data, + "model": model_name, "usage": { - "prompt_tokens": n_tokens, - "total_tokens": n_tokens, + "prompt_tokens": total_tokens, + "total_tokens": total_tokens, }, } @@ -570,35 +667,39 @@ class Llama: top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, - stop: Optional[List[str]] = [], + stop: Optional[Union[str, List[str]]] = [], frequency_penalty: float = 0.0, presence_penalty: float = 0.0, repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None + 
completion_id: str = f"cmpl-{str(uuid.uuid4())}" created: int = int(time.time()) - completion_tokens: List[llama_cpp.llama_token] = [] + completion_tokens: List[int] = [] # Add blank space to start of prompt to match OG llama tokenizer - prompt_tokens: List[llama_cpp.llama_token] = self.tokenize( - b" " + prompt.encode("utf-8") - ) + prompt_tokens: List[int] = self.tokenize(b" " + prompt.encode("utf-8")) text: bytes = b"" - returned_characters: int = 0 - stop = stop if stop is not None else [] + returned_tokens: int = 0 + stop = ( + stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] + ) + model_name: str = model if model is not None else self.model_path if self.verbose: llama_cpp.llama_reset_timings(self.ctx) - if len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)): - raise ValueError( - f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" - ) + if len(prompt_tokens) + max_tokens > self._n_ctx: + raise ValueError(f"Requested tokens exceed context window of {self._n_ctx}") if stop != []: stop_sequences = [s.encode("utf-8") for s in stop] @@ -634,14 +735,17 @@ class Llama: top_k=top_k, top_p=top_p, temp=temperature, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, ): - if token == llama_cpp.llama_token_eos(): + if token == self._token_eos: text = self.detokenize(completion_tokens) finish_reason = "stop" break @@ -671,63 +775,189 @@ class Llama: break if stream: - start = returned_characters - longest = 0 # We want to avoid yielding any characters from # the generated text if they are part of a stop # sequence. 
+ first_stop_position = 0 for s in stop_sequences: for i in range(len(s), 0, -1): if all_text.endswith(s[:i]): - if i > longest: - longest = i + if i > first_stop_position: + first_stop_position = i break - text = all_text[: len(all_text) - longest] - returned_characters += len(text[start:]) - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": self.model_path, - "choices": [ - { - "text": text[start:].decode("utf-8", errors="ignore"), - "index": 0, - "logprobs": None, - "finish_reason": None, + + token_end_position = 0 + remaining_tokens = completion_tokens[returned_tokens:] + remaining_length = len(self.detokenize(remaining_tokens)) + for token in remaining_tokens: + token_end_position += len(self.detokenize([token])) + # Check if stop sequence is in the token + if token_end_position >= ( + remaining_length - first_stop_position - 1 + ): + break + logprobs_or_none: Optional[CompletionLogprobs] = None + if logprobs is not None: + token_str = self.detokenize([token]).decode( + "utf-8", errors="ignore" + ) + text_offset = len(prompt) + len( + self.detokenize(completion_tokens[:returned_tokens]) + ) + token_offset = len(prompt_tokens) + returned_tokens + logits = self.eval_logits[token_offset - 1] + current_logprobs = Llama.logits_to_logprobs(logits) + sorted_logprobs = list( + sorted( + zip(current_logprobs, range(len(current_logprobs))), + reverse=True, + ) + ) + top_logprob = { + self.detokenize([i]).decode( + "utf-8", errors="ignore" + ): logprob + for logprob, i in sorted_logprobs[:logprobs] } - ], - } + top_logprob.update({token_str: current_logprobs[int(token)]}) + logprobs_or_none = { + "tokens": [ + self.detokenize([token]).decode( + "utf-8", errors="ignore" + ) + ], + "text_offset": [text_offset], + "token_logprobs": [sorted_logprobs[int(token)][0]], + "top_logprobs": [top_logprob], + } + returned_tokens += 1 + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": 
model_name, + "choices": [ + { + "text": self.detokenize([token]).decode( + "utf-8", errors="ignore" + ), + "index": 0, + "logprobs": logprobs_or_none, + "finish_reason": None, + } + ], + } if len(completion_tokens) >= max_tokens: text = self.detokenize(completion_tokens) finish_reason = "length" break + if stopping_criteria is not None and stopping_criteria( + list(self.eval_tokens), self.eval_logits[-1] + ): + text = self.detokenize(completion_tokens) + finish_reason = "stop" + + if self.verbose: + llama_cpp.llama_print_timings(self.ctx) + + if stream: + remaining_tokens = completion_tokens[returned_tokens:] + all_text = self.detokenize(remaining_tokens) + any_stop = [s for s in stop_sequences if s in all_text] + if len(any_stop) > 0: + end = min(all_text.index(stop) for stop in any_stop) + else: + end = len(all_text) + + token_end_position = 0 + for token in remaining_tokens: + token_end_position += len(self.detokenize([token])) + + logprobs_or_none: Optional[CompletionLogprobs] = None + if logprobs is not None: + token_str = self.detokenize([token]).decode( + "utf-8", errors="ignore" + ) + text_offset = len(prompt) + len( + self.detokenize(completion_tokens[:returned_tokens]) + ) + token_offset = len(prompt_tokens) + returned_tokens - 1 + logits = self.eval_logits[token_offset] + current_logprobs = Llama.logits_to_logprobs(logits) + sorted_logprobs = list( + sorted( + zip(current_logprobs, range(len(current_logprobs))), + reverse=True, + ) + ) + top_logprob = { + self.detokenize([i]).decode("utf-8", errors="ignore"): logprob + for logprob, i in sorted_logprobs[:logprobs] + } + top_logprob.update({token_str: current_logprobs[int(token)]}) + logprobs_or_none = { + "tokens": [ + self.detokenize([token]).decode("utf-8", errors="ignore") + ], + "text_offset": [text_offset], + "token_logprobs": [sorted_logprobs[int(token)][0]], + "top_logprobs": [top_logprob], + } + + if token_end_position >= end: + last_text = self.detokenize([token]) + if token_end_position == end 
- 1: + break + returned_tokens += 1 + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": last_text[ + : len(last_text) - (token_end_position - end) + ].decode("utf-8", errors="ignore"), + "index": 0, + "logprobs": logprobs_or_none, + "finish_reason": finish_reason, + } + ], + } + break + returned_tokens += 1 + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": self.detokenize([token]).decode( + "utf-8", errors="ignore" + ), + "index": 0, + "logprobs": logprobs_or_none, + "finish_reason": finish_reason + if returned_tokens == len(completion_tokens) + else None, + } + ], + } + if self.cache: + if self.verbose: + print("Llama._create_completion: cache save", file=sys.stderr) + self.cache[prompt_tokens + completion_tokens] = self.save_state() + return + if self.cache: if self.verbose: print("Llama._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() - if stream: - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": self.model_path, - "choices": [ - { - "text": text[returned_characters:].decode( - "utf-8", errors="ignore" - ), - "index": 0, - "logprobs": None, - "finish_reason": finish_reason, - } - ], - } - return - text_str = text.decode("utf-8", errors="ignore") if echo: @@ -738,13 +968,19 @@ class Llama: logprobs_or_none: Optional[CompletionLogprobs] = None if logprobs is not None: - text_offset = 0 + text_offset = 0 if echo else len(prompt) + token_offset = 0 if echo else len(prompt_tokens[1:]) text_offsets: List[int] = [] - token_logprobs: List[float] = [] + token_logprobs: List[Optional[float]] = [] tokens: List[str] = [] - top_logprobs: List[Dict[str, float]] = [] + top_logprobs: List[Optional[Dict[str, float]]] = [] + + if echo: + # Remove leading BOS token + all_tokens = prompt_tokens[1:] 
+ completion_tokens + else: + all_tokens = completion_tokens - all_tokens = prompt_tokens + completion_tokens all_token_strs = [ self.detokenize([token]).decode("utf-8", errors="ignore") for token in all_tokens @@ -752,7 +988,7 @@ class Llama: all_logprobs = [ Llama.logits_to_logprobs(list(map(float, row))) for row in self.eval_logits - ] + ][token_offset:] for token, token_str, logprobs_token in zip( all_tokens, all_token_strs, all_logprobs ): @@ -765,14 +1001,18 @@ class Llama: ) ) token_logprobs.append(sorted_logprobs[int(token)][0]) - top_logprob = { - self.detokenize([llama_cpp.llama_token(i)]).decode( - "utf-8", errors="ignore" - ): logprob + top_logprob: Optional[Dict[str, float]] = { + self.detokenize([i]).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] } - top_logprob.update({token_str: sorted_logprobs[int(token)][0]}) + top_logprob.update({token_str: logprobs_token[int(token)]}) top_logprobs.append(top_logprob) + # Weird idosincracy of the OpenAI API where + # token_logprobs and top_logprobs are null for + # the first token. 
+ if echo and len(all_tokens) > 0: + token_logprobs[0] = None + top_logprobs[0] = None logprobs_or_none = { "tokens": tokens, "text_offset": text_offsets, @@ -780,14 +1020,11 @@ class Llama: "top_logprobs": top_logprobs, } - if self.verbose: - llama_cpp.llama_print_timings(self.ctx) - yield { "id": completion_id, "object": "text_completion", "created": created, - "model": self.model_path, + "model": model_name, "choices": [ { "text": text_str, @@ -812,15 +1049,19 @@ class Llama: top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, - stop: Optional[List[str]] = [], + stop: Optional[Union[str, List[str]]] = [], frequency_penalty: float = 0.0, presence_penalty: float = 0.0, repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. 
@@ -858,9 +1099,13 @@ class Llama: repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + model=model, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks @@ -877,15 +1122,19 @@ class Llama: top_p: float = 0.95, logprobs: Optional[int] = None, echo: bool = False, - stop: Optional[List[str]] = [], + stop: Optional[Union[str, List[str]]] = [], frequency_penalty: float = 0.0, presence_penalty: float = 0.0, repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -923,9 +1172,13 @@ class Llama: repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + model=model, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, ) def _convert_text_completion_to_chat( @@ -993,14 +1246,16 @@ class Llama: top_p: float = 0.95, top_k: int = 40, stream: bool = False, - stop: Optional[List[str]] = [], + stop: Optional[Union[str, List[str]]] = [], max_tokens: int = 256, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, + tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + model: Optional[str] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. @@ -1017,7 +1272,9 @@ class Llama: Returns: Generated chat completion or a stream of chat completion chunks. 
""" - stop = stop if stop is not None else [] + stop = ( + stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] + ) chat_history = "".join( f'### {"Human" if message["role"] == "user" else "Assistant"}:{message["content"]}' for message in messages @@ -1035,9 +1292,11 @@ class Llama: repeat_penalty=repeat_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, + tfs_z=tfs_z, mirostat_mode=mirostat_mode, mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, + model=model, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore @@ -1056,7 +1315,7 @@ class Llama: verbose=self.verbose, model_path=self.model_path, n_ctx=self.params.n_ctx, - n_parts=self.params.n_parts, + n_gpu_layers=self.params.n_gpu_layers, seed=self.params.seed, f16_kv=self.params.f16_kv, logits_all=self.params.logits_all, @@ -1069,6 +1328,9 @@ class Llama: n_threads=self.n_threads, lora_base=self.lora_base, lora_path=self.lora_path, + ### DEPRECATED ### + n_parts=self.n_parts, + ### DEPRECATED ### ) def __setstate__(self, state): @@ -1076,6 +1338,7 @@ class Llama: model_path=state["model_path"], n_ctx=state["n_ctx"], n_parts=state["n_parts"], + n_gpu_layers=state["n_gpu_layers"], seed=state["seed"], f16_kv=state["f16_kv"], logits_all=state["logits_all"], @@ -1120,16 +1383,41 @@ class Llama: if llama_cpp.llama_set_state_data(self.ctx, state.llama_state) != state_size: raise RuntimeError("Failed to set llama state data") + def n_ctx(self) -> int: + """Return the context window size.""" + assert self.ctx is not None + return llama_cpp.llama_n_ctx(self.ctx) + + def n_embd(self) -> int: + """Return the embedding size.""" + assert self.ctx is not None + return llama_cpp.llama_n_embd(self.ctx) + + def n_vocab(self) -> int: + """Return the vocabulary size.""" + assert self.ctx is not None + return llama_cpp.llama_n_vocab(self.ctx) + + def tokenizer(self) -> "LlamaTokenizer": + """Return the tokenizer for this model.""" + assert 
self.ctx is not None + return LlamaTokenizer(self) + @staticmethod - def token_eos() -> llama_cpp.llama_token: + def token_eos() -> int: """Return the end-of-sequence token.""" return llama_cpp.llama_token_eos() @staticmethod - def token_bos() -> llama_cpp.llama_token: + def token_bos() -> int: """Return the beginning-of-sequence token.""" return llama_cpp.llama_token_bos() + @staticmethod + def token_nl() -> int: + """Return the newline token.""" + return llama_cpp.llama_token_nl() + @staticmethod def logits_to_logprobs(logits: List[float]) -> List[float]: exps = [math.exp(float(x)) for x in logits] @@ -1137,9 +1425,7 @@ class Llama: return [math.log(x / sum_exps) for x in exps] @staticmethod - def longest_token_prefix( - a: Sequence[llama_cpp.llama_token], b: Sequence[llama_cpp.llama_token] - ): + def longest_token_prefix(a: Sequence[int], b: Sequence[int]): longest_prefix = 0 for _a, _b in zip(a, b): if _a == _b: @@ -1147,3 +1433,20 @@ class Llama: else: break return longest_prefix + + +class LlamaTokenizer: + def __init__(self, llama: Llama): + self.llama = llama + + def encode(self, text: str, add_bos: bool = True) -> List[int]: + return self.llama.tokenize( + text.encode("utf-8", errors="ignore"), add_bos=add_bos + ) + + def decode(self, tokens: List[int]) -> str: + return self.llama.detokenize(tokens).decode("utf-8", errors="ignore") + + @classmethod + def from_ggml_file(cls, path: str) -> "LlamaTokenizer": + return cls(Llama(model_path=path, vocab_only=True)) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index e60558c..541ee00 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -44,15 +44,20 @@ def _load_shared_library(lib_base_name: str): _base_path = _lib.parent.resolve() _lib_paths = [_lib.resolve()] + cdll_args = dict() # type: ignore # Add the library directory to the DLL search path on Windows (if needed) if sys.platform == "win32" and sys.version_info >= (3, 8): os.add_dll_directory(str(_base_path)) + if "CUDA_PATH" in 
os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) + cdll_args["winmode"] = 0 # Try to load the shared library, handling potential errors for _lib_path in _lib_paths: if _lib_path.exists(): try: - return ctypes.CDLL(str(_lib_path)) + return ctypes.CDLL(str(_lib_path), **cdll_args) except Exception as e: raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") @@ -67,31 +72,61 @@ _lib_base_name = "llama" # Load the library _lib = _load_shared_library(_lib_base_name) -# C types -LLAMA_FILE_VERSION = c_int(1) -LLAMA_FILE_MAGIC = b"ggjt" -LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" -LLAMA_SESSION_MAGIC = b"ggsn" +# Misc +c_float_p = POINTER(c_float) +c_uint8_p = POINTER(c_uint8) +c_size_t_p = POINTER(c_size_t) + +# llama.h bindings + +# #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' +LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74) +# #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' +LLAMA_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61) +# #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' +LLAMA_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66) +# #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' +LLAMA_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C) +# #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' +LLAMA_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E) + +# #define LLAMA_FILE_VERSION 3 +LLAMA_FILE_VERSION = c_int(3) +LLAMA_FILE_MAGIC = LLAMA_FILE_MAGIC_GGJT +LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML +LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN LLAMA_SESSION_VERSION = c_int(1) +# struct llama_context; llama_context_p = c_void_p +# typedef int llama_token; llama_token = c_int llama_token_p = POINTER(llama_token) +# typedef struct llama_token_data { +# llama_token id; // token id +# float logit; // log-odds of the token +# float p; // probability of the token +# } llama_token_data; class llama_token_data(Structure): _fields_ = [ - ("id", llama_token), # 
token id - ("logit", c_float), # log-odds of the token - ("p", c_float), # probability of the token + ("id", llama_token), + ("logit", c_float), + ("p", c_float), ] llama_token_data_p = POINTER(llama_token_data) +# typedef struct llama_token_data_array { +# llama_token_data * data; +# size_t size; +# bool sorted; +# } llama_token_data_array; class llama_token_data_array(Structure): _fields_ = [ ("data", llama_token_data_p), @@ -102,53 +137,72 @@ class llama_token_data_array(Structure): llama_token_data_array_p = POINTER(llama_token_data_array) +# typedef void (*llama_progress_callback)(float progress, void *ctx); llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) +# struct llama_context_params { +# int n_ctx; // text context +# int n_gpu_layers; // number of layers to store in VRAM +# int seed; // RNG seed, -1 for random + +# bool f16_kv; // use fp16 for KV cache +# bool logits_all; // the llama_eval() call computes all logits, not just the last one +# bool vocab_only; // only load the vocabulary, no weights +# bool use_mmap; // use mmap if possible +# bool use_mlock; // force system to keep model in RAM +# bool embedding; // embedding mode only + + +# // called with a progress value between 0 and 1, pass NULL to disable +# llama_progress_callback progress_callback; +# // context pointer passed to the progress callback +# void * progress_callback_user_data; +# }; class llama_context_params(Structure): _fields_ = [ - ("n_ctx", c_int), # text context - ("n_parts", c_int), # -1 for default - ("seed", c_int), # RNG seed, 0 for random - ("f16_kv", c_bool), # use fp16 for KV cache + ("n_ctx", c_int), + ("n_gpu_layers", c_int), + ("seed", c_int), + ("f16_kv", c_bool), ( "logits_all", c_bool, - ), # the llama_eval() call computes all logits, not just the last one - ("vocab_only", c_bool), # only load the vocabulary, no weights - ("use_mmap", c_bool), # use mmap if possible - ("use_mlock", c_bool), # force system to keep model in RAM - ("embedding", 
c_bool), # embedding mode only - # called with a progress value between 0 and 1, pass NULL to disable + ), + ("vocab_only", c_bool), + ("use_mmap", c_bool), + ("use_mlock", c_bool), + ("embedding", c_bool), ("progress_callback", llama_progress_callback), - # context pointer passed to the progress callback ("progress_callback_user_data", c_void_p), ] llama_context_params_p = POINTER(llama_context_params) +# enum llama_ftype { +# LLAMA_FTYPE_ALL_F32 = 0, +# LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 +# // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed +# // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed +# LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors +# }; LLAMA_FTYPE_ALL_F32 = c_int(0) -LLAMA_FTYPE_MOSTLY_F16 = c_int(1) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int( - 4 -) # tok_embeddings.weight and output.weight are F16 -LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors -# LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) # except 1d tensors - -# Misc -c_float_p = POINTER(c_float) -c_uint8_p = POINTER(c_uint8) -c_size_t_p = POINTER(c_size_t) - -# Functions +LLAMA_FTYPE_MOSTLY_F16 = c_int(1) +LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) +LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4) +LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) +LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) +LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) +# LLAMA_API struct llama_context_params 
llama_context_default_params(); def llama_context_default_params() -> llama_context_params: return _lib.llama_context_default_params() @@ -157,6 +211,7 @@ _lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params +# LLAMA_API bool llama_mmap_supported(); def llama_mmap_supported() -> bool: return _lib.llama_mmap_supported() @@ -165,6 +220,7 @@ _lib.llama_mmap_supported.argtypes = [] _lib.llama_mmap_supported.restype = c_bool +# LLAMA_API bool llama_mlock_supported(); def llama_mlock_supported() -> bool: return _lib.llama_mlock_supported() @@ -173,9 +229,33 @@ _lib.llama_mlock_supported.argtypes = [] _lib.llama_mlock_supported.restype = c_bool -# Various functions for loading a ggml llama model. -# Allocate (almost) all memory needed for the model. -# Return NULL on failure +# // TODO: not great API - very likely to change +# // Initialize the llama + ggml backend +# // Call once at the start of the program +# LLAMA_API void llama_init_backend(); +def llama_init_backend(): + return _lib.llama_init_backend() + + +_lib.llama_init_backend.argtypes = [] +_lib.llama_init_backend.restype = None + + +# LLAMA_API int64_t llama_time_us(); +def llama_time_us() -> int: + return _lib.llama_time_us() + + +_lib.llama_time_us.argtypes = [] +_lib.llama_time_us.restype = ctypes.c_int64 + + +# // Various functions for loading a ggml llama model. +# // Allocate (almost) all memory needed for the model. 
+# // Return NULL on failure +# LLAMA_API struct llama_context * llama_init_from_file( +# const char * path_model, +# struct llama_context_params params); def llama_init_from_file( path_model: bytes, params: llama_context_params ) -> llama_context_p: @@ -187,8 +267,9 @@ _lib.llama_init_from_file.restype = llama_context_p # Frees all allocated memory +# LLAMA_API void llama_free(struct llama_context * ctx); def llama_free(ctx: llama_context_p): - _lib.llama_free(ctx) + return _lib.llama_free(ctx) _lib.llama_free.argtypes = [llama_context_p] @@ -198,9 +279,14 @@ _lib.llama_free.restype = None # TODO: not great API - very likely to change # Returns 0 on success # nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given +# LLAMA_API int llama_model_quantize( +# const char * fname_inp, +# const char * fname_out, +# enum llama_ftype ftype, +# int nthread); def llama_model_quantize( fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int -) -> c_int: +) -> int: return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread) @@ -214,12 +300,17 @@ _lib.llama_model_quantize.restype = c_int # The model needs to be reloaded before applying a new adapter, otherwise the adapter # will be applied on top of the previous one # Returns 0 on success +# LLAMA_API int llama_apply_lora_from_file( +# struct llama_context * ctx, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); def llama_apply_lora_from_file( ctx: llama_context_p, path_lora: c_char_p, path_base_model: c_char_p, n_threads: c_int, -) -> c_int: +) -> int: return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) @@ -228,7 +319,8 @@ _lib.llama_apply_lora_from_file.restype = c_int # Returns the number of tokens in the KV cache -def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int: +# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); +def 
llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: return _lib.llama_get_kv_cache_token_count(ctx) @@ -237,6 +329,7 @@ _lib.llama_get_kv_cache_token_count.restype = c_int # Sets the current rng seed. +# LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): return _lib.llama_set_rng_seed(ctx, seed) @@ -247,7 +340,8 @@ _lib.llama_set_rng_seed.restype = None # Returns the maximum size in bytes of the state (rng, logits, embedding # and kv_cache) - will often be smaller after compacting tokens -def llama_get_state_size(ctx: llama_context_p) -> c_size_t: +# LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); +def llama_get_state_size(ctx: llama_context_p) -> int: return _lib.llama_get_state_size(ctx) @@ -258,10 +352,11 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. # Returns the number of bytes copied +# LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst); def llama_copy_state_data( - ctx: llama_context_p, dest # type: Array[c_uint8] + ctx: llama_context_p, dst # type: Array[c_uint8] ) -> int: - return _lib.llama_copy_state_data(ctx, dest) + return _lib.llama_copy_state_data(ctx, dst) _lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] @@ -270,6 +365,7 @@ _lib.llama_copy_state_data.restype = c_size_t # Set the state reading from the specified address # Returns the number of bytes read +# LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src); def llama_set_state_data( ctx: llama_context_p, src # type: Array[c_uint8] ) -> int: @@ -281,13 +377,14 @@ _lib.llama_set_state_data.restype = c_size_t # Save/load session file +# LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * 
n_token_count_out); def llama_load_session_file( ctx: llama_context_p, path_session: bytes, tokens_out, # type: Array[llama_token] n_token_capacity: c_size_t, n_token_count_out, # type: _Pointer[c_size_t] -) -> c_size_t: +) -> int: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out ) @@ -303,12 +400,13 @@ _lib.llama_load_session_file.argtypes = [ _lib.llama_load_session_file.restype = c_size_t +# LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); def llama_save_session_file( ctx: llama_context_p, path_session: bytes, tokens, # type: Array[llama_token] n_token_count: c_size_t, -) -> c_size_t: +) -> int: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -325,13 +423,19 @@ _lib.llama_save_session_file.restype = c_size_t # tokens + n_tokens is the provided batch of new tokens to process # n_past is the number of tokens to use from previous eval calls # Returns 0 on success +# LLAMA_API int llama_eval( +# struct llama_context * ctx, +# const llama_token * tokens, +# int n_tokens, +# int n_past, +# int n_threads); def llama_eval( ctx: llama_context_p, tokens, # type: Array[llama_token] n_tokens: c_int, n_past: c_int, n_threads: c_int, -) -> c_int: +) -> int: return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) @@ -344,13 +448,19 @@ _lib.llama_eval.restype = c_int # Returns the number of tokens on success, no more than n_max_tokens # Returns a negative number on failure - the number of tokens that would have been returned # TODO: not sure if correct +# LLAMA_API int llama_tokenize( +# struct llama_context * ctx, +# const char * text, +# llama_token * tokens, +# int n_max_tokens, +# bool add_bos); def llama_tokenize( ctx: llama_context_p, text: bytes, tokens, # type: Array[llama_token] n_max_tokens: c_int, add_bos: c_bool, -) -> c_int: +) -> int: return _lib.llama_tokenize(ctx, text, 
tokens, n_max_tokens, add_bos) @@ -358,7 +468,8 @@ _lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, _lib.llama_tokenize.restype = c_int -def llama_n_vocab(ctx: llama_context_p) -> c_int: +# LLAMA_API int llama_n_vocab(const struct llama_context * ctx); +def llama_n_vocab(ctx: llama_context_p) -> int: return _lib.llama_n_vocab(ctx) @@ -366,7 +477,8 @@ _lib.llama_n_vocab.argtypes = [llama_context_p] _lib.llama_n_vocab.restype = c_int -def llama_n_ctx(ctx: llama_context_p) -> c_int: +# LLAMA_API int llama_n_ctx (const struct llama_context * ctx); +def llama_n_ctx(ctx: llama_context_p) -> int: return _lib.llama_n_ctx(ctx) @@ -374,7 +486,8 @@ _lib.llama_n_ctx.argtypes = [llama_context_p] _lib.llama_n_ctx.restype = c_int -def llama_n_embd(ctx: llama_context_p) -> c_int: +# LLAMA_API int llama_n_embd (const struct llama_context * ctx); +def llama_n_embd(ctx: llama_context_p) -> int: return _lib.llama_n_embd(ctx) @@ -387,6 +500,7 @@ _lib.llama_n_embd.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab +# LLAMA_API float * llama_get_logits(struct llama_context * ctx); def llama_get_logits( ctx: llama_context_p, ): # type: (...) -> Array[float] # type: ignore @@ -399,6 +513,7 @@ _lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input # shape: [n_embd] (1-dimensional) +# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); def llama_get_embeddings( ctx: llama_context_p, ): # type: (...) -> Array[float] # type: ignore @@ -410,6 +525,7 @@ _lib.llama_get_embeddings.restype = c_float_p # Token Id -> String. 
Uses the vocabulary in the provided context +# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: return _lib.llama_token_to_str(ctx, token) @@ -420,7 +536,8 @@ _lib.llama_token_to_str.restype = c_char_p # Special tokens -def llama_token_bos() -> llama_token: +# LLAMA_API llama_token llama_token_bos(); +def llama_token_bos() -> int: return _lib.llama_token_bos() @@ -428,7 +545,8 @@ _lib.llama_token_bos.argtypes = [] _lib.llama_token_bos.restype = llama_token -def llama_token_eos() -> llama_token: +# LLAMA_API llama_token llama_token_eos(); +def llama_token_eos() -> int: return _lib.llama_token_eos() @@ -436,7 +554,8 @@ _lib.llama_token_eos.argtypes = [] _lib.llama_token_eos.restype = llama_token -def llama_token_nl() -> llama_token: +# LLAMA_API llama_token llama_token_nl(); +def llama_token_nl() -> int: return _lib.llama_token_nl() @@ -448,6 +567,7 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. +# LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); def llama_sample_repetition_penalty( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -471,6 +591,7 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. 
+# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -501,6 +622,7 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. +# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_softmax( ctx: llama_context_p, candidates # type: _Pointer[llama_token_data] ): @@ -515,6 +637,7 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep); def llama_sample_top_k( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -534,6 +657,7 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); def llama_sample_top_p( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -553,6 +677,7 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. 
+# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep); def llama_sample_tail_free( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -572,6 +697,7 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. +# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); def llama_sample_typical( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -590,6 +716,7 @@ _lib.llama_sample_typical.argtypes = [ _lib.llama_sample_typical.restype = None +# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); def llama_sample_temperature( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -612,6 +739,7 @@ _lib.llama_sample_temperature.restype = None # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
+# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); def llama_sample_token_mirostat( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] @@ -619,7 +747,7 @@ def llama_sample_token_mirostat( eta: c_float, m: c_int, mu, # type: _Pointer[c_float] -) -> llama_token: +) -> int: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -639,13 +767,14 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); def llama_sample_token_mirostat_v2( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] tau: c_float, eta: c_float, mu, # type: _Pointer[c_float] -) -> llama_token: +) -> int: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -660,10 +789,11 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. 
+# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_token_greedy( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] -) -> llama_token: +) -> int: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -675,10 +805,11 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. +# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_token( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] -) -> llama_token: +) -> int: return _lib.llama_sample_token(ctx, candidates) @@ -692,6 +823,7 @@ _lib.llama_sample_token.restype = llama_token # Performance information +# LLAMA_API void llama_print_timings(struct llama_context * ctx); def llama_print_timings(ctx: llama_context_p): _lib.llama_print_timings(ctx) @@ -700,6 +832,7 @@ _lib.llama_print_timings.argtypes = [llama_context_p] _lib.llama_print_timings.restype = None +# LLAMA_API void llama_reset_timings(struct llama_context * ctx); def llama_reset_timings(ctx: llama_context_p): _lib.llama_reset_timings(ctx) @@ -709,9 +842,19 @@ _lib.llama_reset_timings.restype = None # Print system information +# LLAMA_API const char * llama_print_system_info(void); def llama_print_system_info() -> bytes: return _lib.llama_print_system_info() _lib.llama_print_system_info.argtypes = [] _lib.llama_print_system_info.restype = c_char_p + +################################################################################################### + + +_llama_initialized = False + +if not _llama_initialized: + llama_init_backend() + _llama_initialized = True diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index bfc7342..7729ced 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Dict, 
Union +from typing import List, Optional, Dict from typing_extensions import TypedDict, NotRequired, Literal @@ -22,9 +22,9 @@ class Embedding(TypedDict): class CompletionLogprobs(TypedDict): text_offset: List[int] - token_logprobs: List[float] + token_logprobs: List[Optional[float]] tokens: List[str] - top_logprobs: List[Dict[str, float]] + top_logprobs: List[Optional[Dict[str, float]]] class CompletionChoice(TypedDict): diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 621b73e..fea3612 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,4 +1,5 @@ import json +import logging import multiprocessing from threading import Lock from typing import List, Optional, Union, Iterator, Dict @@ -16,7 +17,16 @@ class Settings(BaseSettings): model: str = Field( description="The path to the model to use for generating completions." ) + model_alias: Optional[str] = Field( + default=None, + description="The alias of the model to use for generating completions.", + ) n_ctx: int = Field(default=2048, ge=1, description="The context size.") + n_gpu_layers: int = Field( + default=0, + ge=0, + description="The number of layers to put on the GPU. The rest will be on the CPU.", + ) n_batch: int = Field( default=512, ge=1, description="The batch size to use per eval." 
) @@ -59,6 +69,7 @@ class Settings(BaseSettings): router = APIRouter() +settings: Optional[Settings] = None llama: Optional[llama_cpp.Llama] = None @@ -80,6 +91,7 @@ def create_app(settings: Optional[Settings] = None): global llama llama = llama_cpp.Llama( model_path=settings.model, + n_gpu_layers=settings.n_gpu_layers, f16_kv=settings.f16_kv, use_mlock=settings.use_mlock, use_mmap=settings.use_mmap, @@ -95,6 +107,12 @@ def create_app(settings: Optional[Settings] = None): if settings.cache: cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) llama.set_cache(cache) + + def set_settings(_settings: Settings): + global settings + settings = _settings + + set_settings(settings) return app @@ -106,6 +124,10 @@ def get_llama(): yield llama +def get_settings(): + yield settings + + model_field = Field(description="The model to use for generating completions.") max_tokens_field = Field( @@ -152,9 +174,23 @@ repeat_penalty_field = Field( + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", ) +presence_penalty_field = Field( + default=0.0, + ge=-2.0, + le=2.0, + description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", +) + +frequency_penalty_field = Field( + default=0.0, + ge=-2.0, + le=2.0, + description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", +) + class CreateCompletionRequest(BaseModel): - prompt: Optional[str] = Field( + prompt: Union[str, List[str]] = Field( default="", description="The prompt to generate completions for." 
) suffix: Optional[str] = Field( @@ -168,20 +204,20 @@ class CreateCompletionRequest(BaseModel): default=False, description="Whether to echo the prompt in the generated text. Useful for chatbots.", ) - stop: Optional[List[str]] = stop_field + stop: Optional[Union[str, List[str]]] = stop_field stream: bool = stream_field logprobs: Optional[int] = Field( default=None, ge=0, description="The number of logprobs to generate. If None, no logprobs are generated.", ) + presence_penalty: Optional[float] = presence_penalty_field + frequency_penalty: Optional[float] = frequency_penalty_field # ignored or currently unsupported model: Optional[str] = model_field n: Optional[int] = 1 logprobs: Optional[int] = Field(None) - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 best_of: Optional[int] = 1 logit_bias: Optional[Dict[str, float]] = Field(None) user: Optional[str] = Field(None) @@ -209,10 +245,13 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) def create_completion( request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama) ): + if isinstance(request.prompt, list): + assert len(request.prompt) <= 1 + request.prompt = request.prompt[0] if len(request.prompt) > 0 else "" + completion_or_chunks = llama( **request.dict( exclude={ - "model", "n", "best_of", "logit_bias", @@ -221,15 +260,22 @@ def create_completion( ) ) if request.stream: + + async def server_sent_events( + chunks: Iterator[llama_cpp.CompletionChunk], + ): + for chunk in chunks: + yield dict(data=json.dumps(chunk)) + chunks: Iterator[llama_cpp.CompletionChunk] = completion_or_chunks # type: ignore - return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks) + return EventSourceResponse(server_sent_events(chunks)) completion: llama_cpp.Completion = completion_or_chunks # type: ignore return completion class CreateEmbeddingRequest(BaseModel): model: Optional[str] = model_field - input: str = Field(description="The 
input to embed.") + input: Union[str, List[str]] = Field(description="The input to embed.") user: Optional[str] class Config: @@ -250,7 +296,7 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): - return llama.create_embedding(**request.dict(exclude={"model", "user"})) + return llama.create_embedding(**request.dict(exclude={"user"})) class ChatCompletionRequestMessage(BaseModel): @@ -269,12 +315,12 @@ class CreateChatCompletionRequest(BaseModel): top_p: float = top_p_field stop: Optional[List[str]] = stop_field stream: bool = stream_field + presence_penalty: Optional[float] = presence_penalty_field + frequency_penalty: Optional[float] = frequency_penalty_field # ignored or currently unsupported model: Optional[str] = model_field n: Optional[int] = 1 - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 logit_bias: Optional[Dict[str, float]] = Field(None) user: Optional[str] = Field(None) @@ -311,7 +357,6 @@ def create_chat_completion( completion_or_chunks = llama.create_chat_completion( **request.dict( exclude={ - "model", "n", "logit_bias", "user", @@ -354,13 +399,16 @@ GetModelResponse = create_model_from_typeddict(ModelList) @router.get("/v1/models", response_model=GetModelResponse) def get_models( + settings: Settings = Depends(get_settings), llama: llama_cpp.Llama = Depends(get_llama), ) -> ModelList: return { "object": "list", "data": [ { - "id": llama.model_path, + "id": settings.model_alias + if settings.model_alias is not None + else llama.model_path, "object": "model", "owned_by": "me", "permissions": [], diff --git a/poetry.lock b/poetry.lock index ad59963..50ae0cb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -463,14 +463,14 @@ socks = ["socksio (>=1.0.0,<2.0.0)"] [[package]] name = "httpx" -version = "0.24.0" +version = "0.24.1" description = "The next generation HTTP client." 
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "httpx-0.24.0-py3-none-any.whl", hash = "sha256:447556b50c1921c351ea54b4fe79d91b724ed2b027462ab9a329465d147d5a4e"}, - {file = "httpx-0.24.0.tar.gz", hash = "sha256:507d676fc3e26110d41df7d35ebd8b3b8585052450f4097401c9be59d928c63e"}, + {file = "httpx-0.24.1-py3-none-any.whl", hash = "sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd"}, + {file = "httpx-0.24.1.tar.gz", hash = "sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"}, ] [package.dependencies] @@ -800,14 +800,14 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.11" +version = "9.1.14" description = "Documentation that simply works" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.11-py3-none-any.whl", hash = "sha256:fbc86d50ec2cf34d40d5c4365780f290ceedde23f1a0704323b34e7f16b0c0dd"}, - {file = "mkdocs_material-9.1.11.tar.gz", hash = "sha256:f5d473eb79d6640a5e668d4b2ab5b9de5e76ae0a0e2d864112df0cfe9016dc1d"}, + {file = "mkdocs_material-9.1.14-py3-none-any.whl", hash = "sha256:b56a9f955ed32d38333715cbbf68ce38f683bf38610c65094fa4ef2db9f08bcd"}, + {file = "mkdocs_material-9.1.14.tar.gz", hash = "sha256:1ae74cc5464ef2f64574d4884512efed7f4db386fb9bc6af20fd427d7a702f49"}, ] [package.dependencies] diff --git a/pyproject.toml b/pyproject.toml index 8aec94c..aacdac0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.48" +version = "0.1.55" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" @@ -24,9 +24,9 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.21.2"} -mkdocs-material = "^9.1.11" +mkdocs-material = "^9.1.14" pytest = "^7.3.1" -httpx = "^0.24.0" +httpx = "^0.24.1" scikit-build = "0.13" [tool.poetry.extras] diff --git a/setup.py b/setup.py 
index f4cbb60..2136d8d 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.48", + version="0.1.55", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", diff --git a/tests/test_llama.py b/tests/test_llama.py index b3426b8..941287d 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -17,7 +17,7 @@ def test_llama(): # @pytest.mark.skip(reason="need to update sample mocking") def test_llama_patch(monkeypatch): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) - n_vocab = int(llama_cpp.llama_n_vocab(llama.ctx)) + n_vocab = llama_cpp.llama_n_vocab(llama.ctx) ## Set up mock function def mock_eval(*args, **kwargs): @@ -107,7 +107,7 @@ def test_llama_pickle(): def test_utf8(monkeypatch): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) - n_vocab = int(llama_cpp.llama_n_vocab(llama.ctx)) + n_vocab = llama_cpp.llama_n_vocab(llama.ctx) ## Set up mock function def mock_eval(*args, **kwargs): diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1b0fd45..66874d4 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1b0fd454650ef4d68a980e3225488b79e6e9af25 +Subproject commit 66874d4fbcc7866377246efbcee938e8cc9c7d76