Merge branch 'main' into setup
This commit is contained in:
commit
447a3d249e
22 changed files with 1095 additions and 254 deletions
2
.gitmodules
vendored
2
.gitmodules
vendored
|
@ -1,3 +1,3 @@
|
|||
[submodule "vendor/llama.cpp"]
|
||||
path = vendor/llama.cpp
|
||||
url = git@github.com:ggerganov/llama.cpp.git
|
||||
url = https://github.com/ggerganov/llama.cpp.git
|
||||
|
|
12
CHANGELOG.md
Normal file
12
CHANGELOG.md
Normal file
|
@ -0,0 +1,12 @@
|
|||
# Changelog
|
||||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
|
||||
- Added first version of the changelog
|
|
@ -28,4 +28,4 @@ else()
|
|||
LIBRARY DESTINATION llama_cpp
|
||||
RUNTIME DESTINATION llama_cpp
|
||||
)
|
||||
endif(UNIX)
|
||||
endif()
|
||||
|
|
24
README.md
24
README.md
|
@ -15,6 +15,8 @@ This package provides:
|
|||
- OpenAI-like API
|
||||
- LangChain compatibility
|
||||
|
||||
Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python).
|
||||
|
||||
## Installation from PyPI (recommended)
|
||||
|
||||
Install from PyPI (requires a c compiler):
|
||||
|
@ -26,6 +28,18 @@ pip install llama-cpp-python
|
|||
The above command will attempt to install the package and build `llama.cpp` from source.
|
||||
This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system.
|
||||
|
||||
If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly:
|
||||
|
||||
```bash
|
||||
pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir
|
||||
```
|
||||
|
||||
Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example:
|
||||
```
|
||||
wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
|
||||
bash Miniforge3-MacOSX-arm64.sh
|
||||
```
|
||||
Otherwise, the installation will build the x86 version of `llama.cpp`, which will be 10x slower on an Apple Silicon (M1) Mac.
|
||||
|
||||
### Installation with OpenBLAS / cuBLAS / CLBlast
|
||||
|
||||
|
@ -35,19 +49,19 @@ Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and ins
|
|||
To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing:
|
||||
|
||||
```bash
|
||||
LLAMA_OPENBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python
|
||||
CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
|
||||
```
|
||||
|
||||
To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing:
|
||||
|
||||
```bash
|
||||
LLAMA_CUBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python
|
||||
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
|
||||
```
|
||||
|
||||
To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing:
|
||||
|
||||
```bash
|
||||
LLAMA_CLBLAST=1 FORCE_CMAKE=1 pip install llama-cpp-python
|
||||
CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python
|
||||
```
|
||||
|
||||
|
||||
|
@ -102,7 +116,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the
|
|||
A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:
|
||||
|
||||
```bash
|
||||
docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
|
||||
docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
|
||||
```
|
||||
|
||||
## Low-level API
|
||||
|
@ -120,7 +134,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize
|
|||
>>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params)
|
||||
>>> max_tokens = params.n_ctx
|
||||
# use ctypes arrays for array params
|
||||
>>> tokens = (llama_cppp.llama_token * int(max_tokens))()
|
||||
>>> tokens = (llama_cpp.llama_token * int(max_tokens))()
|
||||
>>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True))
|
||||
>>> llama_cpp.llama_free(ctx)
|
||||
```
|
||||
|
|
51
docker/Dockerfile
Normal file
51
docker/Dockerfile
Normal file
|
@ -0,0 +1,51 @@
|
|||
# Combined OpenBLAS / CuBLAS image for the llama-cpp-python server.
# The base image decides which backend is built:
#   - python:3-slim-bullseye (default) -> OpenBLAS
#   - anything else (e.g. an nvidia/cuda image) -> CuBLAS

# Define the image argument and provide a default value
ARG IMAGE=python:3-slim-bullseye

# Use the image as specified
FROM ${IMAGE}

# Re-declare the ARG after FROM (an ARG declared before FROM is not visible
# inside the build stage unless it is declared again)
ARG IMAGE

# ARG values exist only at build time. start_server.sh compares $IMAGE at
# container start to pick its flags, so persist it in the runtime environment.
ENV IMAGE=${IMAGE}

# Update and upgrade the existing packages
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
    python3 \
    python3-pip \
    ninja-build \
    build-essential

RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette

# Perform the conditional installations based on the image
RUN echo "Image: ${IMAGE}" && \
    if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \
        echo "OpenBLAS install:" && \
        apt-get install -y --no-install-recommends libopenblas-dev && \
        LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \
    else \
        echo "CuBLAS install:" && \
        LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \
    fi

# Clean up apt cache
RUN rm -rf /var/lib/apt/lists/*

# Set a working directory for better clarity
WORKDIR /app

# Copy files to the app directory
RUN echo "Installing model...this can take some time..."
COPY ./model.bin /app/model.bin
COPY ./start_server.sh /app/start_server.sh

# Make the server start script executable
RUN chmod +x /app/start_server.sh

# Set environment variable for the host
ENV HOST=0.0.0.0

# Expose a port for the server
EXPOSE 8000

# Run the server start script
CMD ["/bin/sh", "/app/start_server.sh"]
|
|
@ -1,4 +1,5 @@
|
|||
FROM nvidia/cuda:12.1.1-devel-ubuntu20.04
|
||||
ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
|
||||
FROM ${CUDA_IMAGE}
|
||||
|
||||
# We need to set the host to 0.0.0.0 to allow outside access
|
||||
ENV HOST 0.0.0.0
|
||||
|
@ -12,4 +13,4 @@ RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fa
|
|||
RUN LLAMA_CUBLAS=1 python3 setup.py develop
|
||||
|
||||
# Run the server
|
||||
CMD python3 -m llama_cpp.server
|
||||
CMD python3 -m llama_cpp.server
|
46
docker/README.md
Normal file
46
docker/README.md
Normal file
|
@ -0,0 +1,46 @@
|
|||
# Dockerfiles for building the llama-cpp-python server
|
||||
- `Dockerfile.openblas_simple` - a simple Dockerfile for non-GPU OpenBLAS
|
||||
- `Dockerfile.cuda_simple` - a simple Dockerfile for CUDA accelerated CuBLAS
|
||||
- `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke](https://huggingface.co/TheBloke)
|
||||
- `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`
|
||||
|
||||
# Get model from Hugging Face
|
||||
`python3 ./hug_model.py`
|
||||
|
||||
You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
|
||||
```
|
||||
docker $ ls -lh *.bin
|
||||
-rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>.q5_1.bin
|
||||
lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> <downloaded-model-file>.q5_1.bin
|
||||
```
|
||||
**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least
|
||||
**TWICE** as much disk space as the size of the model:
|
||||
|
||||
| Model | Quantized size |
|
||||
|------:|----------------:|
|
||||
| 7B | 5 GB |
|
||||
| 13B | 10 GB |
|
||||
| 30B | 25 GB |
|
||||
| 65B | 50 GB |
|
||||
|
||||
**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
|
||||
|
||||
# Install Docker Server
|
||||
|
||||
**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
|
||||
|
||||
[Install Docker Engine](https://docs.docker.com/engine/install)
|
||||
|
||||
# Use OpenBLAS
|
||||
Use if you don't have an NVIDIA GPU. Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS:
|
||||
## Build:
|
||||
`docker build -t openblas .`
|
||||
## Run:
|
||||
`docker run --cap-add SYS_RESOURCE -t openblas`
|
||||
|
||||
# Use CuBLAS
|
||||
Requires a NVidia GPU with sufficient VRAM (approximately as much as the size above) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
|
||||
## Build:
|
||||
`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .`
|
||||
## Run:
|
||||
`docker run --cap-add SYS_RESOURCE -t cublas`
|
116
docker/hug_model.py
Normal file
116
docker/hug_model.py
Normal file
|
@ -0,0 +1,116 @@
|
|||
import requests
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
|
||||
def make_request(url, params=None, timeout=30):
    """Issue a GET request and return the decoded JSON body.

    Args:
        url: The URL to fetch.
        params: Optional mapping of query-string parameters.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The parsed JSON payload on an HTTP 200 response, otherwise ``None``
        (non-200 status, timeout, or any other transport error).
    """
    print(f"Making request to {url}...")
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Keep the "None means failure" contract instead of crashing the caller.
        print(f"Request failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code}")
        return None

    return json.loads(response.text)
|
||||
|
||||
def check_magic_and_version(filename):
    """Read, print and return the magic number and version of a model file.

    The first six bytes of the file are interpreted as a little-endian
    unsigned 32-bit integer (magic) followed by a little-endian unsigned
    16-bit integer (version).

    Args:
        filename: Path of the file to inspect.

    Returns:
        A ``(magic, version)`` tuple of integers.
    """
    header_format = '<I H'  # uint32 magic, uint16 version, little-endian

    with open(filename, 'rb') as model_file:
        header = model_file.read(struct.calcsize(header_format))

    fields = struct.unpack(header_format, header)
    magic = fields[0]
    version = fields[1]

    print(f"magic: 0x{magic:08x}, version: 0x{version:04x}, file: {filename}")

    return magic, version
|
||||
|
||||
def download_file(url, destination, timeout=30):
    """Stream ``url`` into ``destination`` and point ``model.bin`` at it.

    A dot is printed for roughly every 10 MB received. After a successful
    download, a ``model.bin`` symbolic link to ``destination`` is (re)created
    in the current working directory.

    Args:
        url: The URL to download.
        destination: Path of the file to write.
        timeout: Seconds to wait on the connection / each socket read before
            giving up (this is not a limit on the total download time).
    """
    print(f"Downloading {url} to {destination}...")
    response = requests.get(url, stream=True, timeout=timeout)
    if response.status_code != 200:
        print(f"Download failed with status code {response.status_code}")
        return

    with open(destination, 'wb') as f:
        total_downloaded = 0
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                total_downloaded += len(chunk)
                if total_downloaded >= 10485760:  # 10 MB
                    print('.', end='', flush=True)
                    total_downloaded = 0
    print("\nDownload complete.")

    # Creating a symbolic link from destination to "model.bin".
    # lexists() does not follow the link, so a dangling "model.bin" (its old
    # target was deleted) is also removed; isfile() would report False for it
    # and os.symlink() would then fail with FileExistsError.
    if os.path.lexists("model.bin"):
        os.remove("model.bin")  # remove the existing link if any
    os.symlink(destination, "model.bin")
|
||||
|
||||
def get_user_choice(model_list):
    """Show a numbered menu of models and return the entry the user selects.

    Args:
        model_list: Sequence of ``(model_id, rfilename)`` tuples.

    Returns:
        The selected ``(model_id, rfilename)`` tuple, or ``None`` when the
        list is empty, the input is not an integer, or the number is out of
        range.
    """
    if not model_list:
        # Nothing to choose from: do not block on a prompt with no valid answer.
        print("No models available to choose from.")
        return None

    # Print the enumerated list (1-based, matching what the user types)
    print("\n")
    for number, (model_id, rfilename) in enumerate(model_list, start=1):
        print(f"{number}: Model ID: {model_id}, RFilename: {rfilename}")

    # Get user's choice
    choice = input("Choose a model to download by entering the corresponding number: ")
    try:
        index = int(choice) - 1
    except ValueError:
        print("Invalid input. Please enter a number corresponding to a model.")
        return None

    # The explicit bounds check makes an IndexError impossible below.
    if 0 <= index < len(model_list):
        return model_list[index]

    print("Invalid choice.")
    return None
|
||||
|
||||
import argparse
|
||||
|
||||
def main():
    """Interactively pick, download and sanity-check a q5_1 model file.

    Queries the Hugging Face model API for TheBloke's llama-tagged models,
    collects every repository file whose name contains ``q5_1``, lets the
    user choose one, downloads it, and prints a warning when the version
    read from the file header differs from the ``-v/--version`` argument.
    """
    # Command-line interface
    arg_parser = argparse.ArgumentParser(description='Process the model version.')
    arg_parser.add_argument('-v', '--version', type=int, default=0x0003,
                            help='an integer for the version to be used')
    cli_args = arg_parser.parse_args()

    # Query parameters for the model listing
    query = {
        "author": "TheBloke",  # Filter by author
        "tags": "llama"
    }

    listing = make_request('https://huggingface.co/api/models', params=query)
    if listing is None:
        return

    # Gather (model_id, rfilename) pairs for every q5_1 file of every model
    candidates = []
    for entry in listing:
        repo_id = entry['id']
        details = make_request(f'https://huggingface.co/api/models/{repo_id}')
        if details is None:
            continue

        names = (sibling.get('rfilename') for sibling in details.get('siblings', []))
        candidates.extend((repo_id, name) for name in names if name and 'q5_1' in name)

    selection = get_user_choice(candidates)
    if selection is None:
        return

    repo_id, filename = selection
    download_file(f"https://huggingface.co/{repo_id}/resolve/main/{filename}", filename)

    found_version = check_magic_and_version(filename)[1]
    if found_version != cli_args.version:
        print(f"Warning: Expected version {cli_args.version}, but found different version in the file.")


if __name__ == '__main__':
    main()
|
11
docker/start_server.sh
Executable file
11
docker/start_server.sh
Executable file
|
@ -0,0 +1,11 @@
|
|||
#!/bin/sh

# Start the llama_cpp server for the model copied to /app/model.bin.
# $IMAGE is read from the environment to choose between the CPU and GPU flags.

# For mmap support
ulimit -l unlimited

# exec replaces this shell with the Python process so the server itself
# receives container stop signals instead of the wrapping /bin/sh.
if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
    exec python3 -B -m llama_cpp.server --model /app/model.bin
else
    # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM
    exec python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000
fi
|
|
@ -112,8 +112,12 @@ python3 setup.py develop
|
|||
show_root_heading: true
|
||||
|
||||
::: llama_cpp.LlamaCache
|
||||
options:
|
||||
show_root_heading: true
|
||||
|
||||
::: llama_cpp.LlamaState
|
||||
options:
|
||||
show_root_heading: true
|
||||
|
||||
::: llama_cpp.llama_cpp
|
||||
options:
|
||||
|
|
|
@ -368,10 +368,10 @@ n_keep = {self.params.n_keep}
|
|||
id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu))
|
||||
else:
|
||||
# Temperature sampling
|
||||
llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k)
|
||||
llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z))
|
||||
llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p))
|
||||
llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p))
|
||||
llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1))
|
||||
llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z), min_keep=llama_cpp.c_size_t(1))
|
||||
llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p), min_keep=llama_cpp.c_size_t(1))
|
||||
llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p), min_keep=llama_cpp.c_size_t(1))
|
||||
llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
|
||||
id = llama_cpp.llama_sample_token(self.ctx, candidates_p)
|
||||
# print("`{}`".format(candidates_p.size))
|
||||
|
@ -382,12 +382,15 @@ n_keep = {self.params.n_keep}
|
|||
# replace end of text token with newline token when in interactive mode
|
||||
if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct):
|
||||
id = self.llama_token_newline[0]
|
||||
self.embd.append(id)
|
||||
if (self.use_antiprompt()):
|
||||
# tokenize and inject first reverse prompt
|
||||
self.embd_inp += self.first_antiprompt[0]
|
||||
|
||||
# add it to the context
|
||||
self.embd.append(id)
|
||||
for id in self.first_antiprompt[0]:
|
||||
self.embd.append(id)
|
||||
else:
|
||||
# add it to the context
|
||||
self.embd.append(id)
|
||||
|
||||
# echo this to console
|
||||
self.output_echo = True
|
||||
|
@ -493,7 +496,7 @@ n_keep = {self.params.n_keep}
|
|||
# Contains multi-byte UTF8
|
||||
for num, pattern in [(2, 192), (3, 224), (4, 240)]:
|
||||
# Bitwise AND check
|
||||
if pattern & int.from_bytes(cur_char) == pattern:
|
||||
if pattern & int.from_bytes(cur_char, 'little') == pattern:
|
||||
self.multibyte_fix = [cur_char] + ([None] * (num-1))
|
||||
|
||||
# Stop incomplete bytes from passing
|
||||
|
|
89
examples/notebooks/Guidance.ipynb
Normal file
89
examples/notebooks/Guidance.ipynb
Normal file
File diff suppressed because one or more lines are too long
|
@ -4,7 +4,17 @@ import uuid
|
|||
import time
|
||||
import math
|
||||
import multiprocessing
|
||||
from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple
|
||||
from typing import (
|
||||
List,
|
||||
Optional,
|
||||
Union,
|
||||
Generator,
|
||||
Sequence,
|
||||
Iterator,
|
||||
Deque,
|
||||
Tuple,
|
||||
Callable,
|
||||
)
|
||||
from collections import deque, OrderedDict
|
||||
|
||||
from . import llama_cpp
|
||||
|
@ -15,9 +25,7 @@ class LlamaCache:
|
|||
"""Cache for a llama.cpp model."""
|
||||
|
||||
def __init__(self, capacity_bytes: int = (2 << 30)):
|
||||
self.cache_state: OrderedDict[
|
||||
Tuple[llama_cpp.llama_token, ...], "LlamaState"
|
||||
] = OrderedDict()
|
||||
self.cache_state: OrderedDict[Tuple[int, ...], "LlamaState"] = OrderedDict()
|
||||
self.capacity_bytes = capacity_bytes
|
||||
|
||||
@property
|
||||
|
@ -26,8 +34,8 @@ class LlamaCache:
|
|||
|
||||
def _find_longest_prefix_key(
|
||||
self,
|
||||
key: Tuple[llama_cpp.llama_token, ...],
|
||||
) -> Optional[Tuple[llama_cpp.llama_token, ...]]:
|
||||
key: Tuple[int, ...],
|
||||
) -> Optional[Tuple[int, ...]]:
|
||||
min_len = 0
|
||||
min_key = None
|
||||
keys = (
|
||||
|
@ -39,7 +47,7 @@ class LlamaCache:
|
|||
min_key = k
|
||||
return min_key
|
||||
|
||||
def __getitem__(self, key: Sequence[llama_cpp.llama_token]) -> "LlamaState":
|
||||
def __getitem__(self, key: Sequence[int]) -> "LlamaState":
|
||||
key = tuple(key)
|
||||
_key = self._find_longest_prefix_key(key)
|
||||
if _key is None:
|
||||
|
@ -48,10 +56,10 @@ class LlamaCache:
|
|||
self.cache_state.move_to_end(_key)
|
||||
return value
|
||||
|
||||
def __contains__(self, key: Sequence[llama_cpp.llama_token]) -> bool:
|
||||
def __contains__(self, key: Sequence[int]) -> bool:
|
||||
return self._find_longest_prefix_key(tuple(key)) is not None
|
||||
|
||||
def __setitem__(self, key: Sequence[llama_cpp.llama_token], value: "LlamaState"):
|
||||
def __setitem__(self, key: Sequence[int], value: "LlamaState"):
|
||||
key = tuple(key)
|
||||
if key in self.cache_state:
|
||||
del self.cache_state[key]
|
||||
|
@ -63,7 +71,7 @@ class LlamaCache:
|
|||
class LlamaState:
|
||||
def __init__(
|
||||
self,
|
||||
eval_tokens: Deque[llama_cpp.llama_token],
|
||||
eval_tokens: Deque[int],
|
||||
eval_logits: Deque[List[float]],
|
||||
llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8]
|
||||
llama_state_size: int,
|
||||
|
@ -74,6 +82,24 @@ class LlamaState:
|
|||
self.llama_state_size = llama_state_size
|
||||
|
||||
|
||||
# A logits processor takes (input_ids, scores) and returns a list of scores.
LogitsProcessor = Callable[[List[int], List[float]], List[float]]


class LogitsProcessorList(List[LogitsProcessor]):
    """A list of logits processors that can itself be called as one."""

    def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]:
        """Run every processor in list order, feeding each the previous result.

        Args:
            input_ids: Passed unchanged to every processor.
            scores: Scores handed to the first processor.

        Returns:
            The output of the last processor, or ``scores`` itself when the
            list is empty.
        """
        current = scores
        for apply_processor in self:
            current = apply_processor(input_ids, current)
        return current
|
||||
|
||||
|
||||
# A stopping criterion takes (input_ids, logits) and returns a bool.
StoppingCriteria = Callable[[List[int], List[float]], bool]


class StoppingCriteriaList(List[StoppingCriteria]):
    """A list of stopping criteria that can itself be called as one."""

    def __call__(self, input_ids: List[int], logits: List[float]) -> bool:
        """Report whether any criterion in the list asks to stop.

        Every criterion is evaluated (no short-circuiting), in list order.

        Args:
            input_ids: Passed unchanged to every criterion.
            logits: Passed unchanged to every criterion.

        Returns:
            ``True`` if at least one criterion returned a truthy value,
            ``False`` otherwise (including for an empty list).
        """
        should_stop = False
        for criterion in self:
            if criterion(input_ids, logits):
                should_stop = True
        return should_stop
|
||||
|
||||
|
||||
class Llama:
|
||||
"""High-level Python wrapper for a llama.cpp model."""
|
||||
|
||||
|
@ -83,6 +109,7 @@ class Llama:
|
|||
# NOTE: These parameters are likely to change in the future.
|
||||
n_ctx: int = 512,
|
||||
n_parts: int = -1,
|
||||
n_gpu_layers: int = 0,
|
||||
seed: int = 1337,
|
||||
f16_kv: bool = True,
|
||||
logits_all: bool = False,
|
||||
|
@ -128,7 +155,7 @@ class Llama:
|
|||
|
||||
self.params = llama_cpp.llama_context_default_params()
|
||||
self.params.n_ctx = n_ctx
|
||||
self.params.n_parts = n_parts
|
||||
self.params.n_gpu_layers = n_gpu_layers
|
||||
self.params.seed = seed
|
||||
self.params.f16_kv = f16_kv
|
||||
self.params.logits_all = logits_all
|
||||
|
@ -139,7 +166,7 @@ class Llama:
|
|||
|
||||
self.last_n_tokens_size = last_n_tokens_size
|
||||
self.n_batch = min(n_ctx, n_batch)
|
||||
self.eval_tokens: Deque[llama_cpp.llama_token] = deque(maxlen=n_ctx)
|
||||
self.eval_tokens: Deque[int] = deque(maxlen=n_ctx)
|
||||
self.eval_logits: Deque[List[float]] = deque(maxlen=n_ctx if logits_all else 1)
|
||||
|
||||
self.cache: Optional[LlamaCache] = None
|
||||
|
@ -149,6 +176,10 @@ class Llama:
|
|||
self.lora_base = lora_base
|
||||
self.lora_path = lora_path
|
||||
|
||||
### DEPRECATED ###
|
||||
self.n_parts = n_parts
|
||||
### DEPRECATED ###
|
||||
|
||||
if not os.path.exists(model_path):
|
||||
raise ValueError(f"Model path does not exist: {model_path}")
|
||||
|
||||
|
@ -174,7 +205,30 @@ class Llama:
|
|||
if self.verbose:
|
||||
print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
|
||||
|
||||
def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]:
|
||||
self._n_vocab = self.n_vocab()
|
||||
self._n_ctx = self.n_ctx()
|
||||
data = (llama_cpp.llama_token_data * self._n_vocab)(
|
||||
*[
|
||||
llama_cpp.llama_token_data(
|
||||
id=llama_cpp.llama_token(i),
|
||||
logit=llama_cpp.c_float(0.0),
|
||||
p=llama_cpp.c_float(0.0),
|
||||
)
|
||||
for i in range(self._n_vocab)
|
||||
]
|
||||
)
|
||||
size = llama_cpp.c_size_t(self._n_vocab)
|
||||
sorted = False
|
||||
candidates = llama_cpp.llama_token_data_array(
|
||||
data=data,
|
||||
size=size,
|
||||
sorted=sorted,
|
||||
)
|
||||
self._candidates = candidates
|
||||
self._token_nl = Llama.token_nl()
|
||||
self._token_eos = Llama.token_eos()
|
||||
|
||||
def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
|
||||
"""Tokenize a string.
|
||||
|
||||
Args:
|
||||
|
@ -187,20 +241,32 @@ class Llama:
|
|||
A list of tokens.
|
||||
"""
|
||||
assert self.ctx is not None
|
||||
n_ctx = llama_cpp.llama_n_ctx(self.ctx)
|
||||
tokens = (llama_cpp.llama_token * int(n_ctx))()
|
||||
n_ctx = self._n_ctx
|
||||
tokens = (llama_cpp.llama_token * n_ctx)()
|
||||
n_tokens = llama_cpp.llama_tokenize(
|
||||
self.ctx,
|
||||
text,
|
||||
tokens,
|
||||
n_ctx,
|
||||
llama_cpp.c_bool(True),
|
||||
llama_cpp.c_int(n_ctx),
|
||||
llama_cpp.c_bool(add_bos),
|
||||
)
|
||||
if int(n_tokens) < 0:
|
||||
raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}')
|
||||
if n_tokens < 0:
|
||||
n_tokens = abs(n_tokens)
|
||||
tokens = (llama_cpp.llama_token * n_tokens)()
|
||||
n_tokens = llama_cpp.llama_tokenize(
|
||||
self.ctx,
|
||||
text,
|
||||
tokens,
|
||||
llama_cpp.c_int(n_tokens),
|
||||
llama_cpp.c_bool(add_bos),
|
||||
)
|
||||
if n_tokens < 0:
|
||||
raise RuntimeError(
|
||||
f'Failed to tokenize: text="{text}" n_tokens={n_tokens}'
|
||||
)
|
||||
return list(tokens[:n_tokens])
|
||||
|
||||
def detokenize(self, tokens: List[llama_cpp.llama_token]) -> bytes:
|
||||
def detokenize(self, tokens: List[int]) -> bytes:
|
||||
"""Detokenize a list of tokens.
|
||||
|
||||
Args:
|
||||
|
@ -212,7 +278,9 @@ class Llama:
|
|||
assert self.ctx is not None
|
||||
output = b""
|
||||
for token in tokens:
|
||||
output += llama_cpp.llama_token_to_str(self.ctx, token)
|
||||
output += llama_cpp.llama_token_to_str(
|
||||
self.ctx, llama_cpp.llama_token(token)
|
||||
)
|
||||
return output
|
||||
|
||||
def set_cache(self, cache: Optional[LlamaCache]):
|
||||
|
@ -228,14 +296,14 @@ class Llama:
|
|||
self.eval_tokens.clear()
|
||||
self.eval_logits.clear()
|
||||
|
||||
def eval(self, tokens: Sequence[llama_cpp.llama_token]):
|
||||
def eval(self, tokens: Sequence[int]):
|
||||
"""Evaluate a list of tokens.
|
||||
|
||||
Args:
|
||||
tokens: The list of tokens to evaluate.
|
||||
"""
|
||||
assert self.ctx is not None
|
||||
n_ctx = int(llama_cpp.llama_n_ctx(self.ctx))
|
||||
n_ctx = self._n_ctx
|
||||
for i in range(0, len(tokens), self.n_batch):
|
||||
batch = tokens[i : min(len(tokens), i + self.n_batch)]
|
||||
n_past = min(n_ctx - len(batch), len(self.eval_tokens))
|
||||
|
@ -247,18 +315,16 @@ class Llama:
|
|||
n_past=llama_cpp.c_int(n_past),
|
||||
n_threads=llama_cpp.c_int(self.n_threads),
|
||||
)
|
||||
if int(return_code) != 0:
|
||||
if return_code != 0:
|
||||
raise RuntimeError(f"llama_eval returned {return_code}")
|
||||
# Save tokens
|
||||
self.eval_tokens.extend(batch)
|
||||
# Save logits
|
||||
rows = n_tokens if self.params.logits_all else 1
|
||||
n_vocab = llama_cpp.llama_n_vocab(self.ctx)
|
||||
cols = int(n_vocab)
|
||||
n_vocab = self._n_vocab
|
||||
cols = n_vocab
|
||||
logits_view = llama_cpp.llama_get_logits(self.ctx)
|
||||
logits: List[List[float]] = [
|
||||
[logits_view[i * cols + j] for j in range(cols)] for i in range(rows)
|
||||
]
|
||||
logits = [logits_view[i * cols : (i + 1) * cols] for i in range(rows)]
|
||||
self.eval_logits.extend(logits)
|
||||
|
||||
def _sample(
|
||||
|
@ -275,28 +341,33 @@ class Llama:
|
|||
mirostat_mode: llama_cpp.c_int,
|
||||
mirostat_tau: llama_cpp.c_float,
|
||||
mirostat_eta: llama_cpp.c_float,
|
||||
penalize_nl: bool = True,
|
||||
logits_processor: Optional[LogitsProcessorList] = None,
|
||||
):
|
||||
assert self.ctx is not None
|
||||
assert len(self.eval_logits) > 0
|
||||
n_vocab = int(llama_cpp.llama_n_vocab(self.ctx))
|
||||
n_vocab = self._n_vocab
|
||||
n_ctx = self._n_ctx
|
||||
top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k
|
||||
last_n_tokens_size = (
|
||||
llama_cpp.c_int(n_ctx)
|
||||
if last_n_tokens_size.value < 0
|
||||
else last_n_tokens_size
|
||||
)
|
||||
logits = self.eval_logits[-1]
|
||||
data = (llama_cpp.llama_token_data * n_vocab)(
|
||||
*[
|
||||
llama_cpp.llama_token_data(
|
||||
id=llama_cpp.llama_token(i),
|
||||
logit=logits[i],
|
||||
p=llama_cpp.c_float(0.0),
|
||||
)
|
||||
for i in range(n_vocab)
|
||||
]
|
||||
)
|
||||
size = llama_cpp.c_size_t(n_vocab)
|
||||
sorted = False
|
||||
candidates = llama_cpp.llama_token_data_array(
|
||||
data=data,
|
||||
size=size,
|
||||
sorted=sorted,
|
||||
)
|
||||
|
||||
if logits_processor is not None:
|
||||
logits = logits_processor(list(self.eval_tokens), logits)
|
||||
self.eval_logits[-1] = logits
|
||||
|
||||
nl_logit = logits[self._token_nl]
|
||||
candidates = self._candidates
|
||||
for i, logit in enumerate(logits):
|
||||
candidates.data[i].id = llama_cpp.llama_token(i)
|
||||
candidates.data[i].logit = llama_cpp.c_float(logit)
|
||||
candidates.data[i].p = llama_cpp.c_float(0.0)
|
||||
candidates.sorted = llama_cpp.c_bool(False)
|
||||
candidates.size = llama_cpp.c_size_t(n_vocab)
|
||||
llama_cpp.llama_sample_repetition_penalty(
|
||||
ctx=self.ctx,
|
||||
last_tokens_data=last_n_tokens_data,
|
||||
|
@ -312,6 +383,8 @@ class Llama:
|
|||
alpha_frequency=frequency_penalty,
|
||||
alpha_presence=presence_penalty,
|
||||
)
|
||||
if not penalize_nl:
|
||||
candidates.data[self._token_nl].logit = llama_cpp.c_float(nl_logit)
|
||||
if temp.value == 0.0:
|
||||
return llama_cpp.llama_sample_token_greedy(
|
||||
ctx=self.ctx,
|
||||
|
@ -394,6 +467,8 @@ class Llama:
|
|||
mirostat_mode: int = 0,
|
||||
mirostat_eta: float = 0.1,
|
||||
mirostat_tau: float = 5.0,
|
||||
penalize_nl: bool = True,
|
||||
logits_processor: Optional[LogitsProcessorList] = None,
|
||||
):
|
||||
"""Sample a token from the model.
|
||||
|
||||
|
@ -425,24 +500,27 @@ class Llama:
|
|||
mirostat_mode=llama_cpp.c_int(mirostat_mode),
|
||||
mirostat_tau=llama_cpp.c_float(mirostat_tau),
|
||||
mirostat_eta=llama_cpp.c_float(mirostat_eta),
|
||||
penalize_nl=penalize_nl,
|
||||
logits_processor=logits_processor,
|
||||
)
|
||||
|
||||
def generate(
|
||||
self,
|
||||
tokens: Sequence[llama_cpp.llama_token],
|
||||
top_k: int,
|
||||
top_p: float,
|
||||
temp: float,
|
||||
repeat_penalty: float,
|
||||
tokens: Sequence[int],
|
||||
top_k: int = 40,
|
||||
top_p: float = 0.95,
|
||||
temp: float = 0.80,
|
||||
repeat_penalty: float = 1.1,
|
||||
reset: bool = True,
|
||||
frequency_penalty: float = 0.0,
|
||||
presence_penalty: float = 0.0,
|
||||
tfs_z: float = 1.0,
|
||||
mirostat_mode: int = 0,
|
||||
mirostat_tau: float = 5.0,
|
||||
mirostat_eta: float = 0.1,
|
||||
) -> Generator[
|
||||
llama_cpp.llama_token, Optional[Sequence[llama_cpp.llama_token]], None
|
||||
]:
|
||||
logits_processor: Optional[LogitsProcessorList] = None,
|
||||
stopping_criteria: Optional[StoppingCriteriaList] = None,
|
||||
) -> Generator[int, Optional[Sequence[int]], None]:
|
||||
"""Create a generator of tokens from a prompt.
|
||||
|
||||
Examples:
|
||||
|
@ -495,16 +573,24 @@ class Llama:
|
|||
repeat_penalty=repeat_penalty,
|
||||
frequency_penalty=frequency_penalty,
|
||||
presence_penalty=presence_penalty,
|
||||
tfs_z=tfs_z,
|
||||
mirostat_mode=mirostat_mode,
|
||||
mirostat_tau=mirostat_tau,
|
||||
mirostat_eta=mirostat_eta,
|
||||
logits_processor=logits_processor,
|
||||
)
|
||||
if stopping_criteria is not None and stopping_criteria(
|
||||
list(self.eval_tokens), self.eval_logits[-1]
|
||||
):
|
||||
return
|
||||
tokens_or_none = yield token
|
||||
tokens = [token]
|
||||
if tokens_or_none is not None:
|
||||
tokens.extend(tokens_or_none)
|
||||
|
||||
def create_embedding(self, input: str) -> Embedding:
|
||||
def create_embedding(
|
||||
self, input: Union[str, List[str]], model: Optional[str] = None
|
||||
) -> Embedding:
|
||||
"""Embed a string.
|
||||
|
||||
Args:
|
||||
|
@ -514,6 +600,7 @@ class Llama:
|
|||
An embedding object.
|
||||
"""
|
||||
assert self.ctx is not None
|
||||
model_name: str = model if model is not None else self.model_path
|
||||
|
||||
if self.params.embedding == False:
|
||||
raise RuntimeError(
|
||||
|
@ -523,30 +610,40 @@ class Llama:
|
|||
if self.verbose:
|
||||
llama_cpp.llama_reset_timings(self.ctx)
|
||||
|
||||
tokens = self.tokenize(input.encode("utf-8"))
|
||||
self.reset()
|
||||
self.eval(tokens)
|
||||
n_tokens = len(tokens)
|
||||
embedding = llama_cpp.llama_get_embeddings(self.ctx)[
|
||||
: llama_cpp.llama_n_embd(self.ctx)
|
||||
]
|
||||
if isinstance(input, str):
|
||||
inputs = [input]
|
||||
else:
|
||||
inputs = input
|
||||
|
||||
data: List[EmbeddingData] = []
|
||||
total_tokens = 0
|
||||
for index, input in enumerate(inputs):
|
||||
tokens = self.tokenize(input.encode("utf-8"))
|
||||
self.reset()
|
||||
self.eval(tokens)
|
||||
n_tokens = len(tokens)
|
||||
total_tokens += n_tokens
|
||||
embedding = llama_cpp.llama_get_embeddings(self.ctx)[
|
||||
: llama_cpp.llama_n_embd(self.ctx)
|
||||
]
|
||||
|
||||
data.append(
|
||||
{
|
||||
"object": "embedding",
|
||||
"embedding": embedding,
|
||||
"index": index,
|
||||
}
|
||||
)
|
||||
if self.verbose:
|
||||
llama_cpp.llama_print_timings(self.ctx)
|
||||
|
||||
return {
|
||||
"object": "list",
|
||||
"data": [
|
||||
{
|
||||
"object": "embedding",
|
||||
"embedding": embedding,
|
||||
"index": 0,
|
||||
}
|
||||
],
|
||||
"model": self.model_path,
|
||||
"data": data,
|
||||
"model": model_name,
|
||||
"usage": {
|
||||
"prompt_tokens": n_tokens,
|
||||
"total_tokens": n_tokens,
|
||||
"prompt_tokens": total_tokens,
|
||||
"total_tokens": total_tokens,
|
||||
},
|
||||
}
|
||||
|
||||
|
@ -570,35 +667,39 @@ class Llama:
|
|||
top_p: float = 0.95,
|
||||
logprobs: Optional[int] = None,
|
||||
echo: bool = False,
|
||||
stop: Optional[List[str]] = [],
|
||||
stop: Optional[Union[str, List[str]]] = [],
|
||||
frequency_penalty: float = 0.0,
|
||||
presence_penalty: float = 0.0,
|
||||
repeat_penalty: float = 1.1,
|
||||
top_k: int = 40,
|
||||
stream: bool = False,
|
||||
tfs_z: float = 1.0,
|
||||
mirostat_mode: int = 0,
|
||||
mirostat_tau: float = 5.0,
|
||||
mirostat_eta: float = 0.1,
|
||||
model: Optional[str] = None,
|
||||
stopping_criteria: Optional[StoppingCriteriaList] = None,
|
||||
logits_processor: Optional[LogitsProcessorList] = None,
|
||||
) -> Union[Iterator[Completion], Iterator[CompletionChunk]]:
|
||||
assert self.ctx is not None
|
||||
|
||||
completion_id: str = f"cmpl-{str(uuid.uuid4())}"
|
||||
created: int = int(time.time())
|
||||
completion_tokens: List[llama_cpp.llama_token] = []
|
||||
completion_tokens: List[int] = []
|
||||
# Add blank space to start of prompt to match OG llama tokenizer
|
||||
prompt_tokens: List[llama_cpp.llama_token] = self.tokenize(
|
||||
b" " + prompt.encode("utf-8")
|
||||
)
|
||||
prompt_tokens: List[int] = self.tokenize(b" " + prompt.encode("utf-8"))
|
||||
text: bytes = b""
|
||||
returned_characters: int = 0
|
||||
stop = stop if stop is not None else []
|
||||
returned_tokens: int = 0
|
||||
stop = (
|
||||
stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else []
|
||||
)
|
||||
model_name: str = model if model is not None else self.model_path
|
||||
|
||||
if self.verbose:
|
||||
llama_cpp.llama_reset_timings(self.ctx)
|
||||
|
||||
if len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
|
||||
raise ValueError(
|
||||
f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
|
||||
)
|
||||
if len(prompt_tokens) + max_tokens > self._n_ctx:
|
||||
raise ValueError(f"Requested tokens exceed context window of {self._n_ctx}")
|
||||
|
||||
if stop != []:
|
||||
stop_sequences = [s.encode("utf-8") for s in stop]
|
||||
|
@ -634,14 +735,17 @@ class Llama:
|
|||
top_k=top_k,
|
||||
top_p=top_p,
|
||||
temp=temperature,
|
||||
tfs_z=tfs_z,
|
||||
mirostat_mode=mirostat_mode,
|
||||
mirostat_tau=mirostat_tau,
|
||||
mirostat_eta=mirostat_eta,
|
||||
frequency_penalty=frequency_penalty,
|
||||
presence_penalty=presence_penalty,
|
||||
repeat_penalty=repeat_penalty,
|
||||
stopping_criteria=stopping_criteria,
|
||||
logits_processor=logits_processor,
|
||||
):
|
||||
if token == llama_cpp.llama_token_eos():
|
||||
if token == self._token_eos:
|
||||
text = self.detokenize(completion_tokens)
|
||||
finish_reason = "stop"
|
||||
break
|
||||
|
@ -671,63 +775,189 @@ class Llama:
|
|||
break
|
||||
|
||||
if stream:
|
||||
start = returned_characters
|
||||
longest = 0
|
||||
# We want to avoid yielding any characters from
|
||||
# the generated text if they are part of a stop
|
||||
# sequence.
|
||||
first_stop_position = 0
|
||||
for s in stop_sequences:
|
||||
for i in range(len(s), 0, -1):
|
||||
if all_text.endswith(s[:i]):
|
||||
if i > longest:
|
||||
longest = i
|
||||
if i > first_stop_position:
|
||||
first_stop_position = i
|
||||
break
|
||||
text = all_text[: len(all_text) - longest]
|
||||
returned_characters += len(text[start:])
|
||||
yield {
|
||||
"id": completion_id,
|
||||
"object": "text_completion",
|
||||
"created": created,
|
||||
"model": self.model_path,
|
||||
"choices": [
|
||||
{
|
||||
"text": text[start:].decode("utf-8", errors="ignore"),
|
||||
"index": 0,
|
||||
"logprobs": None,
|
||||
"finish_reason": None,
|
||||
|
||||
token_end_position = 0
|
||||
remaining_tokens = completion_tokens[returned_tokens:]
|
||||
remaining_length = len(self.detokenize(remaining_tokens))
|
||||
for token in remaining_tokens:
|
||||
token_end_position += len(self.detokenize([token]))
|
||||
# Check if stop sequence is in the token
|
||||
if token_end_position >= (
|
||||
remaining_length - first_stop_position - 1
|
||||
):
|
||||
break
|
||||
logprobs_or_none: Optional[CompletionLogprobs] = None
|
||||
if logprobs is not None:
|
||||
token_str = self.detokenize([token]).decode(
|
||||
"utf-8", errors="ignore"
|
||||
)
|
||||
text_offset = len(prompt) + len(
|
||||
self.detokenize(completion_tokens[:returned_tokens])
|
||||
)
|
||||
token_offset = len(prompt_tokens) + returned_tokens
|
||||
logits = self.eval_logits[token_offset - 1]
|
||||
current_logprobs = Llama.logits_to_logprobs(logits)
|
||||
sorted_logprobs = list(
|
||||
sorted(
|
||||
zip(current_logprobs, range(len(current_logprobs))),
|
||||
reverse=True,
|
||||
)
|
||||
)
|
||||
top_logprob = {
|
||||
self.detokenize([i]).decode(
|
||||
"utf-8", errors="ignore"
|
||||
): logprob
|
||||
for logprob, i in sorted_logprobs[:logprobs]
|
||||
}
|
||||
],
|
||||
}
|
||||
top_logprob.update({token_str: current_logprobs[int(token)]})
|
||||
logprobs_or_none = {
|
||||
"tokens": [
|
||||
self.detokenize([token]).decode(
|
||||
"utf-8", errors="ignore"
|
||||
)
|
||||
],
|
||||
"text_offset": [text_offset],
|
||||
"token_logprobs": [sorted_logprobs[int(token)][0]],
|
||||
"top_logprobs": [top_logprob],
|
||||
}
|
||||
returned_tokens += 1
|
||||
yield {
|
||||
"id": completion_id,
|
||||
"object": "text_completion",
|
||||
"created": created,
|
||||
"model": model_name,
|
||||
"choices": [
|
||||
{
|
||||
"text": self.detokenize([token]).decode(
|
||||
"utf-8", errors="ignore"
|
||||
),
|
||||
"index": 0,
|
||||
"logprobs": logprobs_or_none,
|
||||
"finish_reason": None,
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
if len(completion_tokens) >= max_tokens:
|
||||
text = self.detokenize(completion_tokens)
|
||||
finish_reason = "length"
|
||||
break
|
||||
|
||||
if stopping_criteria is not None and stopping_criteria(
|
||||
list(self.eval_tokens), self.eval_logits[-1]
|
||||
):
|
||||
text = self.detokenize(completion_tokens)
|
||||
finish_reason = "stop"
|
||||
|
||||
if self.verbose:
|
||||
llama_cpp.llama_print_timings(self.ctx)
|
||||
|
||||
if stream:
|
||||
remaining_tokens = completion_tokens[returned_tokens:]
|
||||
all_text = self.detokenize(remaining_tokens)
|
||||
any_stop = [s for s in stop_sequences if s in all_text]
|
||||
if len(any_stop) > 0:
|
||||
end = min(all_text.index(stop) for stop in any_stop)
|
||||
else:
|
||||
end = len(all_text)
|
||||
|
||||
token_end_position = 0
|
||||
for token in remaining_tokens:
|
||||
token_end_position += len(self.detokenize([token]))
|
||||
|
||||
logprobs_or_none: Optional[CompletionLogprobs] = None
|
||||
if logprobs is not None:
|
||||
token_str = self.detokenize([token]).decode(
|
||||
"utf-8", errors="ignore"
|
||||
)
|
||||
text_offset = len(prompt) + len(
|
||||
self.detokenize(completion_tokens[:returned_tokens])
|
||||
)
|
||||
token_offset = len(prompt_tokens) + returned_tokens - 1
|
||||
logits = self.eval_logits[token_offset]
|
||||
current_logprobs = Llama.logits_to_logprobs(logits)
|
||||
sorted_logprobs = list(
|
||||
sorted(
|
||||
zip(current_logprobs, range(len(current_logprobs))),
|
||||
reverse=True,
|
||||
)
|
||||
)
|
||||
top_logprob = {
|
||||
self.detokenize([i]).decode("utf-8", errors="ignore"): logprob
|
||||
for logprob, i in sorted_logprobs[:logprobs]
|
||||
}
|
||||
top_logprob.update({token_str: current_logprobs[int(token)]})
|
||||
logprobs_or_none = {
|
||||
"tokens": [
|
||||
self.detokenize([token]).decode("utf-8", errors="ignore")
|
||||
],
|
||||
"text_offset": [text_offset],
|
||||
"token_logprobs": [sorted_logprobs[int(token)][0]],
|
||||
"top_logprobs": [top_logprob],
|
||||
}
|
||||
|
||||
if token_end_position >= end:
|
||||
last_text = self.detokenize([token])
|
||||
if token_end_position == end - 1:
|
||||
break
|
||||
returned_tokens += 1
|
||||
yield {
|
||||
"id": completion_id,
|
||||
"object": "text_completion",
|
||||
"created": created,
|
||||
"model": model_name,
|
||||
"choices": [
|
||||
{
|
||||
"text": last_text[
|
||||
: len(last_text) - (token_end_position - end)
|
||||
].decode("utf-8", errors="ignore"),
|
||||
"index": 0,
|
||||
"logprobs": logprobs_or_none,
|
||||
"finish_reason": finish_reason,
|
||||
}
|
||||
],
|
||||
}
|
||||
break
|
||||
returned_tokens += 1
|
||||
yield {
|
||||
"id": completion_id,
|
||||
"object": "text_completion",
|
||||
"created": created,
|
||||
"model": model_name,
|
||||
"choices": [
|
||||
{
|
||||
"text": self.detokenize([token]).decode(
|
||||
"utf-8", errors="ignore"
|
||||
),
|
||||
"index": 0,
|
||||
"logprobs": logprobs_or_none,
|
||||
"finish_reason": finish_reason
|
||||
if returned_tokens == len(completion_tokens)
|
||||
else None,
|
||||
}
|
||||
],
|
||||
}
|
||||
if self.cache:
|
||||
if self.verbose:
|
||||
print("Llama._create_completion: cache save", file=sys.stderr)
|
||||
self.cache[prompt_tokens + completion_tokens] = self.save_state()
|
||||
return
|
||||
|
||||
if self.cache:
|
||||
if self.verbose:
|
||||
print("Llama._create_completion: cache save", file=sys.stderr)
|
||||
self.cache[prompt_tokens + completion_tokens] = self.save_state()
|
||||
|
||||
if stream:
|
||||
yield {
|
||||
"id": completion_id,
|
||||
"object": "text_completion",
|
||||
"created": created,
|
||||
"model": self.model_path,
|
||||
"choices": [
|
||||
{
|
||||
"text": text[returned_characters:].decode(
|
||||
"utf-8", errors="ignore"
|
||||
),
|
||||
"index": 0,
|
||||
"logprobs": None,
|
||||
"finish_reason": finish_reason,
|
||||
}
|
||||
],
|
||||
}
|
||||
return
|
||||
|
||||
text_str = text.decode("utf-8", errors="ignore")
|
||||
|
||||
if echo:
|
||||
|
@ -738,13 +968,19 @@ class Llama:
|
|||
|
||||
logprobs_or_none: Optional[CompletionLogprobs] = None
|
||||
if logprobs is not None:
|
||||
text_offset = 0
|
||||
text_offset = 0 if echo else len(prompt)
|
||||
token_offset = 0 if echo else len(prompt_tokens[1:])
|
||||
text_offsets: List[int] = []
|
||||
token_logprobs: List[float] = []
|
||||
token_logprobs: List[Optional[float]] = []
|
||||
tokens: List[str] = []
|
||||
top_logprobs: List[Dict[str, float]] = []
|
||||
top_logprobs: List[Optional[Dict[str, float]]] = []
|
||||
|
||||
if echo:
|
||||
# Remove leading BOS token
|
||||
all_tokens = prompt_tokens[1:] + completion_tokens
|
||||
else:
|
||||
all_tokens = completion_tokens
|
||||
|
||||
all_tokens = prompt_tokens + completion_tokens
|
||||
all_token_strs = [
|
||||
self.detokenize([token]).decode("utf-8", errors="ignore")
|
||||
for token in all_tokens
|
||||
|
@ -752,7 +988,7 @@ class Llama:
|
|||
all_logprobs = [
|
||||
Llama.logits_to_logprobs(list(map(float, row)))
|
||||
for row in self.eval_logits
|
||||
]
|
||||
][token_offset:]
|
||||
for token, token_str, logprobs_token in zip(
|
||||
all_tokens, all_token_strs, all_logprobs
|
||||
):
|
||||
|
@ -765,14 +1001,18 @@ class Llama:
|
|||
)
|
||||
)
|
||||
token_logprobs.append(sorted_logprobs[int(token)][0])
|
||||
top_logprob = {
|
||||
self.detokenize([llama_cpp.llama_token(i)]).decode(
|
||||
"utf-8", errors="ignore"
|
||||
): logprob
|
||||
top_logprob: Optional[Dict[str, float]] = {
|
||||
self.detokenize([i]).decode("utf-8", errors="ignore"): logprob
|
||||
for logprob, i in sorted_logprobs[:logprobs]
|
||||
}
|
||||
top_logprob.update({token_str: sorted_logprobs[int(token)][0]})
|
||||
top_logprob.update({token_str: logprobs_token[int(token)]})
|
||||
top_logprobs.append(top_logprob)
|
||||
# Weird idiosyncrasy of the OpenAI API where
|
||||
# token_logprobs and top_logprobs are null for
|
||||
# the first token.
|
||||
if echo and len(all_tokens) > 0:
|
||||
token_logprobs[0] = None
|
||||
top_logprobs[0] = None
|
||||
logprobs_or_none = {
|
||||
"tokens": tokens,
|
||||
"text_offset": text_offsets,
|
||||
|
@ -780,14 +1020,11 @@ class Llama:
|
|||
"top_logprobs": top_logprobs,
|
||||
}
|
||||
|
||||
if self.verbose:
|
||||
llama_cpp.llama_print_timings(self.ctx)
|
||||
|
||||
yield {
|
||||
"id": completion_id,
|
||||
"object": "text_completion",
|
||||
"created": created,
|
||||
"model": self.model_path,
|
||||
"model": model_name,
|
||||
"choices": [
|
||||
{
|
||||
"text": text_str,
|
||||
|
@ -812,15 +1049,19 @@ class Llama:
|
|||
top_p: float = 0.95,
|
||||
logprobs: Optional[int] = None,
|
||||
echo: bool = False,
|
||||
stop: Optional[List[str]] = [],
|
||||
stop: Optional[Union[str, List[str]]] = [],
|
||||
frequency_penalty: float = 0.0,
|
||||
presence_penalty: float = 0.0,
|
||||
repeat_penalty: float = 1.1,
|
||||
top_k: int = 40,
|
||||
stream: bool = False,
|
||||
tfs_z: float = 1.0,
|
||||
mirostat_mode: int = 0,
|
||||
mirostat_tau: float = 5.0,
|
||||
mirostat_eta: float = 0.1,
|
||||
model: Optional[str] = None,
|
||||
stopping_criteria: Optional[StoppingCriteriaList] = None,
|
||||
logits_processor: Optional[LogitsProcessorList] = None,
|
||||
) -> Union[Completion, Iterator[CompletionChunk]]:
|
||||
"""Generate text from a prompt.
|
||||
|
||||
|
@ -858,9 +1099,13 @@ class Llama:
|
|||
repeat_penalty=repeat_penalty,
|
||||
top_k=top_k,
|
||||
stream=stream,
|
||||
tfs_z=tfs_z,
|
||||
mirostat_mode=mirostat_mode,
|
||||
mirostat_tau=mirostat_tau,
|
||||
mirostat_eta=mirostat_eta,
|
||||
model=model,
|
||||
stopping_criteria=stopping_criteria,
|
||||
logits_processor=logits_processor,
|
||||
)
|
||||
if stream:
|
||||
chunks: Iterator[CompletionChunk] = completion_or_chunks
|
||||
|
@ -877,15 +1122,19 @@ class Llama:
|
|||
top_p: float = 0.95,
|
||||
logprobs: Optional[int] = None,
|
||||
echo: bool = False,
|
||||
stop: Optional[List[str]] = [],
|
||||
stop: Optional[Union[str, List[str]]] = [],
|
||||
frequency_penalty: float = 0.0,
|
||||
presence_penalty: float = 0.0,
|
||||
repeat_penalty: float = 1.1,
|
||||
top_k: int = 40,
|
||||
stream: bool = False,
|
||||
tfs_z: float = 1.0,
|
||||
mirostat_mode: int = 0,
|
||||
mirostat_tau: float = 5.0,
|
||||
mirostat_eta: float = 0.1,
|
||||
model: Optional[str] = None,
|
||||
stopping_criteria: Optional[StoppingCriteriaList] = None,
|
||||
logits_processor: Optional[LogitsProcessorList] = None,
|
||||
) -> Union[Completion, Iterator[CompletionChunk]]:
|
||||
"""Generate text from a prompt.
|
||||
|
||||
|
@ -923,9 +1172,13 @@ class Llama:
|
|||
repeat_penalty=repeat_penalty,
|
||||
top_k=top_k,
|
||||
stream=stream,
|
||||
tfs_z=tfs_z,
|
||||
mirostat_mode=mirostat_mode,
|
||||
mirostat_tau=mirostat_tau,
|
||||
mirostat_eta=mirostat_eta,
|
||||
model=model,
|
||||
stopping_criteria=stopping_criteria,
|
||||
logits_processor=logits_processor,
|
||||
)
|
||||
|
||||
def _convert_text_completion_to_chat(
|
||||
|
@ -993,14 +1246,16 @@ class Llama:
|
|||
top_p: float = 0.95,
|
||||
top_k: int = 40,
|
||||
stream: bool = False,
|
||||
stop: Optional[List[str]] = [],
|
||||
stop: Optional[Union[str, List[str]]] = [],
|
||||
max_tokens: int = 256,
|
||||
presence_penalty: float = 0.0,
|
||||
frequency_penalty: float = 0.0,
|
||||
repeat_penalty: float = 1.1,
|
||||
tfs_z: float = 1.0,
|
||||
mirostat_mode: int = 0,
|
||||
mirostat_tau: float = 5.0,
|
||||
mirostat_eta: float = 0.1,
|
||||
model: Optional[str] = None,
|
||||
) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
|
||||
"""Generate a chat completion from a list of messages.
|
||||
|
||||
|
@ -1017,7 +1272,9 @@ class Llama:
|
|||
Returns:
|
||||
Generated chat completion or a stream of chat completion chunks.
|
||||
"""
|
||||
stop = stop if stop is not None else []
|
||||
stop = (
|
||||
stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else []
|
||||
)
|
||||
chat_history = "".join(
|
||||
f'### {"Human" if message["role"] == "user" else "Assistant"}:{message["content"]}'
|
||||
for message in messages
|
||||
|
@ -1035,9 +1292,11 @@ class Llama:
|
|||
repeat_penalty=repeat_penalty,
|
||||
presence_penalty=presence_penalty,
|
||||
frequency_penalty=frequency_penalty,
|
||||
tfs_z=tfs_z,
|
||||
mirostat_mode=mirostat_mode,
|
||||
mirostat_tau=mirostat_tau,
|
||||
mirostat_eta=mirostat_eta,
|
||||
model=model,
|
||||
)
|
||||
if stream:
|
||||
chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore
|
||||
|
@ -1056,7 +1315,7 @@ class Llama:
|
|||
verbose=self.verbose,
|
||||
model_path=self.model_path,
|
||||
n_ctx=self.params.n_ctx,
|
||||
n_parts=self.params.n_parts,
|
||||
n_gpu_layers=self.params.n_gpu_layers,
|
||||
seed=self.params.seed,
|
||||
f16_kv=self.params.f16_kv,
|
||||
logits_all=self.params.logits_all,
|
||||
|
@ -1069,6 +1328,9 @@ class Llama:
|
|||
n_threads=self.n_threads,
|
||||
lora_base=self.lora_base,
|
||||
lora_path=self.lora_path,
|
||||
### DEPRECATED ###
|
||||
n_parts=self.n_parts,
|
||||
### DEPRECATED ###
|
||||
)
|
||||
|
||||
def __setstate__(self, state):
|
||||
|
@ -1076,6 +1338,7 @@ class Llama:
|
|||
model_path=state["model_path"],
|
||||
n_ctx=state["n_ctx"],
|
||||
n_parts=state["n_parts"],
|
||||
n_gpu_layers=state["n_gpu_layers"],
|
||||
seed=state["seed"],
|
||||
f16_kv=state["f16_kv"],
|
||||
logits_all=state["logits_all"],
|
||||
|
@ -1120,16 +1383,41 @@ class Llama:
|
|||
if llama_cpp.llama_set_state_data(self.ctx, state.llama_state) != state_size:
|
||||
raise RuntimeError("Failed to set llama state data")
|
||||
|
||||
def n_ctx(self) -> int:
    """Report the context window size of the loaded context.

    Returns:
        Whatever ``llama_cpp.llama_n_ctx`` returns for ``self.ctx``.

    Raises:
        AssertionError: If ``self.ctx`` is ``None``.
    """
    ctx = self.ctx
    # The C call needs a live context handle.
    assert ctx is not None
    return llama_cpp.llama_n_ctx(ctx)
|
||||
|
||||
def n_embd(self) -> int:
    """Report the embedding size of the loaded context.

    Returns:
        Whatever ``llama_cpp.llama_n_embd`` returns for ``self.ctx``.

    Raises:
        AssertionError: If ``self.ctx`` is ``None``.
    """
    ctx = self.ctx
    # The C call needs a live context handle.
    assert ctx is not None
    return llama_cpp.llama_n_embd(ctx)
|
||||
|
||||
def n_vocab(self) -> int:
    """Report the vocabulary size of the loaded context.

    Returns:
        Whatever ``llama_cpp.llama_n_vocab`` returns for ``self.ctx``.

    Raises:
        AssertionError: If ``self.ctx`` is ``None``.
    """
    ctx = self.ctx
    # The C call needs a live context handle.
    assert ctx is not None
    return llama_cpp.llama_n_vocab(ctx)
|
||||
|
||||
def tokenizer(self) -> "LlamaTokenizer":
    """Build a tokenizer bound to this model.

    Returns:
        A new ``LlamaTokenizer`` wrapping this instance.

    Raises:
        AssertionError: If ``self.ctx`` is ``None``.
    """
    assert self.ctx is not None
    bound_tokenizer = LlamaTokenizer(self)
    return bound_tokenizer
|
||||
|
||||
@staticmethod
|
||||
def token_eos() -> llama_cpp.llama_token:
|
||||
def token_eos() -> int:
|
||||
"""Return the end-of-sequence token."""
|
||||
return llama_cpp.llama_token_eos()
|
||||
|
||||
@staticmethod
|
||||
def token_bos() -> llama_cpp.llama_token:
|
||||
def token_bos() -> int:
|
||||
"""Return the beginning-of-sequence token."""
|
||||
return llama_cpp.llama_token_bos()
|
||||
|
||||
@staticmethod
def token_nl() -> int:
    """Look up the newline token via ``llama_cpp.llama_token_nl``."""
    newline_token = llama_cpp.llama_token_nl()
    return newline_token
|
||||
|
||||
@staticmethod
|
||||
def logits_to_logprobs(logits: List[float]) -> List[float]:
|
||||
exps = [math.exp(float(x)) for x in logits]
|
||||
|
@ -1137,9 +1425,7 @@ class Llama:
|
|||
return [math.log(x / sum_exps) for x in exps]
|
||||
|
||||
@staticmethod
|
||||
def longest_token_prefix(
|
||||
a: Sequence[llama_cpp.llama_token], b: Sequence[llama_cpp.llama_token]
|
||||
):
|
||||
def longest_token_prefix(a: Sequence[int], b: Sequence[int]):
|
||||
longest_prefix = 0
|
||||
for _a, _b in zip(a, b):
|
||||
if _a == _b:
|
||||
|
@ -1147,3 +1433,20 @@ class Llama:
|
|||
else:
|
||||
break
|
||||
return longest_prefix
|
||||
|
||||
|
||||
class LlamaTokenizer:
    """Text <-> token-id adapter that delegates to a ``Llama`` instance."""

    def __init__(self, llama: Llama):
        # Model whose tokenize/detokenize methods do the actual work.
        self.llama = llama

    def encode(self, text: str, add_bos: bool = True) -> List[int]:
        """Tokenize *text*.

        The string is UTF-8 encoded first; characters that cannot be
        encoded are dropped (``errors="ignore"``).

        Args:
            text: String to tokenize.
            add_bos: Forwarded unchanged to ``Llama.tokenize``.

        Returns:
            The result of ``Llama.tokenize`` for the encoded bytes.
        """
        raw = text.encode("utf-8", errors="ignore")
        return self.llama.tokenize(raw, add_bos=add_bos)

    def decode(self, tokens: List[int]) -> str:
        """Detokenize *tokens* and decode the bytes as UTF-8.

        Invalid byte sequences are skipped (``errors="ignore"``).
        """
        raw = self.llama.detokenize(tokens)
        return raw.decode("utf-8", errors="ignore")

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        """Create a tokenizer from a model file opened with ``vocab_only=True``."""
        model = Llama(model_path=path, vocab_only=True)
        return cls(model)
|
||||
|
|
|
@ -44,15 +44,20 @@ def _load_shared_library(lib_base_name: str):
|
|||
_base_path = _lib.parent.resolve()
|
||||
_lib_paths = [_lib.resolve()]
|
||||
|
||||
cdll_args = dict() # type: ignore
|
||||
# Add the library directory to the DLL search path on Windows (if needed)
|
||||
if sys.platform == "win32" and sys.version_info >= (3, 8):
|
||||
os.add_dll_directory(str(_base_path))
|
||||
if "CUDA_PATH" in os.environ:
|
||||
os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
|
||||
os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
|
||||
cdll_args["winmode"] = 0
|
||||
|
||||
# Try to load the shared library, handling potential errors
|
||||
for _lib_path in _lib_paths:
|
||||
if _lib_path.exists():
|
||||
try:
|
||||
return ctypes.CDLL(str(_lib_path))
|
||||
return ctypes.CDLL(str(_lib_path), **cdll_args)
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}")
|
||||
|
||||
|
@ -67,31 +72,61 @@ _lib_base_name = "llama"
|
|||
# Load the library
|
||||
_lib = _load_shared_library(_lib_base_name)
|
||||
|
||||
# C types
|
||||
LLAMA_FILE_VERSION = c_int(1)
|
||||
LLAMA_FILE_MAGIC = b"ggjt"
|
||||
LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml"
|
||||
LLAMA_SESSION_MAGIC = b"ggsn"
|
||||
# Misc
|
||||
c_float_p = POINTER(c_float)
|
||||
c_uint8_p = POINTER(c_uint8)
|
||||
c_size_t_p = POINTER(c_size_t)
|
||||
|
||||
# llama.h bindings
|
||||
|
||||
# #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
|
||||
LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74)
|
||||
# #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
|
||||
LLAMA_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61)
|
||||
# #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
|
||||
LLAMA_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66)
|
||||
# #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
|
||||
LLAMA_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C)
|
||||
# #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
||||
LLAMA_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E)
|
||||
|
||||
# #define LLAMA_FILE_VERSION 3
|
||||
LLAMA_FILE_VERSION = c_int(3)
|
||||
LLAMA_FILE_MAGIC = LLAMA_FILE_MAGIC_GGJT
|
||||
LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML
|
||||
LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
|
||||
LLAMA_SESSION_VERSION = c_int(1)
|
||||
|
||||
# struct llama_context;
|
||||
llama_context_p = c_void_p
|
||||
|
||||
|
||||
# typedef int llama_token;
|
||||
llama_token = c_int
|
||||
llama_token_p = POINTER(llama_token)
|
||||
|
||||
|
||||
# typedef struct llama_token_data {
|
||||
# llama_token id; // token id
|
||||
# float logit; // log-odds of the token
|
||||
# float p; // probability of the token
|
||||
# } llama_token_data;
|
||||
class llama_token_data(Structure):
|
||||
_fields_ = [
|
||||
("id", llama_token), # token id
|
||||
("logit", c_float), # log-odds of the token
|
||||
("p", c_float), # probability of the token
|
||||
("id", llama_token),
|
||||
("logit", c_float),
|
||||
("p", c_float),
|
||||
]
|
||||
|
||||
|
||||
llama_token_data_p = POINTER(llama_token_data)
|
||||
|
||||
|
||||
# typedef struct llama_token_data_array {
|
||||
# llama_token_data * data;
|
||||
# size_t size;
|
||||
# bool sorted;
|
||||
# } llama_token_data_array;
|
||||
class llama_token_data_array(Structure):
|
||||
_fields_ = [
|
||||
("data", llama_token_data_p),
|
||||
|
@ -102,53 +137,72 @@ class llama_token_data_array(Structure):
|
|||
|
||||
llama_token_data_array_p = POINTER(llama_token_data_array)
|
||||
|
||||
# typedef void (*llama_progress_callback)(float progress, void *ctx);
|
||||
llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
|
||||
|
||||
|
||||
# struct llama_context_params {
|
||||
# int n_ctx; // text context
|
||||
# int n_gpu_layers; // number of layers to store in VRAM
|
||||
# int seed; // RNG seed, -1 for random
|
||||
|
||||
# bool f16_kv; // use fp16 for KV cache
|
||||
# bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
||||
# bool vocab_only; // only load the vocabulary, no weights
|
||||
# bool use_mmap; // use mmap if possible
|
||||
# bool use_mlock; // force system to keep model in RAM
|
||||
# bool embedding; // embedding mode only
|
||||
|
||||
|
||||
# // called with a progress value between 0 and 1, pass NULL to disable
|
||||
# llama_progress_callback progress_callback;
|
||||
# // context pointer passed to the progress callback
|
||||
# void * progress_callback_user_data;
|
||||
# };
|
||||
class llama_context_params(Structure):
|
||||
_fields_ = [
|
||||
("n_ctx", c_int), # text context
|
||||
("n_parts", c_int), # -1 for default
|
||||
("seed", c_int), # RNG seed, 0 for random
|
||||
("f16_kv", c_bool), # use fp16 for KV cache
|
||||
("n_ctx", c_int),
|
||||
("n_gpu_layers", c_int),
|
||||
("seed", c_int),
|
||||
("f16_kv", c_bool),
|
||||
(
|
||||
"logits_all",
|
||||
c_bool,
|
||||
), # the llama_eval() call computes all logits, not just the last one
|
||||
("vocab_only", c_bool), # only load the vocabulary, no weights
|
||||
("use_mmap", c_bool), # use mmap if possible
|
||||
("use_mlock", c_bool), # force system to keep model in RAM
|
||||
("embedding", c_bool), # embedding mode only
|
||||
# called with a progress value between 0 and 1, pass NULL to disable
|
||||
),
|
||||
("vocab_only", c_bool),
|
||||
("use_mmap", c_bool),
|
||||
("use_mlock", c_bool),
|
||||
("embedding", c_bool),
|
||||
("progress_callback", llama_progress_callback),
|
||||
# context pointer passed to the progress callback
|
||||
("progress_callback_user_data", c_void_p),
|
||||
]
|
||||
|
||||
|
||||
llama_context_params_p = POINTER(llama_context_params)
|
||||
|
||||
# enum llama_ftype {
|
||||
# LLAMA_FTYPE_ALL_F32 = 0,
|
||||
# LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
||||
# LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||
# LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||
# LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||
# // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
|
||||
# // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
|
||||
# LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||
# LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
||||
# LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
||||
# };
|
||||
LLAMA_FTYPE_ALL_F32 = c_int(0)
|
||||
LLAMA_FTYPE_MOSTLY_F16 = c_int(1) # except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) # except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) # except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(
|
||||
4
|
||||
) # tok_embeddings.weight and output.weight are F16
|
||||
LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors
|
||||
# LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6) # except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) # except 1d tensors
|
||||
|
||||
# Misc
|
||||
c_float_p = POINTER(c_float)
|
||||
c_uint8_p = POINTER(c_uint8)
|
||||
c_size_t_p = POINTER(c_size_t)
|
||||
|
||||
# Functions
|
||||
LLAMA_FTYPE_MOSTLY_F16 = c_int(1)
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2)
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3)
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4)
|
||||
LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7)
|
||||
LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8)
|
||||
LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9)
|
||||
|
||||
|
||||
# LLAMA_API struct llama_context_params llama_context_default_params();
|
||||
def llama_context_default_params() -> llama_context_params:
    """Fetch the ``llama_context_params`` struct returned by the C library's
    ``llama_context_default_params``."""
    defaults = _lib.llama_context_default_params()
    return defaults
|
||||
|
||||
|
@ -157,6 +211,7 @@ _lib.llama_context_default_params.argtypes = []
|
|||
_lib.llama_context_default_params.restype = llama_context_params
|
||||
|
||||
|
||||
# LLAMA_API bool llama_mmap_supported();
|
||||
def llama_mmap_supported() -> bool:
    """Return the result of the C library's ``llama_mmap_supported`` call."""
    supported = _lib.llama_mmap_supported()
    return supported
|
||||
|
||||
|
@ -165,6 +220,7 @@ _lib.llama_mmap_supported.argtypes = []
|
|||
_lib.llama_mmap_supported.restype = c_bool
|
||||
|
||||
|
||||
# LLAMA_API bool llama_mlock_supported();
|
||||
def llama_mlock_supported() -> bool:
    """Return the result of the C library's ``llama_mlock_supported`` call."""
    supported = _lib.llama_mlock_supported()
    return supported
|
||||
|
||||
|
@ -173,9 +229,33 @@ _lib.llama_mlock_supported.argtypes = []
|
|||
_lib.llama_mlock_supported.restype = c_bool
|
||||
|
||||
|
||||
# Various functions for loading a ggml llama model.
|
||||
# Allocate (almost) all memory needed for the model.
|
||||
# Return NULL on failure
|
||||
# // TODO: not great API - very likely to change
|
||||
# // Initialize the llama + ggml backend
|
||||
# // Call once at the start of the program
|
||||
# LLAMA_API void llama_init_backend();
|
||||
def llama_init_backend():
    """Initialize the llama + ggml backend.

    Per the C header comment, call this once at the start of the program.
    """
    result = _lib.llama_init_backend()
    return result
|
||||
|
||||
|
||||
_lib.llama_init_backend.argtypes = []
|
||||
_lib.llama_init_backend.restype = None
|
||||
|
||||
|
||||
# LLAMA_API int64_t llama_time_us();
|
||||
def llama_time_us() -> int:
    """Return the value of the C library's ``llama_time_us`` (an ``int64_t``).

    NOTE(review): the name suggests a timestamp in microseconds - confirm
    against llama.h.
    """
    timestamp = _lib.llama_time_us()
    return timestamp
|
||||
|
||||
|
||||
_lib.llama_time_us.argtypes = []
|
||||
_lib.llama_time_us.restype = ctypes.c_int64
|
||||
|
||||
|
||||
# // Various functions for loading a ggml llama model.
|
||||
# // Allocate (almost) all memory needed for the model.
|
||||
# // Return NULL on failure
|
||||
# LLAMA_API struct llama_context * llama_init_from_file(
|
||||
# const char * path_model,
|
||||
# struct llama_context_params params);
|
||||
def llama_init_from_file(
|
||||
path_model: bytes, params: llama_context_params
|
||||
) -> llama_context_p:
|
||||
|
@ -187,8 +267,9 @@ _lib.llama_init_from_file.restype = llama_context_p
|
|||
|
||||
|
||||
# Frees all allocated memory
|
||||
# LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
def llama_free(ctx: llama_context_p):
|
||||
_lib.llama_free(ctx)
|
||||
return _lib.llama_free(ctx)
|
||||
|
||||
|
||||
_lib.llama_free.argtypes = [llama_context_p]
|
||||
|
@ -198,9 +279,14 @@ _lib.llama_free.restype = None
|
|||
# TODO: not great API - very likely to change
|
||||
# Returns 0 on success
|
||||
# nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
|
||||
# LLAMA_API int llama_model_quantize(
|
||||
# const char * fname_inp,
|
||||
# const char * fname_out,
|
||||
# enum llama_ftype ftype,
|
||||
# int nthread);
|
||||
def llama_model_quantize(
|
||||
fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int
|
||||
) -> c_int:
|
||||
) -> int:
|
||||
return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread)
|
||||
|
||||
|
||||
|
@ -214,12 +300,17 @@ _lib.llama_model_quantize.restype = c_int
|
|||
# The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
||||
# will be applied on top of the previous one
|
||||
# Returns 0 on success
|
||||
# LLAMA_API int llama_apply_lora_from_file(
|
||||
# struct llama_context * ctx,
|
||||
# const char * path_lora,
|
||||
# const char * path_base_model,
|
||||
# int n_threads);
|
||||
def llama_apply_lora_from_file(
|
||||
ctx: llama_context_p,
|
||||
path_lora: c_char_p,
|
||||
path_base_model: c_char_p,
|
||||
n_threads: c_int,
|
||||
) -> c_int:
|
||||
) -> int:
|
||||
return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads)
|
||||
|
||||
|
||||
|
@ -228,7 +319,8 @@ _lib.llama_apply_lora_from_file.restype = c_int
|
|||
|
||||
|
||||
# Returns the number of tokens in the KV cache
|
||||
def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int:
|
||||
# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
|
||||
def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int:
|
||||
return _lib.llama_get_kv_cache_token_count(ctx)
|
||||
|
||||
|
||||
|
@ -237,6 +329,7 @@ _lib.llama_get_kv_cache_token_count.restype = c_int
|
|||
|
||||
|
||||
# Sets the current rng seed.
|
||||
# LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
|
||||
def llama_set_rng_seed(ctx: llama_context_p, seed: c_int):
    """Set the current RNG seed on *ctx*.

    Args:
        ctx: Context handle passed through to the C library.
        seed: Seed value passed through to the C library.
    """
    result = _lib.llama_set_rng_seed(ctx, seed)
    return result
|
||||
|
||||
|
@ -247,7 +340,8 @@ _lib.llama_set_rng_seed.restype = None
|
|||
|
||||
# Returns the maximum size in bytes of the state (rng, logits, embedding
|
||||
# and kv_cache) - will often be smaller after compacting tokens
|
||||
def llama_get_state_size(ctx: llama_context_p) -> c_size_t:
|
||||
# LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
|
||||
def llama_get_state_size(ctx: llama_context_p) -> int:
|
||||
return _lib.llama_get_state_size(ctx)
|
||||
|
||||
|
||||
|
@ -258,10 +352,11 @@ _lib.llama_get_state_size.restype = c_size_t
|
|||
# Copies the state to the specified destination address.
|
||||
# Destination needs to have allocated enough memory.
|
||||
# Returns the number of bytes copied
|
||||
# LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
|
||||
def llama_copy_state_data(
|
||||
ctx: llama_context_p, dest # type: Array[c_uint8]
|
||||
ctx: llama_context_p, dst # type: Array[c_uint8]
|
||||
) -> int:
|
||||
return _lib.llama_copy_state_data(ctx, dest)
|
||||
return _lib.llama_copy_state_data(ctx, dst)
|
||||
|
||||
|
||||
_lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p]
|
||||
|
@ -270,6 +365,7 @@ _lib.llama_copy_state_data.restype = c_size_t
|
|||
|
||||
# Set the state reading from the specified address
|
||||
# Returns the number of bytes read
|
||||
# LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
|
||||
def llama_set_state_data(
|
||||
ctx: llama_context_p, src # type: Array[c_uint8]
|
||||
) -> int:
|
||||
|
@ -281,13 +377,14 @@ _lib.llama_set_state_data.restype = c_size_t
|
|||
|
||||
|
||||
# Save/load session file
|
||||
# LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
|
||||
def llama_load_session_file(
|
||||
ctx: llama_context_p,
|
||||
path_session: bytes,
|
||||
tokens_out, # type: Array[llama_token]
|
||||
n_token_capacity: c_size_t,
|
||||
n_token_count_out, # type: _Pointer[c_size_t]
|
||||
) -> c_size_t:
|
||||
) -> int:
|
||||
return _lib.llama_load_session_file(
|
||||
ctx, path_session, tokens_out, n_token_capacity, n_token_count_out
|
||||
)
|
||||
|
@ -303,12 +400,13 @@ _lib.llama_load_session_file.argtypes = [
|
|||
_lib.llama_load_session_file.restype = c_size_t
|
||||
|
||||
|
||||
# LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
|
||||
def llama_save_session_file(
|
||||
ctx: llama_context_p,
|
||||
path_session: bytes,
|
||||
tokens, # type: Array[llama_token]
|
||||
n_token_count: c_size_t,
|
||||
) -> c_size_t:
|
||||
) -> int:
|
||||
return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count)
|
||||
|
||||
|
||||
|
@ -325,13 +423,19 @@ _lib.llama_save_session_file.restype = c_size_t
|
|||
# tokens + n_tokens is the provided batch of new tokens to process
|
||||
# n_past is the number of tokens to use from previous eval calls
|
||||
# Returns 0 on success
|
||||
# LLAMA_API int llama_eval(
|
||||
# struct llama_context * ctx,
|
||||
# const llama_token * tokens,
|
||||
# int n_tokens,
|
||||
# int n_past,
|
||||
# int n_threads);
|
||||
def llama_eval(
|
||||
ctx: llama_context_p,
|
||||
tokens, # type: Array[llama_token]
|
||||
n_tokens: c_int,
|
||||
n_past: c_int,
|
||||
n_threads: c_int,
|
||||
) -> c_int:
|
||||
) -> int:
|
||||
return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads)
|
||||
|
||||
|
||||
|
@ -344,13 +448,19 @@ _lib.llama_eval.restype = c_int
|
|||
# Returns the number of tokens on success, no more than n_max_tokens
|
||||
# Returns a negative number on failure - the number of tokens that would have been returned
|
||||
# TODO: not sure if correct
|
||||
# LLAMA_API int llama_tokenize(
|
||||
# struct llama_context * ctx,
|
||||
# const char * text,
|
||||
# llama_token * tokens,
|
||||
# int n_max_tokens,
|
||||
# bool add_bos);
|
||||
def llama_tokenize(
|
||||
ctx: llama_context_p,
|
||||
text: bytes,
|
||||
tokens, # type: Array[llama_token]
|
||||
n_max_tokens: c_int,
|
||||
add_bos: c_bool,
|
||||
) -> c_int:
|
||||
) -> int:
|
||||
return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos)
|
||||
|
||||
|
||||
|
@ -358,7 +468,8 @@ _lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int,
|
|||
_lib.llama_tokenize.restype = c_int
|
||||
|
||||
|
||||
def llama_n_vocab(ctx: llama_context_p) -> c_int:
|
||||
# LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
|
||||
def llama_n_vocab(ctx: llama_context_p) -> int:
|
||||
return _lib.llama_n_vocab(ctx)
|
||||
|
||||
|
||||
|
@ -366,7 +477,8 @@ _lib.llama_n_vocab.argtypes = [llama_context_p]
|
|||
_lib.llama_n_vocab.restype = c_int
|
||||
|
||||
|
||||
def llama_n_ctx(ctx: llama_context_p) -> c_int:
|
||||
# LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
|
||||
def llama_n_ctx(ctx: llama_context_p) -> int:
|
||||
return _lib.llama_n_ctx(ctx)
|
||||
|
||||
|
||||
|
@ -374,7 +486,8 @@ _lib.llama_n_ctx.argtypes = [llama_context_p]
|
|||
_lib.llama_n_ctx.restype = c_int
|
||||
|
||||
|
||||
def llama_n_embd(ctx: llama_context_p) -> c_int:
|
||||
# LLAMA_API int llama_n_embd (const struct llama_context * ctx);
|
||||
def llama_n_embd(ctx: llama_context_p) -> int:
|
||||
return _lib.llama_n_embd(ctx)
|
||||
|
||||
|
||||
|
@ -387,6 +500,7 @@ _lib.llama_n_embd.restype = c_int
|
|||
# Can be mutated in order to change the probabilities of the next token
|
||||
# Rows: n_tokens
|
||||
# Cols: n_vocab
|
||||
# LLAMA_API float * llama_get_logits(struct llama_context * ctx);
|
||||
def llama_get_logits(
|
||||
ctx: llama_context_p,
|
||||
): # type: (...) -> Array[float] # type: ignore
|
||||
|
@ -399,6 +513,7 @@ _lib.llama_get_logits.restype = c_float_p
|
|||
|
||||
# Get the embeddings for the input
|
||||
# shape: [n_embd] (1-dimensional)
|
||||
# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
def llama_get_embeddings(
|
||||
ctx: llama_context_p,
|
||||
): # type: (...) -> Array[float] # type: ignore
|
||||
|
@ -410,6 +525,7 @@ _lib.llama_get_embeddings.restype = c_float_p
|
|||
|
||||
|
||||
# Token Id -> String. Uses the vocabulary in the provided context
|
||||
# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
|
||||
def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes:
|
||||
return _lib.llama_token_to_str(ctx, token)
|
||||
|
||||
|
@ -420,7 +536,8 @@ _lib.llama_token_to_str.restype = c_char_p
|
|||
# Special tokens
|
||||
|
||||
|
||||
def llama_token_bos() -> llama_token:
|
||||
# LLAMA_API llama_token llama_token_bos();
|
||||
def llama_token_bos() -> int:
|
||||
return _lib.llama_token_bos()
|
||||
|
||||
|
||||
|
@ -428,7 +545,8 @@ _lib.llama_token_bos.argtypes = []
|
|||
_lib.llama_token_bos.restype = llama_token
|
||||
|
||||
|
||||
def llama_token_eos() -> llama_token:
|
||||
# LLAMA_API llama_token llama_token_eos();
|
||||
def llama_token_eos() -> int:
|
||||
return _lib.llama_token_eos()
|
||||
|
||||
|
||||
|
@ -436,7 +554,8 @@ _lib.llama_token_eos.argtypes = []
|
|||
_lib.llama_token_eos.restype = llama_token
|
||||
|
||||
|
||||
def llama_token_nl() -> llama_token:
|
||||
# LLAMA_API llama_token llama_token_nl();
|
||||
def llama_token_nl() -> int:
|
||||
return _lib.llama_token_nl()
|
||||
|
||||
|
||||
|
@ -448,6 +567,7 @@ _lib.llama_token_nl.restype = llama_token
|
|||
|
||||
|
||||
# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
|
||||
# LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
|
||||
def llama_sample_repetition_penalty(
|
||||
ctx: llama_context_p,
|
||||
candidates, # type: _Pointer[llama_token_data_array]
|
||||
|
@ -471,6 +591,7 @@ _lib.llama_sample_repetition_penalty.restype = None
|
|||
|
||||
|
||||
# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
|
||||
# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
|
||||
def llama_sample_frequency_and_presence_penalties(
|
||||
ctx: llama_context_p,
|
||||
candidates, # type: _Pointer[llama_token_data_array]
|
||||
|
@ -501,6 +622,7 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None
|
|||
|
||||
|
||||
# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
||||
# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
|
||||
def llama_sample_softmax(
|
||||
ctx: llama_context_p, candidates # type: _Pointer[llama_token_data]
|
||||
):
|
||||
|
@ -515,6 +637,7 @@ _lib.llama_sample_softmax.restype = None
|
|||
|
||||
|
||||
# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
|
||||
def llama_sample_top_k(
|
||||
ctx: llama_context_p,
|
||||
candidates, # type: _Pointer[llama_token_data_array]
|
||||
|
@ -534,6 +657,7 @@ _lib.llama_sample_top_k.restype = None
|
|||
|
||||
|
||||
# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
|
||||
def llama_sample_top_p(
|
||||
ctx: llama_context_p,
|
||||
candidates, # type: _Pointer[llama_token_data_array]
|
||||
|
@ -553,6 +677,7 @@ _lib.llama_sample_top_p.restype = None
|
|||
|
||||
|
||||
# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
||||
# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
|
||||
def llama_sample_tail_free(
|
||||
ctx: llama_context_p,
|
||||
candidates, # type: _Pointer[llama_token_data_array]
|
||||
|
@ -572,6 +697,7 @@ _lib.llama_sample_tail_free.restype = None
|
|||
|
||||
|
||||
# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
|
||||
# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
|
||||
def llama_sample_typical(
|
||||
ctx: llama_context_p,
|
||||
candidates, # type: _Pointer[llama_token_data_array]
|
||||
|
@ -590,6 +716,7 @@ _lib.llama_sample_typical.argtypes = [
|
|||
_lib.llama_sample_typical.restype = None
|
||||
|
||||
|
||||
# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
|
||||
def llama_sample_temperature(
|
||||
ctx: llama_context_p,
|
||||
candidates, # type: _Pointer[llama_token_data_array]
|
||||
|
@ -612,6 +739,7 @@ _lib.llama_sample_temperature.restype = None
|
|||
# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
||||
# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
|
||||
# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
||||
# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
|
||||
def llama_sample_token_mirostat(
|
||||
ctx: llama_context_p,
|
||||
candidates, # type: _Pointer[llama_token_data_array]
|
||||
|
@ -619,7 +747,7 @@ def llama_sample_token_mirostat(
|
|||
eta: c_float,
|
||||
m: c_int,
|
||||
mu, # type: _Pointer[c_float]
|
||||
) -> llama_token:
|
||||
) -> int:
|
||||
return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu)
|
||||
|
||||
|
||||
|
@ -639,13 +767,14 @@ _lib.llama_sample_token_mirostat.restype = llama_token
|
|||
# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
||||
# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
||||
# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
||||
# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
|
||||
def llama_sample_token_mirostat_v2(
|
||||
ctx: llama_context_p,
|
||||
candidates, # type: _Pointer[llama_token_data_array]
|
||||
tau: c_float,
|
||||
eta: c_float,
|
||||
mu, # type: _Pointer[c_float]
|
||||
) -> llama_token:
|
||||
) -> int:
|
||||
return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu)
|
||||
|
||||
|
||||
|
@ -660,10 +789,11 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token
|
|||
|
||||
|
||||
# @details Selects the token with the highest probability.
|
||||
# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
|
||||
def llama_sample_token_greedy(
|
||||
ctx: llama_context_p,
|
||||
candidates, # type: _Pointer[llama_token_data_array]
|
||||
) -> llama_token:
|
||||
) -> int:
|
||||
return _lib.llama_sample_token_greedy(ctx, candidates)
|
||||
|
||||
|
||||
|
@ -675,10 +805,11 @@ _lib.llama_sample_token_greedy.restype = llama_token
|
|||
|
||||
|
||||
# @details Randomly selects a token from the candidates based on their probabilities.
|
||||
# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
|
||||
def llama_sample_token(
|
||||
ctx: llama_context_p,
|
||||
candidates, # type: _Pointer[llama_token_data_array]
|
||||
) -> llama_token:
|
||||
) -> int:
|
||||
return _lib.llama_sample_token(ctx, candidates)
|
||||
|
||||
|
||||
|
@ -692,6 +823,7 @@ _lib.llama_sample_token.restype = llama_token
|
|||
# Performance information
|
||||
|
||||
|
||||
# LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
||||
def llama_print_timings(ctx: llama_context_p):
|
||||
_lib.llama_print_timings(ctx)
|
||||
|
||||
|
@ -700,6 +832,7 @@ _lib.llama_print_timings.argtypes = [llama_context_p]
|
|||
_lib.llama_print_timings.restype = None
|
||||
|
||||
|
||||
# LLAMA_API void llama_reset_timings(struct llama_context * ctx);
|
||||
def llama_reset_timings(ctx: llama_context_p):
|
||||
_lib.llama_reset_timings(ctx)
|
||||
|
||||
|
@ -709,9 +842,19 @@ _lib.llama_reset_timings.restype = None
|
|||
|
||||
|
||||
# Print system information
|
||||
# LLAMA_API const char * llama_print_system_info(void);
|
||||
def llama_print_system_info() -> bytes:
|
||||
return _lib.llama_print_system_info()
|
||||
|
||||
|
||||
_lib.llama_print_system_info.argtypes = []
|
||||
_lib.llama_print_system_info.restype = c_char_p
|
||||
|
||||
###################################################################################################
|
||||
|
||||
|
||||
_llama_initialized = False
|
||||
|
||||
if not _llama_initialized:
|
||||
llama_init_backend()
|
||||
_llama_initialized = True
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import List, Optional, Dict, Union
|
||||
from typing import List, Optional, Dict
|
||||
from typing_extensions import TypedDict, NotRequired, Literal
|
||||
|
||||
|
||||
|
@ -22,9 +22,9 @@ class Embedding(TypedDict):
|
|||
|
||||
class CompletionLogprobs(TypedDict):
|
||||
text_offset: List[int]
|
||||
token_logprobs: List[float]
|
||||
token_logprobs: List[Optional[float]]
|
||||
tokens: List[str]
|
||||
top_logprobs: List[Dict[str, float]]
|
||||
top_logprobs: List[Optional[Dict[str, float]]]
|
||||
|
||||
|
||||
class CompletionChoice(TypedDict):
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import json
|
||||
import logging
|
||||
import multiprocessing
|
||||
from threading import Lock
|
||||
from typing import List, Optional, Union, Iterator, Dict
|
||||
|
@ -16,7 +17,16 @@ class Settings(BaseSettings):
|
|||
model: str = Field(
|
||||
description="The path to the model to use for generating completions."
|
||||
)
|
||||
model_alias: Optional[str] = Field(
|
||||
default=None,
|
||||
description="The alias of the model to use for generating completions.",
|
||||
)
|
||||
n_ctx: int = Field(default=2048, ge=1, description="The context size.")
|
||||
n_gpu_layers: int = Field(
|
||||
default=0,
|
||||
ge=0,
|
||||
description="The number of layers to put on the GPU. The rest will be on the CPU.",
|
||||
)
|
||||
n_batch: int = Field(
|
||||
default=512, ge=1, description="The batch size to use per eval."
|
||||
)
|
||||
|
@ -59,6 +69,7 @@ class Settings(BaseSettings):
|
|||
|
||||
router = APIRouter()
|
||||
|
||||
settings: Optional[Settings] = None
|
||||
llama: Optional[llama_cpp.Llama] = None
|
||||
|
||||
|
||||
|
@ -80,6 +91,7 @@ def create_app(settings: Optional[Settings] = None):
|
|||
global llama
|
||||
llama = llama_cpp.Llama(
|
||||
model_path=settings.model,
|
||||
n_gpu_layers=settings.n_gpu_layers,
|
||||
f16_kv=settings.f16_kv,
|
||||
use_mlock=settings.use_mlock,
|
||||
use_mmap=settings.use_mmap,
|
||||
|
@ -95,6 +107,12 @@ def create_app(settings: Optional[Settings] = None):
|
|||
if settings.cache:
|
||||
cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size)
|
||||
llama.set_cache(cache)
|
||||
|
||||
def set_settings(_settings: Settings):
|
||||
global settings
|
||||
settings = _settings
|
||||
|
||||
set_settings(settings)
|
||||
return app
|
||||
|
||||
|
||||
|
@ -106,6 +124,10 @@ def get_llama():
|
|||
yield llama
|
||||
|
||||
|
||||
def get_settings():
|
||||
yield settings
|
||||
|
||||
|
||||
model_field = Field(description="The model to use for generating completions.")
|
||||
|
||||
max_tokens_field = Field(
|
||||
|
@ -152,9 +174,23 @@ repeat_penalty_field = Field(
|
|||
+ "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.",
|
||||
)
|
||||
|
||||
presence_penalty_field = Field(
|
||||
default=0.0,
|
||||
ge=-2.0,
|
||||
le=2.0,
|
||||
description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
|
||||
)
|
||||
|
||||
frequency_penalty_field = Field(
|
||||
default=0.0,
|
||||
ge=-2.0,
|
||||
le=2.0,
|
||||
description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
|
||||
)
|
||||
|
||||
|
||||
class CreateCompletionRequest(BaseModel):
|
||||
prompt: Optional[str] = Field(
|
||||
prompt: Union[str, List[str]] = Field(
|
||||
default="", description="The prompt to generate completions for."
|
||||
)
|
||||
suffix: Optional[str] = Field(
|
||||
|
@ -168,20 +204,20 @@ class CreateCompletionRequest(BaseModel):
|
|||
default=False,
|
||||
description="Whether to echo the prompt in the generated text. Useful for chatbots.",
|
||||
)
|
||||
stop: Optional[List[str]] = stop_field
|
||||
stop: Optional[Union[str, List[str]]] = stop_field
|
||||
stream: bool = stream_field
|
||||
logprobs: Optional[int] = Field(
|
||||
default=None,
|
||||
ge=0,
|
||||
description="The number of logprobs to generate. If None, no logprobs are generated.",
|
||||
)
|
||||
presence_penalty: Optional[float] = presence_penalty_field
|
||||
frequency_penalty: Optional[float] = frequency_penalty_field
|
||||
|
||||
# ignored or currently unsupported
|
||||
model: Optional[str] = model_field
|
||||
n: Optional[int] = 1
|
||||
logprobs: Optional[int] = Field(None)
|
||||
presence_penalty: Optional[float] = 0
|
||||
frequency_penalty: Optional[float] = 0
|
||||
best_of: Optional[int] = 1
|
||||
logit_bias: Optional[Dict[str, float]] = Field(None)
|
||||
user: Optional[str] = Field(None)
|
||||
|
@ -209,10 +245,13 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
|
|||
def create_completion(
|
||||
request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama)
|
||||
):
|
||||
if isinstance(request.prompt, list):
|
||||
assert len(request.prompt) <= 1
|
||||
request.prompt = request.prompt[0] if len(request.prompt) > 0 else ""
|
||||
|
||||
completion_or_chunks = llama(
|
||||
**request.dict(
|
||||
exclude={
|
||||
"model",
|
||||
"n",
|
||||
"best_of",
|
||||
"logit_bias",
|
||||
|
@ -221,15 +260,22 @@ def create_completion(
|
|||
)
|
||||
)
|
||||
if request.stream:
|
||||
|
||||
async def server_sent_events(
|
||||
chunks: Iterator[llama_cpp.CompletionChunk],
|
||||
):
|
||||
for chunk in chunks:
|
||||
yield dict(data=json.dumps(chunk))
|
||||
|
||||
chunks: Iterator[llama_cpp.CompletionChunk] = completion_or_chunks # type: ignore
|
||||
return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks)
|
||||
return EventSourceResponse(server_sent_events(chunks))
|
||||
completion: llama_cpp.Completion = completion_or_chunks # type: ignore
|
||||
return completion
|
||||
|
||||
|
||||
class CreateEmbeddingRequest(BaseModel):
|
||||
model: Optional[str] = model_field
|
||||
input: str = Field(description="The input to embed.")
|
||||
input: Union[str, List[str]] = Field(description="The input to embed.")
|
||||
user: Optional[str]
|
||||
|
||||
class Config:
|
||||
|
@ -250,7 +296,7 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)
|
|||
def create_embedding(
|
||||
request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
|
||||
):
|
||||
return llama.create_embedding(**request.dict(exclude={"model", "user"}))
|
||||
return llama.create_embedding(**request.dict(exclude={"user"}))
|
||||
|
||||
|
||||
class ChatCompletionRequestMessage(BaseModel):
|
||||
|
@ -269,12 +315,12 @@ class CreateChatCompletionRequest(BaseModel):
|
|||
top_p: float = top_p_field
|
||||
stop: Optional[List[str]] = stop_field
|
||||
stream: bool = stream_field
|
||||
presence_penalty: Optional[float] = presence_penalty_field
|
||||
frequency_penalty: Optional[float] = frequency_penalty_field
|
||||
|
||||
# ignored or currently unsupported
|
||||
model: Optional[str] = model_field
|
||||
n: Optional[int] = 1
|
||||
presence_penalty: Optional[float] = 0
|
||||
frequency_penalty: Optional[float] = 0
|
||||
logit_bias: Optional[Dict[str, float]] = Field(None)
|
||||
user: Optional[str] = Field(None)
|
||||
|
||||
|
@ -311,7 +357,6 @@ def create_chat_completion(
|
|||
completion_or_chunks = llama.create_chat_completion(
|
||||
**request.dict(
|
||||
exclude={
|
||||
"model",
|
||||
"n",
|
||||
"logit_bias",
|
||||
"user",
|
||||
|
@ -354,13 +399,16 @@ GetModelResponse = create_model_from_typeddict(ModelList)
|
|||
|
||||
@router.get("/v1/models", response_model=GetModelResponse)
|
||||
def get_models(
|
||||
settings: Settings = Depends(get_settings),
|
||||
llama: llama_cpp.Llama = Depends(get_llama),
|
||||
) -> ModelList:
|
||||
return {
|
||||
"object": "list",
|
||||
"data": [
|
||||
{
|
||||
"id": llama.model_path,
|
||||
"id": settings.model_alias
|
||||
if settings.model_alias is not None
|
||||
else llama.model_path,
|
||||
"object": "model",
|
||||
"owned_by": "me",
|
||||
"permissions": [],
|
||||
|
|
12
poetry.lock
generated
12
poetry.lock
generated
|
@ -463,14 +463,14 @@ socks = ["socksio (>=1.0.0,<2.0.0)"]
|
|||
|
||||
[[package]]
|
||||
name = "httpx"
|
||||
version = "0.24.0"
|
||||
version = "0.24.1"
|
||||
description = "The next generation HTTP client."
|
||||
category = "dev"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "httpx-0.24.0-py3-none-any.whl", hash = "sha256:447556b50c1921c351ea54b4fe79d91b724ed2b027462ab9a329465d147d5a4e"},
|
||||
{file = "httpx-0.24.0.tar.gz", hash = "sha256:507d676fc3e26110d41df7d35ebd8b3b8585052450f4097401c9be59d928c63e"},
|
||||
{file = "httpx-0.24.1-py3-none-any.whl", hash = "sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd"},
|
||||
{file = "httpx-0.24.1.tar.gz", hash = "sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -800,14 +800,14 @@ mkdocs = ">=1.1"
|
|||
|
||||
[[package]]
|
||||
name = "mkdocs-material"
|
||||
version = "9.1.11"
|
||||
version = "9.1.14"
|
||||
description = "Documentation that simply works"
|
||||
category = "dev"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "mkdocs_material-9.1.11-py3-none-any.whl", hash = "sha256:fbc86d50ec2cf34d40d5c4365780f290ceedde23f1a0704323b34e7f16b0c0dd"},
|
||||
{file = "mkdocs_material-9.1.11.tar.gz", hash = "sha256:f5d473eb79d6640a5e668d4b2ab5b9de5e76ae0a0e2d864112df0cfe9016dc1d"},
|
||||
{file = "mkdocs_material-9.1.14-py3-none-any.whl", hash = "sha256:b56a9f955ed32d38333715cbbf68ce38f683bf38610c65094fa4ef2db9f08bcd"},
|
||||
{file = "mkdocs_material-9.1.14.tar.gz", hash = "sha256:1ae74cc5464ef2f64574d4884512efed7f4db386fb9bc6af20fd427d7a702f49"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[tool.poetry]
|
||||
name = "llama_cpp_python"
|
||||
version = "0.1.48"
|
||||
version = "0.1.55"
|
||||
description = "Python bindings for the llama.cpp library"
|
||||
authors = ["Andrei Betlen <abetlen@gmail.com>"]
|
||||
license = "MIT"
|
||||
|
@ -24,9 +24,9 @@ black = "^23.3.0"
|
|||
twine = "^4.0.2"
|
||||
mkdocs = "^1.4.3"
|
||||
mkdocstrings = {extras = ["python"], version = "^0.21.2"}
|
||||
mkdocs-material = "^9.1.11"
|
||||
mkdocs-material = "^9.1.14"
|
||||
pytest = "^7.3.1"
|
||||
httpx = "^0.24.0"
|
||||
httpx = "^0.24.1"
|
||||
scikit-build = "0.13"
|
||||
|
||||
[tool.poetry.extras]
|
||||
|
|
2
setup.py
2
setup.py
|
@ -10,7 +10,7 @@ setup(
|
|||
description="A Python wrapper for llama.cpp",
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
version="0.1.48",
|
||||
version="0.1.55",
|
||||
author="Andrei Betlen",
|
||||
author_email="abetlen@gmail.com",
|
||||
license="MIT",
|
||||
|
|
|
@ -17,7 +17,7 @@ def test_llama():
|
|||
# @pytest.mark.skip(reason="need to update sample mocking")
|
||||
def test_llama_patch(monkeypatch):
|
||||
llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
|
||||
n_vocab = int(llama_cpp.llama_n_vocab(llama.ctx))
|
||||
n_vocab = llama_cpp.llama_n_vocab(llama.ctx)
|
||||
|
||||
## Set up mock function
|
||||
def mock_eval(*args, **kwargs):
|
||||
|
@ -107,7 +107,7 @@ def test_llama_pickle():
|
|||
|
||||
def test_utf8(monkeypatch):
|
||||
llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
|
||||
n_vocab = int(llama_cpp.llama_n_vocab(llama.ctx))
|
||||
n_vocab = llama_cpp.llama_n_vocab(llama.ctx)
|
||||
|
||||
## Set up mock function
|
||||
def mock_eval(*args, **kwargs):
|
||||
|
|
2
vendor/llama.cpp
vendored
2
vendor/llama.cpp
vendored
|
@ -1 +1 @@
|
|||
Subproject commit 1b0fd454650ef4d68a980e3225488b79e6e9af25
|
||||
Subproject commit 66874d4fbcc7866377246efbcee938e8cc9c7d76
|
Loading…
Add table
Reference in a new issue