Merge branch 'Maximilian-Winter/main' into main
This commit is contained in:
commit
6bd1075291
7 changed files with 288 additions and 1 deletions
51
docker/Dockerfile
Normal file
51
docker/Dockerfile
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
# Define the image argument and provide a default value
ARG IMAGE=python:3-slim-bullseye

# Use the image as specified
FROM ${IMAGE}

# Re-declare the ARG after FROM: an ARG declared before FROM is out of scope
# inside the build stage until it is declared again.
ARG IMAGE

# Persist the base image name into the container environment. start_server.sh
# tests $IMAGE when the container starts, and a build ARG on its own is not
# visible to the running container.
ENV IMAGE=${IMAGE}

# Update and upgrade the existing packages
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
    python3 \
    python3-pip \
    ninja-build \
    build-essential

RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette

# Perform the conditional installations based on the image
RUN echo "Image: ${IMAGE}" && \
    if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \
        echo "OpenBLAS install:" && \
        apt-get install -y --no-install-recommends libopenblas-dev && \
        LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \
    else \
        echo "CuBLAS install:" && \
        LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \
    fi

# Clean up apt cache
# NOTE: this runs in its own layer, so it tidies the final filesystem but does
# not shrink the earlier layers that already contain the package lists.
RUN rm -rf /var/lib/apt/lists/*

# Set a working directory for better clarity
WORKDIR /app

# Copy files to the app directory
RUN echo "Installing model...this can take some time..."
COPY ./model.bin /app/model.bin
COPY ./start_server.sh /app/start_server.sh

# Make the server start script executable
RUN chmod +x /app/start_server.sh

# Set environment variable for the host
ENV HOST=0.0.0.0

# Expose a port for the server
EXPOSE 8000

# Run the server start script
CMD ["/bin/sh", "/app/start_server.sh"]
|
46
docker/README.md
Normal file
46
docker/README.md
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
# Dockerfiles for building the llama-cpp-python server
|
||||||
|
- `Dockerfile.openblas_simple` - a simple Dockerfile for non-GPU OpenBLAS
|
||||||
|
- `Dockerfile.cuda_simple` - a simple Dockerfile for CUDA accelerated CuBLAS
|
||||||
|
- `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke](https://huggingface.co/TheBloke)
|
||||||
|
- `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`
|
||||||
|
|
||||||
|
# Get model from Hugging Face
|
||||||
|
`python3 ./hug_model.py`
|
||||||
|
|
||||||
|
You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
|
||||||
|
```
|
||||||
|
docker $ ls -lh *.bin
|
||||||
|
-rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>.q5_1.bin
|
||||||
|
lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> <downloaded-model-file>.q5_1.bin
|
||||||
|
```
|
||||||
|
**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least
|
||||||
|
**TWICE** as much disk space as the size of the model:
|
||||||
|
|
||||||
|
| Model | Quantized size |
|
||||||
|
|------:|----------------:|
|
||||||
|
| 7B | 5 GB |
|
||||||
|
| 13B | 10 GB |
|
||||||
|
| 30B | 25 GB |
|
||||||
|
| 65B | 50 GB |
|
||||||
|
|
||||||
|
**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
|
||||||
|
|
||||||
|
# Install Docker Server
|
||||||
|
|
||||||
|
**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
|
||||||
|
|
||||||
|
[Install Docker Engine](https://docs.docker.com/engine/install)
|
||||||
|
|
||||||
|
# Use OpenBLAS
|
||||||
|
Use this if you don't have an NVIDIA GPU. It defaults to the `python:3-slim-bullseye` Docker base image and OpenBLAS:
|
||||||
|
## Build:
|
||||||
|
`docker build -t openblas .`
|
||||||
|
## Run:
|
||||||
|
`docker run --cap-add SYS_RESOURCE -t openblas`
|
||||||
|
|
||||||
|
# Use CuBLAS
|
||||||
|
Requires an NVIDIA GPU with sufficient VRAM (approximately as much as the size above) and Docker NVIDIA support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
|
||||||
|
## Build:
|
||||||
|
`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .`
|
||||||
|
## Run:
|
||||||
|
`docker run --gpus all --cap-add SYS_RESOURCE -t cublas`
|
116
docker/hug_model.py
Normal file
116
docker/hug_model.py
Normal file
|
@ -0,0 +1,116 @@
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import struct
|
||||||
|
|
||||||
|
def make_request(url, params=None):
    """Send a GET request to *url* and return the decoded JSON body.

    Args:
        url: Address to query.
        params: Optional mapping of query-string parameters.

    Returns:
        The parsed JSON payload when the server answers with HTTP 200,
        otherwise ``None`` (a diagnostic line is printed in that case).
    """
    print(f"Making request to {url}...")
    try:
        # Without a timeout a stalled connection would block the script forever.
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException as exc:
        # Report transport errors the same way as bad status codes: print and
        # return None, which is what callers already check for.
        print(f"Request failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code}")
        return None
    return json.loads(response.text)
|
||||||
|
|
||||||
|
def check_magic_and_version(filename):
    """Print and return the magic number and version stored at the start of a file.

    The first six bytes are decoded as a little-endian unsigned 32-bit integer
    (the magic) followed by a little-endian unsigned 16-bit integer (the
    version).

    Args:
        filename: Path of the file to inspect.

    Returns:
        A ``(magic, version)`` tuple of ints.
    """
    header_layout = '<I H'  # uint32 magic, uint16 version, little-endian

    with open(filename, 'rb') as model_file:
        header = model_file.read(6)

    magic, version = struct.unpack(header_layout, header)
    print(f"magic: 0x{magic:08x}, version: 0x{version:04x}, file: {filename}")
    return magic, version
|
||||||
|
|
||||||
|
def download_file(url, destination):
    """Stream *url* into *destination* and point the ``model.bin`` symlink at it.

    Progress is shown by printing one dot for roughly every 10 MB written.

    Args:
        url: Address of the file to download.
        destination: Local path the downloaded bytes are written to.

    Returns:
        ``True`` when the file was downloaded and linked, ``False`` when the
        server did not answer with HTTP 200 (a diagnostic line is printed).
    """
    progress_step = 10485760  # 10 MB per progress dot
    chunk_size = 1024 * 1024  # 1 MiB reads keep per-chunk overhead low

    print(f"Downloading {url} to {destination}...")
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=30) as response:
        if response.status_code != 200:
            print(f"Download failed with status code {response.status_code}")
            return False

        with open(destination, 'wb') as f:
            since_last_dot = 0
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    since_last_dot += len(chunk)
                    if since_last_dot >= progress_step:
                        print('.', end='', flush=True)
                        since_last_dot = 0
    print("\nDownload complete.")

    # Creating a symbolic link from destination to "model.bin".
    # lexists() also reports a dangling symlink left by an earlier run;
    # isfile() follows the link and would miss it, making symlink() fail
    # with FileExistsError.
    if os.path.lexists("model.bin"):
        os.remove("model.bin")  # remove the existing link if any
    os.symlink(destination, "model.bin")
    return True
|
||||||
|
|
||||||
|
def get_user_choice(model_list):
    """Show a numbered menu of models and return the entry the user picks.

    Args:
        model_list: Sequence of ``(model_id, rfilename)`` pairs.

    Returns:
        The chosen ``(model_id, rfilename)`` pair, or ``None`` when the list
        is empty or the input is not a number within the menu range.
    """
    # Nothing to choose from: skip the prompt instead of asking for a number
    # that can never be valid.
    if not model_list:
        print("No models available to choose from.")
        return None

    # Print the enumerated list
    print("\n")
    for number, (model_id, rfilename) in enumerate(model_list, start=1):
        print(f"{number}: Model ID: {model_id}, RFilename: {rfilename}")

    # Get user's choice
    choice = input("Choose a model to download by entering the corresponding number: ")
    try:
        index = int(choice) - 1
    except ValueError:
        print("Invalid input. Please enter a number corresponding to a model.")
        return None

    # The explicit range check makes an IndexError impossible, so no handler
    # for it is needed.
    if 0 <= index < len(model_list):
        return model_list[index]

    print("Invalid choice.")
    return None
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
def main():
    """Let the user pick a ``q5_1`` model by TheBloke, download it and check its version.

    Command line:
        -v / --version  Integer file version expected in the downloaded model
                        (default 3). A warning is printed on mismatch.
    """
    # Create an argument parser
    parser = argparse.ArgumentParser(description='Process the model version.')
    parser.add_argument('-v', '--version', type=int, default=0x0003,
                        help='an integer for the version to be used')

    # Parse the arguments
    args = parser.parse_args()

    # Define the parameters
    params = {
        "author": "TheBloke",  # Filter by author
        "tags": "llama"
    }

    models = make_request('https://huggingface.co/api/models', params=params)
    if models is None:
        return

    # Collect every q5_1 file offered by each listed model
    model_list = []
    for model in models:
        model_id = model['id']
        model_info = make_request(f'https://huggingface.co/api/models/{model_id}')
        if model_info is None:
            continue

        for sibling in model_info.get('siblings', []):
            rfilename = sibling.get('rfilename')
            if rfilename and 'q5_1' in rfilename:
                model_list.append((model_id, rfilename))

    model_choice = get_user_choice(model_list)
    if model_choice is None:
        return

    model_id, rfilename = model_choice
    url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
    download_file(url, rfilename)

    # download_file prints a message rather than raising when the server
    # rejects the request, so make sure there is a file to inspect before
    # opening it.
    if not os.path.isfile(rfilename):
        print(f"Skipping version check: {rfilename} was not downloaded.")
        return

    _, version = check_magic_and_version(rfilename)
    if version != args.version:
        print(f"Warning: Expected version {args.version}, but found version {version} in the file.")


if __name__ == '__main__':
    main()
|
11
docker/start_server.sh
Executable file
11
docker/start_server.sh
Executable file
|
@ -0,0 +1,11 @@
|
||||||
|
#!/bin/sh

# Lift the locked-memory limit so the model can be locked in RAM. This needs
# the SYS_RESOURCE capability (docker run --cap-add SYS_RESOURCE).
ulimit -l unlimited

# IMAGE must be present in the container's runtime environment for this test
# to work; a Dockerfile ARG alone is not visible here. When it is unset the
# GPU branch below is taken.
#
# exec replaces this shell with the server process so that signals sent to
# the container (e.g. by "docker stop") reach the server directly.
if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
    exec python3 -B -m llama_cpp.server --model /app/model.bin
else
    # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM
    exec python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000
fi
|
|
@ -4,7 +4,17 @@ import uuid
|
||||||
import time
|
import time
|
||||||
import math
|
import math
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple
|
from typing import (
|
||||||
|
List,
|
||||||
|
Optional,
|
||||||
|
Union,
|
||||||
|
Generator,
|
||||||
|
Sequence,
|
||||||
|
Iterator,
|
||||||
|
Deque,
|
||||||
|
Tuple,
|
||||||
|
Callable,
|
||||||
|
)
|
||||||
from collections import deque, OrderedDict
|
from collections import deque, OrderedDict
|
||||||
|
|
||||||
from . import llama_cpp
|
from . import llama_cpp
|
||||||
|
@ -72,6 +82,24 @@ class LlamaState:
|
||||||
self.llama_state_size = llama_state_size
|
self.llama_state_size = llama_state_size
|
||||||
|
|
||||||
|
|
||||||
|
# Signature of a logits processor: (input_ids, scores) -> new scores.
LogitsProcessor = Callable[[List[int], List[float]], List[float]]


class LogitsProcessorList(List[LogitsProcessor]):
    """A list of logits processors that can itself be called like one."""

    def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]:
        """Apply every processor in list order, chaining the scores through.

        Each processor receives ``input_ids`` and the scores returned by the
        previous processor. With an empty list, ``scores`` is returned as is.
        """
        adjusted = scores
        for apply_processor in self:
            adjusted = apply_processor(input_ids, adjusted)
        return adjusted
|
||||||
|
|
||||||
|
|
||||||
|
# Signature of a stopping criterion: (input_ids, logits) -> stop?
StoppingCriteria = Callable[[List[int], List[float]], bool]


class StoppingCriteriaList(List[StoppingCriteria]):
    """A list of stopping criteria that can itself be called like one."""

    def __call__(self, input_ids: List[int], logits: List[float]) -> bool:
        """Return True when at least one criterion asks to stop.

        Every criterion is evaluated (the results are gathered into a list
        first), so none is skipped once another has already returned True.
        """
        verdicts = [criterion(input_ids, logits) for criterion in self]
        return any(verdicts)
|
||||||
|
|
||||||
|
|
||||||
class Llama:
|
class Llama:
|
||||||
"""High-level Python wrapper for a llama.cpp model."""
|
"""High-level Python wrapper for a llama.cpp model."""
|
||||||
|
|
||||||
|
@ -314,6 +342,7 @@ class Llama:
|
||||||
mirostat_tau: llama_cpp.c_float,
|
mirostat_tau: llama_cpp.c_float,
|
||||||
mirostat_eta: llama_cpp.c_float,
|
mirostat_eta: llama_cpp.c_float,
|
||||||
penalize_nl: bool = True,
|
penalize_nl: bool = True,
|
||||||
|
logits_processor: Optional[LogitsProcessorList] = None,
|
||||||
):
|
):
|
||||||
assert self.ctx is not None
|
assert self.ctx is not None
|
||||||
assert len(self.eval_logits) > 0
|
assert len(self.eval_logits) > 0
|
||||||
|
@ -326,6 +355,10 @@ class Llama:
|
||||||
else last_n_tokens_size
|
else last_n_tokens_size
|
||||||
)
|
)
|
||||||
logits = self.eval_logits[-1]
|
logits = self.eval_logits[-1]
|
||||||
|
|
||||||
|
if logits_processor is not None:
|
||||||
|
logits = logits_processor(list(self.eval_tokens), logits)
|
||||||
|
|
||||||
nl_logit = logits[self._token_nl]
|
nl_logit = logits[self._token_nl]
|
||||||
candidates = self._candidates
|
candidates = self._candidates
|
||||||
for i, logit in enumerate(logits):
|
for i, logit in enumerate(logits):
|
||||||
|
@ -434,6 +467,7 @@ class Llama:
|
||||||
mirostat_eta: float = 0.1,
|
mirostat_eta: float = 0.1,
|
||||||
mirostat_tau: float = 5.0,
|
mirostat_tau: float = 5.0,
|
||||||
penalize_nl: bool = True,
|
penalize_nl: bool = True,
|
||||||
|
logits_processor: Optional[LogitsProcessorList] = None,
|
||||||
):
|
):
|
||||||
"""Sample a token from the model.
|
"""Sample a token from the model.
|
||||||
|
|
||||||
|
@ -466,6 +500,7 @@ class Llama:
|
||||||
mirostat_tau=llama_cpp.c_float(mirostat_tau),
|
mirostat_tau=llama_cpp.c_float(mirostat_tau),
|
||||||
mirostat_eta=llama_cpp.c_float(mirostat_eta),
|
mirostat_eta=llama_cpp.c_float(mirostat_eta),
|
||||||
penalize_nl=penalize_nl,
|
penalize_nl=penalize_nl,
|
||||||
|
logits_processor=logits_processor,
|
||||||
)
|
)
|
||||||
|
|
||||||
def generate(
|
def generate(
|
||||||
|
@ -482,6 +517,8 @@ class Llama:
|
||||||
mirostat_mode: int = 0,
|
mirostat_mode: int = 0,
|
||||||
mirostat_tau: float = 5.0,
|
mirostat_tau: float = 5.0,
|
||||||
mirostat_eta: float = 0.1,
|
mirostat_eta: float = 0.1,
|
||||||
|
logits_processor: Optional[LogitsProcessorList] = None,
|
||||||
|
stopping_criteria: Optional[StoppingCriteriaList] = None,
|
||||||
) -> Generator[int, Optional[Sequence[int]], None]:
|
) -> Generator[int, Optional[Sequence[int]], None]:
|
||||||
"""Create a generator of tokens from a prompt.
|
"""Create a generator of tokens from a prompt.
|
||||||
|
|
||||||
|
@ -539,7 +576,12 @@ class Llama:
|
||||||
mirostat_mode=mirostat_mode,
|
mirostat_mode=mirostat_mode,
|
||||||
mirostat_tau=mirostat_tau,
|
mirostat_tau=mirostat_tau,
|
||||||
mirostat_eta=mirostat_eta,
|
mirostat_eta=mirostat_eta,
|
||||||
|
logits_processor=logits_processor,
|
||||||
)
|
)
|
||||||
|
if stopping_criteria is not None and stopping_criteria(
|
||||||
|
list(self.eval_tokens), self.eval_logits[-1]
|
||||||
|
):
|
||||||
|
return
|
||||||
tokens_or_none = yield token
|
tokens_or_none = yield token
|
||||||
tokens = [token]
|
tokens = [token]
|
||||||
if tokens_or_none is not None:
|
if tokens_or_none is not None:
|
||||||
|
@ -637,6 +679,7 @@ class Llama:
|
||||||
model: Optional[str] = None,
|
model: Optional[str] = None,
|
||||||
) -> Union[Iterator[Completion], Iterator[CompletionChunk]]:
|
) -> Union[Iterator[Completion], Iterator[CompletionChunk]]:
|
||||||
assert self.ctx is not None
|
assert self.ctx is not None
|
||||||
|
|
||||||
completion_id: str = f"cmpl-{str(uuid.uuid4())}"
|
completion_id: str = f"cmpl-{str(uuid.uuid4())}"
|
||||||
created: int = int(time.time())
|
created: int = int(time.time())
|
||||||
completion_tokens: List[int] = []
|
completion_tokens: List[int] = []
|
||||||
|
@ -1334,6 +1377,11 @@ class Llama:
|
||||||
assert self.ctx is not None
|
assert self.ctx is not None
|
||||||
return llama_cpp.llama_n_vocab(self.ctx)
|
return llama_cpp.llama_n_vocab(self.ctx)
|
||||||
|
|
||||||
|
def tokenizer(self) -> "LlamaTokenizer":
    """Create a ``LlamaTokenizer`` that wraps this model instance."""
    assert self.ctx is not None
    wrapper = LlamaTokenizer(self)
    return wrapper
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def token_eos() -> int:
|
def token_eos() -> int:
|
||||||
"""Return the end-of-sequence token."""
|
"""Return the end-of-sequence token."""
|
||||||
|
@ -1364,3 +1412,18 @@ class Llama:
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
return longest_prefix
|
return longest_prefix
|
||||||
|
|
||||||
|
|
||||||
|
class LlamaTokenizer:
    """Convert between ``str`` text and token ids using a ``Llama`` instance."""

    def __init__(self, llama: Llama):
        # The wrapped model; encode/decode delegate to its tokenize/detokenize.
        self.llama = llama

    def encode(self, text: str) -> List[int]:
        """Tokenize *text* after encoding it as UTF-8, ignoring encoding errors."""
        utf8_bytes = text.encode("utf-8", errors="ignore")
        return self.llama.tokenize(utf8_bytes)

    def decode(self, tokens: List[int]) -> str:
        """Detokenize *tokens* and decode the bytes as UTF-8, ignoring invalid sequences."""
        raw_bytes = self.llama.detokenize(tokens)
        return raw_bytes.decode("utf-8", errors="ignore")

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        """Alternate constructor: build a ``Llama`` from *path* with ``vocab_only=True`` and wrap it."""
        vocab_model = Llama(model_path=path, vocab_only=True)
        return cls(vocab_model)
|
||||||
|
|
Loading…
Reference in a new issue