Merge branch 'abetlen:main' into main

This commit is contained in:
Maximilian Winter 2023-05-25 17:09:19 +02:00 committed by GitHub
commit c6a9659972
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 224 additions and 0 deletions

51
docker/Dockerfile Normal file
View file

@ -0,0 +1,51 @@
# Base image selector. The default is a CPU-only Debian image (OpenBLAS build);
# pass e.g. --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 for CuBLAS.
ARG IMAGE=python:3-slim-bullseye
FROM ${IMAGE}

# An ARG declared before FROM is only visible to FROM itself; re-declare it
# (without a value) so this stage inherits the value chosen above.
ARG IMAGE
# ARG values do not exist at run time, but start_server.sh branches on $IMAGE,
# so persist the value into the container environment.
ENV IMAGE=${IMAGE}

# Build toolchain and Python. The apt lists are removed in the same layer that
# created them; deleting them in a later RUN would not make the image smaller.
# DEBIAN_FRONTEND is exported only inside this RUN so it never reaches runtime.
RUN export DEBIAN_FRONTEND=noninteractive && \
    apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y --no-install-recommends \
        build-essential \
        ninja-build \
        python3 \
        python3-pip && \
    rm -rf /var/lib/apt/lists/*

# Python build and server dependencies (no pip cache kept in the layer)
RUN python3 -m pip install --no-cache-dir --upgrade \
        pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette

# Build llama-cpp-python against OpenBLAS on the default image, CuBLAS otherwise.
# The OpenBLAS branch refreshes the apt lists itself because the layer above
# has already removed them.
RUN echo "Image: ${IMAGE}" && \
    if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \
        echo "OpenBLAS install:" && \
        apt-get update && \
        apt-get install -y --no-install-recommends libopenblas-dev && \
        rm -rf /var/lib/apt/lists/* && \
        LLAMA_OPENBLAS=1 python3 -m pip install --no-cache-dir llama-cpp-python --verbose; \
    else \
        echo "CuBLAS install:" && \
        LLAMA_CUBLAS=1 python3 -m pip install --no-cache-dir llama-cpp-python --verbose; \
    fi

# Set a working directory for better clarity
WORKDIR /app

# Copy the model first: it is by far the largest file, so keeping it in its own
# earlier layer lets edits to start_server.sh reuse the cached model layer.
RUN echo "Installing model...this can take some time..."
COPY ./model.bin /app/model.bin
COPY ./start_server.sh /app/start_server.sh

# Make the server start script executable
RUN chmod +x /app/start_server.sh

# Set environment variable for the host
ENV HOST=0.0.0.0

# Expose a port for the server (documentation only; publish it with -p at run time)
EXPOSE 8000

# Run the server start script
CMD ["/bin/sh", "/app/start_server.sh"]

46
docker/README.md Normal file
View file

@ -0,0 +1,46 @@
# Dockerfiles for building the llama-cpp-python server
- `Dockerfile.openblas_simple` - a simple Dockerfile for non-GPU OpenBLAS
- `Dockerfile.cuda_simple` - a simple Dockerfile for CUDA accelerated CuBLAS
- `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke](https://huggingface.co/TheBloke)
- `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`
# Get model from Hugging Face
`python3 ./hug_model.py`
You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
```
docker $ ls -lh *.bin
-rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>.q5_1.bin
lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> <downloaded-model-file>.q5_1.bin
```
**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least
**TWICE** as much disk space as the size of the model:
| Model | Quantized size |
|------:|----------------:|
| 7B | 5 GB |
| 13B | 10 GB |
| 30B | 25 GB |
| 65B | 50 GB |
**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
# Install Docker Server
**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
[Install Docker Engine](https://docs.docker.com/engine/install)
# Use OpenBLAS
Use this if you don't have an NVIDIA GPU. Defaults to the `python:3-slim-bullseye` Docker base image and OpenBLAS:
## Build:
`docker build -t openblas .`
## Run:
`docker run --cap-add SYS_RESOURCE -t openblas`
# Use CuBLAS
Requires an NVIDIA GPU with sufficient VRAM (approximately as much as the quantized model sizes listed above) and Docker NVIDIA support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
## Build:
`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .`
## Run:
`docker run --cap-add SYS_RESOURCE -t cublas`

116
docker/hug_model.py Normal file
View file

@ -0,0 +1,116 @@
import requests
import json
import os
import struct
def make_request(url, params=None, timeout=30):
    """Send an HTTP GET to ``url`` and return the decoded JSON body.

    Args:
        url: Address to request.
        params: Optional query-string parameters passed through to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        The parsed JSON payload when the server answers with status 200,
        otherwise ``None`` (non-200 status or a network-level failure).
    """
    print(f"Making request to {url}...")
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a bad
        # status code, so callers only need to handle the ``None`` case.
        print(f"Request failed: {exc}")
        return None
    if response.status_code == 200:
        return json.loads(response.text)
    print(f"Request failed with status code {response.status_code}")
    return None
def check_magic_and_version(filename):
    """Read and report the magic number and version stored at the start of a file.

    The header is decoded as a little-endian unsigned 32-bit integer (magic)
    followed by a little-endian unsigned 16-bit integer (version), six bytes
    in total.

    Args:
        filename: Path of the file to inspect.

    Returns:
        A ``(magic, version)`` tuple of integers.

    Raises:
        struct.error: If the file holds fewer than six bytes.
    """
    header_layout = struct.Struct('<IH')
    with open(filename, 'rb') as handle:
        raw_header = handle.read(header_layout.size)
    magic, version = header_layout.unpack(raw_header)
    print(f"magic: 0x{magic:08x}, version: 0x{version:04x}, file: {filename}")
    return magic, version
def download_file(url, destination):
    """Stream ``url`` to ``destination`` and point the ``model.bin`` symlink at it.

    A dot is printed for every 10 MB received as a progress indicator. On
    success any existing ``model.bin`` in the current directory is replaced by
    a symbolic link to ``destination``.

    Args:
        url: Address of the file to download.
        destination: Local path the downloaded bytes are written to.

    Returns:
        ``True`` when the file was downloaded and linked, ``False`` when the
        server answered with a status other than 200 (nothing is written and
        an existing ``model.bin`` is left untouched in that case).
    """
    print(f"Downloading {url} to {destination}...")
    # The context manager releases the connection even if writing fails; the
    # timeout bounds the connect and each read, not the whole transfer.
    with requests.get(url, stream=True, timeout=30) as response:
        if response.status_code != 200:
            print(f"Download failed with status code {response.status_code}")
            return False
        with open(destination, 'wb') as f:
            total_downloaded = 0
            # 1 MiB chunks: model files are several GB, 1 KiB chunks only add overhead
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    total_downloaded += len(chunk)
                    if total_downloaded >= 10485760:  # 10 MB
                        print('.', end='', flush=True)
                        total_downloaded = 0
    print("\nDownload complete.")
    # lexists() is also true for a dangling symlink (e.g. the previous model was
    # deleted); isfile() follows the link and would miss it, making symlink() fail.
    if os.path.lexists("model.bin"):
        os.remove("model.bin")  # remove the existing link if any
    os.symlink(destination, "model.bin")
    return True
def get_user_choice(model_list):
    """Show a numbered menu of models and return the entry the user picks.

    Args:
        model_list: Sequence of ``(model_id, rfilename)`` pairs.

    Returns:
        The selected ``(model_id, rfilename)`` pair, or ``None`` when the list
        is empty, the input is not a number, or the number is out of range.
    """
    if not model_list:
        # Nothing to pick from: do not prompt for a choice that cannot succeed.
        print("No models available to choose from.")
        return None

    # Print the enumerated list
    print("\n")
    for number, (model_id, rfilename) in enumerate(model_list, start=1):
        print(f"{number}: Model ID: {model_id}, RFilename: {rfilename}")

    # Get user's choice
    choice = input("Choose a model to download by entering the corresponding number: ")
    try:
        index = int(choice) - 1
    except ValueError:
        print("Invalid input. Please enter a number corresponding to a model.")
        return None

    # The explicit range check also rejects 0 and negative numbers, which
    # plain indexing would silently accept.
    if 0 <= index < len(model_list):
        return model_list[index]
    print("Invalid choice.")
    return None
import argparse
def main():
    """Let the user pick a ``q5_1`` model published by TheBloke and download it.

    Lists the author's models tagged ``llama`` through the Hugging Face API,
    collects every file whose name contains ``q5_1``, asks the user to choose
    one, downloads it, and compares the version found in the file header with
    the one given by ``-v/--version`` (default 3).
    """
    # Create an argument parser
    parser = argparse.ArgumentParser(description='Process the model version.')
    parser.add_argument('-v', '--version', type=int, default=0x0003,
                        help='an integer for the version to be used')

    # Parse the arguments
    args = parser.parse_args()

    # Define the parameters
    params = {
        "author": "TheBloke",  # Filter by author
        "tags": "llama"
    }

    models = make_request('https://huggingface.co/api/models', params=params)
    if models is None:
        return

    model_list = []
    # Iterate over the models
    for model in models:
        model_id = model['id']
        model_info = make_request(f'https://huggingface.co/api/models/{model_id}')
        if model_info is None:
            continue

        for sibling in model_info.get('siblings', []):
            rfilename = sibling.get('rfilename')
            if rfilename and 'q5_1' in rfilename:
                model_list.append((model_id, rfilename))

    model_choice = get_user_choice(model_list)
    if model_choice is None:
        return

    model_id, rfilename = model_choice
    url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
    download_file(url, rfilename)

    # A failed download leaves no file behind; inspecting it anyway would end
    # in a FileNotFoundError traceback instead of a readable message.
    if not os.path.isfile(rfilename):
        print(f"Skipping version check: {rfilename} was not downloaded.")
        return

    _, version = check_magic_and_version(rfilename)
    if version != args.version:
        print(f"Warning: Expected version {args.version}, but found different version in the file.")


if __name__ == '__main__':
    main()

11
docker/start_server.sh Executable file
View file

@ -0,0 +1,11 @@
#!/bin/sh
# Container entry point: starts the llama_cpp server on /app/model.bin.

# Lift the locked-memory limit (the original note gives the reason as
# "mmap support"). This fails unless the container has CAP_SYS_RESOURCE,
# i.e. it was started with `docker run --cap-add SYS_RESOURCE`.
ulimit -l unlimited

# `exec` replaces this shell with the server process so that it receives the
# signals sent to the container (e.g. SIGTERM from `docker stop`) directly.
if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
    # Default CPU image: no GPU offloading
    exec python3 -B -m llama_cpp.server --model /app/model.bin
else
    # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM
    exec python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000
fi