diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 0000000..f0ef5f7
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,51 @@
+# Define the image argument and provide a default value
+ARG IMAGE=python:3-slim-bullseye
+
+# Use the image as specified
+FROM ${IMAGE}
+
+# Re-declare the ARG after FROM
+ARG IMAGE
+
+# Update and upgrade the existing packages
+RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
+    python3 \
+    python3-pip \
+    ninja-build \
+    build-essential
+
+RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette
+
+# Perform the conditional installations based on the image
+RUN echo "Image: ${IMAGE}" && \
+    if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \
+    echo "OpenBLAS install:" && \
+    apt-get install -y --no-install-recommends libopenblas-dev && \
+    LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \
+else \
+    echo "CuBLAS install:" && \
+    LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \
+fi
+
+# Clean up apt cache
+RUN rm -rf /var/lib/apt/lists/*
+
+# Set a working directory for better clarity
+WORKDIR /app
+
+# Copy files to the app directory
+RUN echo "Installing model...this can take some time..."
+COPY ./model.bin /app/model.bin
+COPY ./start_server.sh /app/start_server.sh
+
+# Make the server start script executable
+RUN chmod +x /app/start_server.sh
+
+# Set environment variable for the host
+ENV HOST=0.0.0.0
+
+# Expose a port for the server
+EXPOSE 8000
+
+# Run the server start script
+CMD ["/bin/sh", "/app/start_server.sh"]
diff --git a/Dockerfile.cuda b/docker/Dockerfile.cuda_simple
similarity index 100%
rename from Dockerfile.cuda
rename to docker/Dockerfile.cuda_simple
diff --git a/Dockerfile b/docker/Dockerfile.openblas_simple
similarity index 100%
rename from Dockerfile
rename to docker/Dockerfile.openblas_simple
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 0000000..100bcbd
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,46 @@
+# Dockerfiles for building the llama-cpp-python server
+- `Dockerfile.openblas_simple` - a simple Dockerfile for non-GPU OpenBLAS
+- `Dockerfile.cuda_simple` - a simple Dockerfile for CUDA-accelerated CuBLAS
+- `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke](https://huggingface.co/TheBloke)
+- `Dockerfile` - a combined OpenBLAS and CuBLAS Dockerfile that also copies a previously downloaded model, `model.bin`, into the image
+
+# Get model from Hugging Face
+`python3 ./hug_model.py`
+
+You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step, e.g.
+```
+docker $ ls -lh *.bin
+-rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model>.q5_1.bin
+lrwxrwxrwx 1 user user   24 May 23 18:30 model.bin -> <downloaded-model>.q5_1.bin
+```
+**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image, you will need at least
+**TWICE** as much disk space as the size of the model:
+
+| Model | Quantized size |
+|------:|----------------:|
+| 7B | 5 GB |
+| 13B | 10 GB |
+| 30B | 25 GB |
+| 65B | 50 GB |
+
+**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
+
+# Install Docker Server
+
+**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
+
+[Install Docker Engine](https://docs.docker.com/engine/install)
+
+# Use OpenBLAS
+Use this if you don't have an NVIDIA GPU. Defaults to the `python:3-slim-bullseye` Docker base image and OpenBLAS:
+## Build:
+`docker build -t openblas .`
+## Run:
+`docker run --cap-add SYS_RESOURCE -t openblas`
+
+# Use CuBLAS
+Requires an NVIDIA GPU with sufficient VRAM (approximately as much as the quantized size in the table above) and Docker NVIDIA support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
+## Build:
+`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .`
+## Run:
+`docker run --cap-add SYS_RESOURCE -t cublas`
diff --git a/docker/hug_model.py b/docker/hug_model.py
new file mode 100644
index 0000000..848a1aa
--- /dev/null
+++ b/docker/hug_model.py
@@ -0,0 +1,116 @@
+import requests
+import json
+import os
+import struct
+
+def make_request(url, params=None):
+    print(f"Making request to {url}...")
+    response = requests.get(url, params=params)
+    if response.status_code == 200:
+        return json.loads(response.text)
+    else:
+        print(f"Request failed with status code {response.status_code}")
+        return None
+
+def check_magic_and_version(filename):
+    with open(filename, 'rb') as f:
+        # Read the first 6 bytes from the file
+        data = f.read(6)
+
+        # Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int
+        # and the next 2 bytes as a little-endian unsigned short
+        magic, version = struct.unpack('<I H', data)
+
+        print(f"magic: 0x{magic:08x}, version: 0x{version:04x}, file: {filename}")
+
+        return magic, version
+
+def download_file(url, destination):
+    print(f"Downloading {url} to {destination}...")
+    response = requests.get(url, stream=True)
+    if response.status_code == 200:
+        with open(destination, 'wb') as f:
+            total_downloaded = 0
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:  # filter out keep-alive new chunks
+                    f.write(chunk)
+                    total_downloaded += len(chunk)
+                    if total_downloaded >= 10485760: # 10 MB
+                        print('.', end='', flush=True)
+                        total_downloaded = 0
+        print("\nDownload complete.")
+
+        # Creating a symbolic link from destination to "model.bin"
+        if os.path.isfile("model.bin"):
+            os.remove("model.bin")  # remove the existing link if any
+        os.symlink(destination, "model.bin")
+    else:
+        print(f"Download failed with status code {response.status_code}")
+
+def get_user_choice(model_list):
+    # Print the enumerated list
+    print("\n")
+    for i, (model_id, rfilename) in enumerate(model_list):
+        print(f"{i+1}: Model ID: {model_id}, RFilename: {rfilename}")
+
+    # Get user's choice
+    choice = input("Choose a model to download by entering the corresponding number: ")
+    try:
+        index = int(choice) - 1
+        if 0 <= index < len(model_list):
+            # Return the chosen model
+            return model_list[index]
+        else:
+            print("Invalid choice.")
+    except ValueError:
+        print("Invalid input. Please enter a number corresponding to a model.")
+    except IndexError:
+        print("Invalid choice. Index out of range.")
+
+    return None
+
+import argparse
+
+def main():
+    # Create an argument parser
+    parser = argparse.ArgumentParser(description='Process the model version.')
+    parser.add_argument('-v', '--version', type=int, default=0x0003,
+                        help='an integer for the version to be used')
+
+    # Parse the arguments
+    args = parser.parse_args()
+
+    # Define the parameters
+    params = {
+        "author": "TheBloke", # Filter by author
+        "tags": "llama"
+    }
+
+    models = make_request('https://huggingface.co/api/models', params=params)
+    if models is None:
+        return
+
+    model_list = []
+    # Iterate over the models
+    for model in models:
+        model_id = model['id']
+        model_info = make_request(f'https://huggingface.co/api/models/{model_id}')
+        if model_info is None:
+            continue
+
+        for sibling in model_info.get('siblings', []):
+            rfilename = sibling.get('rfilename')
+            if rfilename and 'q5_1' in rfilename:
+                model_list.append((model_id, rfilename))
+
+    model_choice = get_user_choice(model_list)
+    if model_choice is not None:
+        model_id, rfilename = model_choice
+        url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
+        download_file(url, rfilename)
+        _, version = check_magic_and_version(rfilename)
+        if version != args.version:
+            print(f"Warning: Expected version {args.version}, but found different version in the file.")
+
+if __name__ == '__main__':
+    main()
diff --git a/docker/start_server.sh b/docker/start_server.sh
new file mode 100755
index 0000000..176bd87
--- /dev/null
+++ b/docker/start_server.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+# For mmap support
+ulimit -l unlimited
+
+if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
+    python3 -B -m llama_cpp.server --model /app/model.bin
+else
+    # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM
+    python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000
+fi
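
Once one of the images above is running, `llama_cpp.server` listens on the port exposed in the `Dockerfile` (8000). The snippet below is a minimal smoke test sketched against that setup; it assumes the container's port has been published to the host (for example by adding `-p 8000:8000` to the `docker run` commands in the README) and that the server's OpenAI-style `/v1/completions` route is reachable on `localhost:8000`.

```python
# Minimal smoke test for the containerized server (illustrative sketch).
# Assumes the container was started with its port published, e.g.:
#   docker run --cap-add SYS_RESOURCE -p 8000:8000 -t openblas
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "prompt": "Q: Name the planets in the solar system. A: ",
        "max_tokens": 64,
        "temperature": 0.7,
    },
    timeout=300,  # the first request can be slow while the model loads
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```

If the request succeeds, the FastAPI-based server should also expose interactive API documentation at `http://localhost:8000/docs`, which is a convenient way to explore the remaining endpoints.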