diff --git a/.gitignore b/.gitignore
index fd64c09..79093b4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -164,3 +164,6 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+
+# downloaded model .bin files
+docker/open_llama/*.bin
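A quick sanity check for the new ignore rule (a sketch; the `.bin` file name is a hypothetical example of what `hug_model.py` now produces):
```
# Create a dummy model file, then ask git which pattern ignores it;
# `git check-ignore -v` prints the matching .gitignore line on success
touch docker/open_llama/SlyEcho_open_llama_3b_q5_1.bin
git check-ignore -v docker/open_llama/SlyEcho_open_llama_3b_q5_1.bin
```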
diff --git a/docker/README.md b/docker/README.md
index 100bcbd..053d311 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -1,46 +1,66 @@
-# Dockerfiles for building the llama-cpp-python server
-- `Dockerfile.openblas_simple` - a simple Dockerfile for non-GPU OpenBLAS
-- `Dockerfile.cuda_simple` - a simple Dockerfile for CUDA accelerated CuBLAS
-- `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke]( https://huggingface.co/TheBloke)
-- `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`
-
-# Get model from Hugging Face
-`python3 ./hug_model.py`
+# Install Docker Server
+**Note #1:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
+
+[Install Docker Engine](https://docs.docker.com/engine/install)
+
+**Note #2:** NVidia GPU CuBLAS support requires an NVidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
+
+# Simple Dockerfiles for building the llama-cpp-python server with external model bin files
+## openblas_simple - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image
+```
+cd ./openblas_simple
+docker build -t openblas_simple .
+docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t openblas_simple
+```
+where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
+
+## cuda_simple - a simple Dockerfile for CUDA-accelerated CuBLAS, where the model is located outside the Docker image
+```
+cd ./cuda_simple
+docker build -t cuda_simple .
+docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t cuda_simple
+```
+where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
+
+# "Open-Llama-in-a-box"
+## Download an Apache V2.0 licensed 3B parameter Open Llama model and install it into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server
+```
+cd ./open_llama
+./build.sh
+./start.sh
+```
+
+# Manually choose your own Llama model from Hugging Face
+`python3 ./hug_model.py -a TheBloke -t llama`
 You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
 ```
 docker $ ls -lh *.bin
--rw-rw-r-- 1 user user 4.8G May 23 18:30 <model>.q5_1.bin
-lrwxrwxrwx 1 user user   24 May 23 18:30 model.bin -> <model>.q5_1.bin
+-rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>q5_1.bin
+lrwxrwxrwx 1 user user   24 May 23 18:30 model.bin -> <downloaded-model-file>q5_1.bin
 ```
 
 **Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least **TWICE** as much disk space as the size of the model:
 
 | Model |  Quantized size |
 |------:|----------------:|
+|    3B |            3 GB |
 |    7B |            5 GB |
 |   13B |           10 GB |
-|   30B |           25 GB |
+|   33B |           25 GB |
 |   65B |           50 GB |
 
 **Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
 
-# Install Docker Server
-
-**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
-
-[Install Docker Engine](https://docs.docker.com/engine/install)
-
-# Use OpenBLAS
+## Use OpenBLAS
 Use if you don't have a NVidia GPU. Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS:
-## Build:
-`docker build --build-arg -t openblas .`
-## Run:
+### Build:
+`docker build -t openblas .`
+### Run:
 `docker run --cap-add SYS_RESOURCE -t openblas`
 
-# Use CuBLAS
-Requires a NVidia GPU with sufficient VRAM (approximately as much as the size above) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
-## Build:
+## Use CuBLAS
+### Build:
 `docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .`
-## Run:
+### Run:
 `docker run --cap-add SYS_RESOURCE -t cublas`
diff --git a/docker/Dockerfile.cuda_simple b/docker/cuda_simple/Dockerfile
similarity index 82%
rename from docker/Dockerfile.cuda_simple
rename to docker/cuda_simple/Dockerfile
index dda7a9f..24906d5 100644
--- a/docker/Dockerfile.cuda_simple
+++ b/docker/cuda_simple/Dockerfile
@@ -1,5 +1,5 @@
 ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
-FROM ${CUDA_IMAGE}
+FROM nvidia/cuda:${CUDA_IMAGE}
 
 # We need to set the host to 0.0.0.0 to allow outside access
 ENV HOST 0.0.0.0
@@ -10,7 +10,7 @@ COPY . .
 RUN apt update && apt install -y python3 python3-pip
 RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette
 
-RUN LLAMA_CUBLAS=1 python3 setup.py develop
+RUN LLAMA_CUBLAS=1 pip install llama-cpp-python
 
 # Run the server
 CMD python3 -m llama_cpp.server
diff --git a/docker/Dockerfile b/docker/open_llama/Dockerfile
similarity index 100%
rename from docker/Dockerfile
rename to docker/open_llama/Dockerfile
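Note that the cuda_simple image typically also needs GPU access at run time, which the minimal commands above don't request. A hypothetical end-to-end invocation, assuming the NVIDIA Container Toolkit is installed; the host path `/opt/models`, the model file name, and the published port are illustrative placeholders:
```
cd ./cuda_simple
docker build -t cuda_simple .
# --gpus=all exposes the host GPU(s) inside the container;
# -p publishes the server port so it is reachable from the host
docker run --gpus=all --cap-add SYS_RESOURCE -p 8000:8000 \
    -e USE_MLOCK=0 -e MODEL=/var/model/open-llama-3b-q5_1.bin \
    -v /opt/models:/var/model -t cuda_simple
```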
diff --git a/docker/open_llama/build.sh b/docker/open_llama/build.sh
new file mode 100755
index 0000000..3a6457d
--- /dev/null
+++ b/docker/open_llama/build.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+MODEL="open_llama_3b"
+# Get open_llama_3b_ggml q5_1 quantization
+python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1"
+ls -lh *.bin
+
+# Build the default OpenBLAS image
+docker build -t $MODEL .
+docker images | egrep "^(REPOSITORY|$MODEL)"
+
+echo
+echo "To start the docker container run:"
+echo "docker run -t -p 8000:8000 $MODEL"
diff --git a/docker/hug_model.py b/docker/open_llama/hug_model.py
similarity index 71%
rename from docker/hug_model.py
rename to docker/open_llama/hug_model.py
index 848a1aa..13c5b6b 100644
--- a/docker/hug_model.py
+++ b/docker/open_llama/hug_model.py
@@ -2,6 +2,7 @@ import requests
 import json
 import os
 import struct
+import argparse
 
 def make_request(url, params=None):
     print(f"Making request to {url}...")
@@ -69,21 +70,30 @@ def get_user_choice(model_list):
 
     return None
 
-import argparse
-
 def main():
     # Create an argument parser
-    parser = argparse.ArgumentParser(description='Process the model version.')
+    parser = argparse.ArgumentParser(description='Process some parameters.')
+
+    # Arguments
     parser.add_argument('-v', '--version', type=int, default=0x0003,
-                        help='an integer for the version to be used')
+                        help='hexadecimal version number of ggml file')
+    parser.add_argument('-a', '--author', type=str, default='TheBloke',
+                        help='HuggingFace author filter')
+    parser.add_argument('-t', '--tag', type=str, default='llama',
+                        help='HuggingFace tag filter')
+    parser.add_argument('-s', '--search', type=str, default='',
+                        help='HuggingFace search filter')
+    parser.add_argument('-f', '--filename', type=str, default='q5_1',
+                        help='HuggingFace model repository filename substring match')
 
     # Parse the arguments
     args = parser.parse_args()
 
     # Define the parameters
     params = {
-        "author": "TheBloke", # Filter by author
-        "tags": "llama"
+        "author": args.author,
+        "tags": args.tag,
+        "search": args.search
     }
 
     models = make_request('https://huggingface.co/api/models', params=params)
@@ -100,17 +110,30 @@ def main():
 
         for sibling in model_info.get('siblings', []):
             rfilename = sibling.get('rfilename')
-            if rfilename and 'q5_1' in rfilename:
+            if rfilename and args.filename in rfilename:
                 model_list.append((model_id, rfilename))
 
-    model_choice = get_user_choice(model_list)
+    # Choose the model
+    model_list.sort(key=lambda x: x[0])
+    if len(model_list) == 0:
+        print("No models found")
+        exit(1)
+    elif len(model_list) == 1:
+        model_choice = model_list[0]
+    else:
+        model_choice = get_user_choice(model_list)
+
     if model_choice is not None:
         model_id, rfilename = model_choice
         url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
-        download_file(url, rfilename)
-        _, version = check_magic_and_version(rfilename)
+        dest = f"{model_id.replace('/', '_')}_{rfilename}"
+        download_file(url, dest)
+        _, version = check_magic_and_version(dest)
         if version != args.version:
-            print(f"Warning: Expected version {args.version}, but found different version in the file.")
+            print(f"Warning: Expected version {args.version}, but found different version in the file.")
+    else:
+        print("Error - model choice was None")
+        exit(2)
 
 if __name__ == '__main__':
     main()
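The new flags make `hug_model.py` scriptable as well as interactive. Two example invocations using only the flags defined above (the first mirrors `build.sh`; the second reproduces the previous interactive default behaviour):
```
# Non-interactive: author/search/filename filters that match exactly one
# repository file download it without prompting
python3 ./hug_model.py -a SlyEcho -s open_llama_3b -f q5_1

# Interactive: browse TheBloke's llama-tagged q5_1 models and pick one
python3 ./hug_model.py -a TheBloke -t llama -f q5_1
```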
diff --git a/docker/open_llama/start.sh b/docker/open_llama/start.sh
new file mode 100755
index 0000000..7ee8f74
--- /dev/null
+++ b/docker/open_llama/start.sh
@@ -0,0 +1,28 @@
+#!/bin/sh
+
+MODEL="open_llama_3b"
+
+# Start Docker container
+docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL &
+sleep 10
+echo
+docker ps | egrep "(^CONTAINER|$MODEL)"
+
+# Test the model works
+echo
+curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
+    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
+    "stop": [
+        "\n",
+        "###"
+    ]
+}' | grep Paris
+if [ $? -eq 0 ]
+then
+    echo
+    echo "$MODEL is working!!"
+else
+    echo
+    echo "ERROR: $MODEL not replying."
+    exit 1
+fi
diff --git a/docker/start_server.sh b/docker/open_llama/start_server.sh
similarity index 94%
rename from docker/start_server.sh
rename to docker/open_llama/start_server.sh
index 176bd87..d3329ee 100755
--- a/docker/start_server.sh
+++ b/docker/open_llama/start_server.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 
-# For mmap support
+# For mlock support
 ulimit -l unlimited
 
 if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
diff --git a/docker/Dockerfile.openblas_simple b/docker/openblas_simple/Dockerfile
similarity index 86%
rename from docker/Dockerfile.openblas_simple
rename to docker/openblas_simple/Dockerfile
index f58506f..1a95cae 100644
--- a/docker/Dockerfile.openblas_simple
+++ b/docker/openblas_simple/Dockerfile
@@ -9,7 +9,7 @@ COPY . .
 RUN apt update && apt install -y libopenblas-dev ninja-build build-essential
 RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette
 
-RUN LLAMA_OPENBLAS=1 python3 setup.py develop
+RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose
 
 # Run the server
 CMD python3 -m llama_cpp.server
diff --git a/pyproject.toml b/pyproject.toml
index 45e1b8a..d68dc53 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp_python"
-version = "0.1.56"
+version = "0.1.57"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen <abetlen@gmail.com>"]
 license = "MIT"
diff --git a/setup.py b/setup.py
index ac52c78..ecbc70b 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
     description="A Python wrapper for llama.cpp",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="0.1.56",
+    version="0.1.57",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 66874d4..ffb06a3 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 66874d4fbcc7866377246efbcee938e8cc9c7d76
+Subproject commit ffb06a345e3a9e30d39aaa5b46a23201a74be6de
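A quick way to confirm the version bump and submodule update once this change lands (a sketch; assumes the 0.1.57 wheel has been published to PyPI):
```
# The pinned llama.cpp commit should now start with ffb06a3
git submodule status vendor/llama.cpp

# Install the bumped release and confirm the reported version
pip install llama-cpp-python==0.1.57
pip show llama-cpp-python | grep -i '^version'
```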