From 483b6ba53af349050458d3223e41aa71829f1391 Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Wed, 31 May 2023 15:16:32 +0000 Subject: [PATCH 1/8] Updated README.md instructions on how to use *_simple/Dockerfiles --- docker/README.md | 21 ++++++++++++++----- docker/{ => auto_docker}/Dockerfile | 0 docker/{ => auto_docker}/hug_model.py | 0 docker/{ => auto_docker}/start_server.sh | 0 .../Dockerfile} | 4 ++-- .../Dockerfile} | 2 +- 6 files changed, 19 insertions(+), 8 deletions(-) rename docker/{ => auto_docker}/Dockerfile (100%) rename docker/{ => auto_docker}/hug_model.py (100%) rename docker/{ => auto_docker}/start_server.sh (100%) rename docker/{Dockerfile.cuda_simple => cuda_simple/Dockerfile} (82%) rename docker/{Dockerfile.openblas_simple => openblas_simple/Dockerfile} (86%) diff --git a/docker/README.md b/docker/README.md index 100bcbd..130d180 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,10 +1,21 @@ -# Dockerfiles for building the llama-cpp-python server -- `Dockerfile.openblas_simple` - a simple Dockerfile for non-GPU OpenBLAS -- `Dockerfile.cuda_simple` - a simple Dockerfile for CUDA accelerated CuBLAS -- `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke]( https://huggingface.co/TheBloke) +# Simple Dockerfiles for building the llama-cpp-python server with external model bin files +- `./openblas_simple/Dockerfile` - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image + - `cd ./openblas_simple` + - `docker build -t openblas_simple .` + - `docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t openblas_simple` + where `/` is the full path to the model file on the Docker host system. 
+- `./cuda_simple/Dockerfile` - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image + - `cd ./cuda_simple` + - `docker build -t cuda_simple .` + - `docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t cuda_simple` + where `/` is the full path to the model file on the Docker host system. + +# "Bot-in-a-box" - a method to build a Docker image by choosing a model to be downloaded and loading into a Docker image + - `cd ./auto_docker`: + - `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke]( https://huggingface.co/TheBloke) - `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin` -# Get model from Hugging Face +## Get model from Hugging Face `python3 ./hug_model.py` You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. 
diff --git a/docker/Dockerfile b/docker/auto_docker/Dockerfile similarity index 100% rename from docker/Dockerfile rename to docker/auto_docker/Dockerfile diff --git a/docker/hug_model.py b/docker/auto_docker/hug_model.py similarity index 100% rename from docker/hug_model.py rename to docker/auto_docker/hug_model.py diff --git a/docker/start_server.sh b/docker/auto_docker/start_server.sh similarity index 100% rename from docker/start_server.sh rename to docker/auto_docker/start_server.sh diff --git a/docker/Dockerfile.cuda_simple b/docker/cuda_simple/Dockerfile similarity index 82% rename from docker/Dockerfile.cuda_simple rename to docker/cuda_simple/Dockerfile index dda7a9f..24906d5 100644 --- a/docker/Dockerfile.cuda_simple +++ b/docker/cuda_simple/Dockerfile @@ -1,5 +1,5 @@ ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04" -FROM ${CUDA_IMAGE} +FROM nvidia/cuda:${CUDA_IMAGE} # We need to set the host to 0.0.0.0 to allow outside access ENV HOST 0.0.0.0 @@ -10,7 +10,7 @@ COPY . . RUN apt update && apt install -y python3 python3-pip RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette -RUN LLAMA_CUBLAS=1 python3 setup.py develop +RUN LLAMA_CUBLAS=1 pip install llama-cpp-python # Run the server CMD python3 -m llama_cpp.server diff --git a/docker/Dockerfile.openblas_simple b/docker/openblas_simple/Dockerfile similarity index 86% rename from docker/Dockerfile.openblas_simple rename to docker/openblas_simple/Dockerfile index f58506f..1a95cae 100644 --- a/docker/Dockerfile.openblas_simple +++ b/docker/openblas_simple/Dockerfile @@ -9,7 +9,7 @@ COPY . . 
RUN apt update && apt install -y libopenblas-dev ninja-build build-essential RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette -RUN LLAMA_OPENBLAS=1 python3 setup.py develop +RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose # Run the server CMD python3 -m llama_cpp.server From 217d78320fb6096e0696182816df0bf3ae5b961a Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Wed, 31 May 2023 16:00:31 +0000 Subject: [PATCH 2/8] Added paramterised search and d/l for Hugging Face. Updated README.md --- .gitignore | 3 +++ docker/README.md | 41 +++++++++++++++++---------------- docker/auto_docker/hug_model.py | 30 ++++++++++++++++++------ 3 files changed, 47 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index fd64c09..8db9bcb 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,6 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ + +# model .bin files +docker/auto_docker/*.bin diff --git a/docker/README.md b/docker/README.md index 130d180..e61095f 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,3 +1,11 @@ +# Install Docker Server + +**Note #1:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR! 
+ +[Install Docker Engine](https://docs.docker.com/engine/install) + +**Note #2:** NVidia GPU CuBLAS support requires a NVidia GPU with sufficient VRAM (approximately as much as the size above) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) + # Simple Dockerfiles for building the llama-cpp-python server with external model bin files - `./openblas_simple/Dockerfile` - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image - `cd ./openblas_simple` @@ -15,14 +23,14 @@ - `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke]( https://huggingface.co/TheBloke) - `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin` -## Get model from Hugging Face -`python3 ./hug_model.py` - -You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. +## Download a Llama Model from Hugging Face +- To download a MIT licensed Llama model run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml` +- To select and install a restricted license Llama model run: `python3 ./hug_model.py -a TheBloke -t llama` +- You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. ``` docker $ ls -lh *.bin --rw-rw-r-- 1 user user 4.8G May 23 18:30 .q5_1.bin -lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> .q5_1.bin +-rw-rw-r-- 1 user user 4.8G May 23 18:30 q5_1.bin +lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> q5_1.bin ``` **Note #1:** Make sure you have enough disk space to download the model. 
As the model is then copied into the image you will need at least **TWICE** as much disk space as the size of the model: @@ -36,22 +44,15 @@ lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> .q5 **Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` -# Install Docker Server - -**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR! - -[Install Docker Engine](https://docs.docker.com/engine/install) - -# Use OpenBLAS +## Use OpenBLAS Use if you don't have a NVidia GPU. Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS: -## Build: -`docker build --build-arg -t openblas .` -## Run: +### Build: +`docker build -t openblas .` +### Run: `docker run --cap-add SYS_RESOURCE -t openblas` -# Use CuBLAS -Requires a NVidia GPU with sufficient VRAM (approximately as much as the size above) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) -## Build: +## Use CuBLAS +### Build: `docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .` -## Run: +### Run: `docker run --cap-add SYS_RESOURCE -t cublas` diff --git a/docker/auto_docker/hug_model.py b/docker/auto_docker/hug_model.py index 848a1aa..86a8214 100644 --- a/docker/auto_docker/hug_model.py +++ b/docker/auto_docker/hug_model.py @@ -2,6 +2,7 @@ import requests import json import os import struct +import argparse def make_request(url, params=None): print(f"Making request to {url}...") @@ -69,21 +70,28 @@ def get_user_choice(model_list): return None -import argparse - def main(): # Create an argument parser - parser = argparse.ArgumentParser(description='Process the model version.') + parser = argparse.ArgumentParser(description='Process some parameters.') + + # Arguments parser.add_argument('-v', '--version', type=int, 
default=0x0003, help='an integer for the version to be used') + parser.add_argument('-a', '--author', type=str, default='TheBloke', + help='an author to be filtered') + parser.add_argument('-t', '--tags', type=str, default='llama', + help='tags for the content') + parser.add_argument('-s', '--search', type=str, default='', + help='search term') # Parse the arguments args = parser.parse_args() # Define the parameters params = { - "author": "TheBloke", # Filter by author - "tags": "llama" + "author": args.author, + "tags": args.tags, + "search": args.search } models = make_request('https://huggingface.co/api/models', params=params) @@ -103,14 +111,22 @@ def main(): if rfilename and 'q5_1' in rfilename: model_list.append((model_id, rfilename)) - model_choice = get_user_choice(model_list) + # Choose the model + if len(model_list) == 1: + model_choice = model_list[0] + else: + model_choice = get_user_choice(model_list) + if model_choice is not None: model_id, rfilename = model_choice url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}" download_file(url, rfilename) _, version = check_magic_and_version(rfilename) if version != args.version: - print(f"Warning: Expected version {args.version}, but found different version in the file.") + print(f"Warning: Expected version {args.version}, but found different version in the file.") + else: + print("Error - model choice was None") + exit(1) if __name__ == '__main__': main() From 5377f9784aec86970cacaf0f61689a4f41badde9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 31 May 2023 23:24:52 -0400 Subject: [PATCH 3/8] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 66874d4..ffb06a3 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 66874d4fbcc7866377246efbcee938e8cc9c7d76 +Subproject commit ffb06a345e3a9e30d39aaa5b46a23201a74be6de From 71f4582d4469ba74529386abb66a835e3ad1c374 Mon 
Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 31 May 2023 23:25:39 -0400 Subject: [PATCH 4/8] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9f83e19..0a0e569 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.56" +version = "0.1.57" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 39e1416..04d0554 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.56", + version="0.1.57", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From cf4931a4006a1d701f2c4ea5b2ce3cb02350d57d Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Fri, 2 Jun 2023 08:48:54 +0000 Subject: [PATCH 5/8] Working Open Llama 3B in a box --- docker/README.md | 5 ++-- docker/{auto_docker => open_llama}/Dockerfile | 0 docker/open_llama/build.sh | 14 +++++++++ .../{auto_docker => open_llama}/hug_model.py | 29 ++++++++++++------- docker/open_llama/start.sh | 28 ++++++++++++++++++ .../start_server.sh | 2 +- 6 files changed, 64 insertions(+), 14 deletions(-) rename docker/{auto_docker => open_llama}/Dockerfile (100%) create mode 100755 docker/open_llama/build.sh rename docker/{auto_docker => open_llama}/hug_model.py (83%) create mode 100755 docker/open_llama/start.sh rename docker/{auto_docker => open_llama}/start_server.sh (94%) diff --git a/docker/README.md b/docker/README.md index e61095f..2fb7ef8 100644 --- a/docker/README.md +++ b/docker/README.md @@ -24,7 +24,7 @@ - `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin` ## Download a Llama Model from Hugging Face -- To download a MIT 
licensed Llama model run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml` +- To download a MIT licensed Llama model you can run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml -f ggml-model-q5_1.bin` - To select and install a restricted license Llama model run: `python3 ./hug_model.py -a TheBloke -t llama` - You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. ``` @@ -37,9 +37,10 @@ lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> q5_ | Model | Quantized size | |------:|----------------:| +| 3B | 3 GB | | 7B | 5 GB | | 13B | 10 GB | -| 30B | 25 GB | +| 33B | 25 GB | | 65B | 50 GB | **Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` diff --git a/docker/auto_docker/Dockerfile b/docker/open_llama/Dockerfile similarity index 100% rename from docker/auto_docker/Dockerfile rename to docker/open_llama/Dockerfile diff --git a/docker/open_llama/build.sh b/docker/open_llama/build.sh new file mode 100755 index 0000000..3a6457d --- /dev/null +++ b/docker/open_llama/build.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +MODEL="open_llama_3b" +# Get open_llama_3b_ggml q5_1 quantization +python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1" +ls -lh *.bin + +# Build the default OpenBLAS image +docker build -t $MODEL . 
+docker images | egrep "^(REPOSITORY|$MODEL)" + +echo +echo "To start the docker container run:" +echo "docker run -t -p 8000:8000 $MODEL" diff --git a/docker/auto_docker/hug_model.py b/docker/open_llama/hug_model.py similarity index 83% rename from docker/auto_docker/hug_model.py rename to docker/open_llama/hug_model.py index 86a8214..13c5b6b 100644 --- a/docker/auto_docker/hug_model.py +++ b/docker/open_llama/hug_model.py @@ -76,13 +76,15 @@ def main(): # Arguments parser.add_argument('-v', '--version', type=int, default=0x0003, - help='an integer for the version to be used') + help='hexadecimal version number of ggml file') parser.add_argument('-a', '--author', type=str, default='TheBloke', - help='an author to be filtered') - parser.add_argument('-t', '--tags', type=str, default='llama', - help='tags for the content') + help='HuggingFace author filter') + parser.add_argument('-t', '--tag', type=str, default='llama', + help='HuggingFace tag filter') parser.add_argument('-s', '--search', type=str, default='', - help='search term') + help='HuggingFace search filter') + parser.add_argument('-f', '--filename', type=str, default='q5_1', + help='HuggingFace model repository filename substring match') # Parse the arguments args = parser.parse_args() @@ -90,7 +92,7 @@ def main(): # Define the parameters params = { "author": args.author, - "tags": args.tags, + "tags": args.tag, "search": args.search } @@ -108,11 +110,15 @@ def main(): for sibling in model_info.get('siblings', []): rfilename = sibling.get('rfilename') - if rfilename and 'q5_1' in rfilename: + if rfilename and args.filename in rfilename: model_list.append((model_id, rfilename)) # Choose the model - if len(model_list) == 1: + model_list.sort(key=lambda x: x[0]) + if len(model_list) == 0: + print("No models found") + exit(1) + elif len(model_list) == 1: model_choice = model_list[0] else: model_choice = get_user_choice(model_list) @@ -120,13 +126,14 @@ def main(): if model_choice is not None: model_id, 
rfilename = model_choice url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}" - download_file(url, rfilename) - _, version = check_magic_and_version(rfilename) + dest = f"{model_id.replace('/', '_')}_{rfilename}" + download_file(url, dest) + _, version = check_magic_and_version(dest) if version != args.version: print(f"Warning: Expected version {args.version}, but found different version in the file.") else: print("Error - model choice was None") - exit(1) + exit(2) if __name__ == '__main__': main() diff --git a/docker/open_llama/start.sh b/docker/open_llama/start.sh new file mode 100755 index 0000000..7ee8f74 --- /dev/null +++ b/docker/open_llama/start.sh @@ -0,0 +1,28 @@ +#!/bin/sh + +MODEL="open_llama_3b" + +# Start Docker container +docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL & +sleep 10 +echo +docker ps | egrep "(^CONTAINER|$MODEL)" + +# Test the model works +echo +curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{ + "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", + "stop": [ + "\n", + "###" + ] +}' | grep Paris +if [ $? -eq 0 ] +then + echo + echo "$MODEL is working!!" +else + echo + echo "ERROR: $MODEL not replying." 
+ exit 1 +fi diff --git a/docker/auto_docker/start_server.sh b/docker/open_llama/start_server.sh similarity index 94% rename from docker/auto_docker/start_server.sh rename to docker/open_llama/start_server.sh index 176bd87..d3329ee 100755 --- a/docker/auto_docker/start_server.sh +++ b/docker/open_llama/start_server.sh @@ -1,6 +1,6 @@ #!/bin/sh -# For mmap support +# For mlock support ulimit -l unlimited if [ "$IMAGE" = "python:3-slim-bullseye" ]; then From f24e7a7e5229448ba64ab819287d07887567840d Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Fri, 2 Jun 2023 10:44:52 +0000 Subject: [PATCH 6/8] Updated instructions --- docker/README.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docker/README.md b/docker/README.md index 2fb7ef8..f4954d1 100644 --- a/docker/README.md +++ b/docker/README.md @@ -18,14 +18,15 @@ - `docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t cuda_simple` where `/` is the full path to the model file on the Docker host system. 
-# "Bot-in-a-box" - a method to build a Docker image by choosing a model to be downloaded and loading into a Docker image - - `cd ./auto_docker`: - - `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke]( https://huggingface.co/TheBloke) -- `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin` - -## Download a Llama Model from Hugging Face -- To download a MIT licensed Llama model you can run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml -f ggml-model-q5_1.bin` -- To select and install a restricted license Llama model run: `python3 ./hug_model.py -a TheBloke -t llama` +# "Open-Llama-in-a-box" - Download a MIT licensed Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server +``` +$ cd ./open_llama +./build.sh +./start.sh +``` + +# Manually choose your own Llama model from Hugging Face +- `python3 ./hug_model.py -a TheBloke -t llama` - You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. ``` docker $ ls -lh *.bin From d4eef735d9d70cf1d8a9e098914b16ccf70f06fe Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Fri, 2 Jun 2023 11:03:19 +0000 Subject: [PATCH 7/8] Fixed .gitignore to ignore any downloaded model .bin files. Cleaned up README.md again --- .gitignore | 4 ++-- docker/README.md | 25 +++++++++++++++---------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 8db9bcb..79093b4 100644 --- a/.gitignore +++ b/.gitignore @@ -165,5 +165,5 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
.idea/ -# model .bin files -docker/auto_docker/*.bin +# downloaded model .bin files +docker/open_llama/*.bin diff --git a/docker/README.md b/docker/README.md index f4954d1..c7e92d0 100644 --- a/docker/README.md +++ b/docker/README.md @@ -7,16 +7,21 @@ **Note #2:** NVidia GPU CuBLAS support requires a NVidia GPU with sufficient VRAM (approximately as much as the size above) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) # Simple Dockerfiles for building the llama-cpp-python server with external model bin files -- `./openblas_simple/Dockerfile` - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image - - `cd ./openblas_simple` - - `docker build -t openblas_simple .` - - `docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t openblas_simple` - where `/` is the full path to the model file on the Docker host system. -- `./cuda_simple/Dockerfile` - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image - - `cd ./cuda_simple` - - `docker build -t cuda_simple .` - - `docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t cuda_simple` - where `/` is the full path to the model file on the Docker host system. +## openblas_simple - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image +``` +cd ./openblas_simple +docker build -t openblas_simple . +docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t openblas_simple +``` +where `/` is the full path to the model file on the Docker host system. + +## cuda_simple - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image +``` +cd ./cuda_simple +docker build -t cuda_simple . +docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t cuda_simple +``` +where `/` is the full path to the model file on the Docker host system. 
# "Open-Llama-in-a-box" - Download a MIT licensed Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server ``` From 30d32e996b3bbb4ad641ab275cf1d985f950d1cd Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Fri, 2 Jun 2023 11:08:59 +0000 Subject: [PATCH 8/8] More README.md corrections and cleanup --- docker/README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docker/README.md b/docker/README.md index c7e92d0..053d311 100644 --- a/docker/README.md +++ b/docker/README.md @@ -4,7 +4,7 @@ [Install Docker Engine](https://docs.docker.com/engine/install) -**Note #2:** NVidia GPU CuBLAS support requires a NVidia GPU with sufficient VRAM (approximately as much as the size above) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) +**Note #2:** NVidia GPU CuBLAS support requires a NVidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) # Simple Dockerfiles for building the llama-cpp-python server with external model bin files ## openblas_simple - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image @@ -23,7 +23,8 @@ docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v : ``` where `/` is the full path to the model file on the Docker host system. 
-# "Open-Llama-in-a-box" - Download a MIT licensed Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server +# "Open-Llama-in-a-box" +## Download an Apache V2.0 licensed 3B parameter Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server ``` $ cd ./open_llama ./build.sh @@ -31,8 +32,8 @@ $ cd ./open_llama ``` # Manually choose your own Llama model from Hugging Face -- `python3 ./hug_model.py -a TheBloke -t llama` -- You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. +`python3 ./hug_model.py -a TheBloke -t llama` +You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. ``` docker $ ls -lh *.bin -rw-rw-r-- 1 user user 4.8G May 23 18:30 q5_1.bin