diff --git a/docker/README.md b/docker/README.md
index e61095f..2fb7ef8 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -24,7 +24,7 @@
 - `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`
 
 ## Download a Llama Model from Hugging Face
-- To download a MIT licensed Llama model run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml`
+- To download an MIT-licensed Llama model you can run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml -f ggml-model-q5_1.bin`
 - To select and install a restricted license Llama model run: `python3 ./hug_model.py -a TheBloke -t llama`
 - You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
 ```
@@ -37,9 +37,10 @@ lrwxrwxrwx 1 user user       24 May 23 18:30 model.bin -> q5_
 
 | Model | Quantized size |
 |------:|----------------:|
+|    3B |           3 GB |
 |    7B |           5 GB |
 |   13B |          10 GB |
-|   30B |          25 GB |
+|   33B |          25 GB |
 |   65B |          50 GB |
 
 **Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
diff --git a/docker/auto_docker/Dockerfile b/docker/open_llama/Dockerfile
similarity index 100%
rename from docker/auto_docker/Dockerfile
rename to docker/open_llama/Dockerfile
diff --git a/docker/open_llama/build.sh b/docker/open_llama/build.sh
new file mode 100755
index 0000000..3a6457d
--- /dev/null
+++ b/docker/open_llama/build.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+MODEL="open_llama_3b"
+# Get open_llama_3b_ggml q5_1 quantization
+python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1"
+ls -lh *.bin
+
+# Build the default OpenBLAS image
+docker build -t $MODEL .
+docker images | egrep "^(REPOSITORY|$MODEL)"
+
+echo
+echo "To start the docker container run:"
+echo "docker run -t -p 8000:8000 $MODEL"
diff --git a/docker/auto_docker/hug_model.py b/docker/open_llama/hug_model.py
similarity index 83%
rename from docker/auto_docker/hug_model.py
rename to docker/open_llama/hug_model.py
index 86a8214..13c5b6b 100644
--- a/docker/auto_docker/hug_model.py
+++ b/docker/open_llama/hug_model.py
@@ -76,13 +76,15 @@
 
     # Arguments
     parser.add_argument('-v', '--version', type=int, default=0x0003,
-                        help='an integer for the version to be used')
+                        help='hexadecimal version number of ggml file')
     parser.add_argument('-a', '--author', type=str, default='TheBloke',
-                        help='an author to be filtered')
-    parser.add_argument('-t', '--tags', type=str, default='llama',
-                        help='tags for the content')
+                        help='HuggingFace author filter')
+    parser.add_argument('-t', '--tag', type=str, default='llama',
+                        help='HuggingFace tag filter')
     parser.add_argument('-s', '--search', type=str, default='',
-                        help='search term')
+                        help='HuggingFace search filter')
+    parser.add_argument('-f', '--filename', type=str, default='q5_1',
+                        help='HuggingFace model repository filename substring match')
 
     # Parse the arguments
     args = parser.parse_args()
@@ -90,7 +92,7 @@
 
     # Define the parameters
     params = {
         "author": args.author,
-        "tags": args.tags,
+        "tags": args.tag,
         "search": args.search
     }
@@ -108,11 +110,15 @@
 
     for sibling in model_info.get('siblings', []):
         rfilename = sibling.get('rfilename')
-        if rfilename and 'q5_1' in rfilename:
+        if rfilename and args.filename in rfilename:
             model_list.append((model_id, rfilename))
 
     # Choose the model
-    if len(model_list) == 1:
+    model_list.sort(key=lambda x: x[0])
+    if len(model_list) == 0:
+        print("No models found")
+        exit(1)
+    elif len(model_list) == 1:
         model_choice = model_list[0]
     else:
         model_choice = get_user_choice(model_list)
@@ -120,13 +126,14 @@
     if model_choice is not None:
         model_id, rfilename = model_choice
         url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
-        download_file(url, rfilename)
-        _, version = check_magic_and_version(rfilename)
+        dest = f"{model_id.replace('/', '_')}_{rfilename}"
+        download_file(url, dest)
+        _, version = check_magic_and_version(dest)
         if version != args.version:
             print(f"Warning: Expected version {args.version}, but found different version in the file.")
     else:
         print("Error - model choice was None")
-        exit(1)
+        exit(2)
 
 if __name__ == '__main__':
     main()
diff --git a/docker/open_llama/start.sh b/docker/open_llama/start.sh
new file mode 100755
index 0000000..7ee8f74
--- /dev/null
+++ b/docker/open_llama/start.sh
@@ -0,0 +1,28 @@
+#!/bin/sh
+
+MODEL="open_llama_3b"
+
+# Start Docker container
+docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL &
+sleep 10
+echo
+docker ps | egrep "(^CONTAINER|$MODEL)"
+
+# Test the model works
+echo
+curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
+    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
+    "stop": [
+        "\n",
+        "###"
+    ]
+}' | grep Paris
+if [ $? -eq 0 ]
+then
+    echo
+    echo "$MODEL is working!!"
+else
+    echo
+    echo "ERROR: $MODEL not replying."
+    exit 1
+fi
diff --git a/docker/auto_docker/start_server.sh b/docker/open_llama/start_server.sh
similarity index 94%
rename from docker/auto_docker/start_server.sh
rename to docker/open_llama/start_server.sh
index 176bd87..d3329ee 100755
--- a/docker/auto_docker/start_server.sh
+++ b/docker/open_llama/start_server.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 
-# For mmap support
+# For mlock support
 ulimit -l unlimited
 
 if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
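
For reference, a minimal standalone sketch of the model-selection and destination-naming logic the `hug_model.py` hunks above introduce. The `siblings`/`rfilename` shape is assumed to match the Hugging Face `/api/models` JSON; `select_models`, `dest_name`, and the demo filenames are hypothetical names used only for illustration, not part of the diff:

```python
# Sketch only: mirrors the filtering, sorting, and destination-naming
# logic added to hug_model.py in this diff. The model_infos structure
# is assumed to match Hugging Face's /api/models response; function
# names and demo filenames are illustrative.

def select_models(model_infos, filename='q5_1'):
    """Collect (model_id, rfilename) pairs whose filename contains the
    -f/--filename substring, sorted by model_id for a stable menu."""
    model_list = []
    for model_info in model_infos:
        model_id = model_info.get('id')
        for sibling in model_info.get('siblings', []):
            rfilename = sibling.get('rfilename')
            if rfilename and filename in rfilename:
                model_list.append((model_id, rfilename))
    model_list.sort(key=lambda x: x[0])
    return model_list

def dest_name(model_id, rfilename):
    """Flatten 'author/repo' into a filesystem-safe local filename so
    downloads from different authors cannot collide."""
    return f"{model_id.replace('/', '_')}_{rfilename}"

if __name__ == '__main__':
    demo = [{'id': 'SlyEcho/open_llama_3b_ggml',
             'siblings': [{'rfilename': 'open-llama-3b-q5_1.bin'},
                          {'rfilename': 'open-llama-3b-q8_0.bin'}]}]
    for model_id, rfilename in select_models(demo, 'q5_1'):
        print(dest_name(model_id, rfilename))
    # -> SlyEcho_open_llama_3b_ggml_open-llama-3b-q5_1.bin
```

Sorting by `model_id` before the zero/one/many check gives `get_user_choice` a deterministic menu order across runs, and prefixing the flattened repo id means two repositories that ship an identically named `q5_1` file no longer overwrite each other's downloads.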