Working Open Llama 3B in a box
This commit is contained in:
parent
217d78320f
commit
cf4931a400
6 changed files with 64 additions and 14 deletions
|
@ -24,7 +24,7 @@
|
|||
- `Dockerfile` - a single OpenBLAS and cuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`
|
||||
|
||||
## Download a Llama Model from Hugging Face
|
||||
- To download an MIT-licensed Llama model run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml`
|
||||
- To download an MIT-licensed Llama model you can run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml -f ggml-model-q5_1.bin`
|
||||
- To select and install a restricted license Llama model run: `python3 ./hug_model.py -a TheBloke -t llama`
|
||||
- You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
|
||||
```
|
||||
|
@ -37,9 +37,10 @@ lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> <downloaded-model-file>q5_
|
|||
|
||||
| Model | Quantized size |
|
||||
|------:|----------------:|
|
||||
| 3B | 3 GB |
|
||||
| 7B | 5 GB |
|
||||
| 13B | 10 GB |
|
||||
| 30B | 25 GB |
|
||||
| 33B | 25 GB |
|
||||
| 65B | 50 GB |
|
||||
|
||||
**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
|
||||
|
|
14
docker/open_llama/build.sh
Executable file
14
docker/open_llama/build.sh
Executable file
|
@ -0,0 +1,14 @@
|
|||
#!/bin/sh
# build.sh - download the Open Llama 3B q5_1 model and build the default
# OpenBLAS Docker image for it.
#
# Fixes vs. original: abort on the first failed command (`set -e`) so a
# failed download no longer leads to building an image without a model;
# quote "$MODEL" expansions; replace the deprecated `egrep` alias with
# the POSIX-specified `grep -E`.

set -e

MODEL="open_llama_3b"

# Get open_llama_3b_ggml q5_1 quantization
python3 ./hug_model.py -a SlyEcho -s "${MODEL}" -f "q5_1"
ls -lh *.bin

# Build the default OpenBLAS image
docker build -t "$MODEL" .
docker images | grep -E "^(REPOSITORY|$MODEL)"

echo
echo "To start the docker container run:"
echo "docker run -t -p 8000:8000 $MODEL"
|
|
@ -76,13 +76,15 @@ def main():
|
|||
|
||||
# Arguments
|
||||
parser.add_argument('-v', '--version', type=int, default=0x0003,
|
||||
help='an integer for the version to be used')
|
||||
help='hexadecimal version number of ggml file')
|
||||
parser.add_argument('-a', '--author', type=str, default='TheBloke',
|
||||
help='an author to be filtered')
|
||||
parser.add_argument('-t', '--tags', type=str, default='llama',
|
||||
help='tags for the content')
|
||||
help='HuggingFace author filter')
|
||||
parser.add_argument('-t', '--tag', type=str, default='llama',
|
||||
help='HuggingFace tag filter')
|
||||
parser.add_argument('-s', '--search', type=str, default='',
|
||||
help='search term')
|
||||
help='HuggingFace search filter')
|
||||
parser.add_argument('-f', '--filename', type=str, default='q5_1',
|
||||
help='HuggingFace model repository filename substring match')
|
||||
|
||||
# Parse the arguments
|
||||
args = parser.parse_args()
|
||||
|
@ -90,7 +92,7 @@ def main():
|
|||
# Define the parameters
|
||||
params = {
|
||||
"author": args.author,
|
||||
"tags": args.tags,
|
||||
"tags": args.tag,
|
||||
"search": args.search
|
||||
}
|
||||
|
||||
|
@ -108,11 +110,15 @@ def main():
|
|||
|
||||
for sibling in model_info.get('siblings', []):
|
||||
rfilename = sibling.get('rfilename')
|
||||
if rfilename and 'q5_1' in rfilename:
|
||||
if rfilename and args.filename in rfilename:
|
||||
model_list.append((model_id, rfilename))
|
||||
|
||||
# Choose the model
|
||||
if len(model_list) == 1:
|
||||
model_list.sort(key=lambda x: x[0])
|
||||
if len(model_list) == 0:
|
||||
print("No models found")
|
||||
exit(1)
|
||||
elif len(model_list) == 1:
|
||||
model_choice = model_list[0]
|
||||
else:
|
||||
model_choice = get_user_choice(model_list)
|
||||
|
@ -120,13 +126,14 @@ def main():
|
|||
if model_choice is not None:
|
||||
model_id, rfilename = model_choice
|
||||
url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
|
||||
download_file(url, rfilename)
|
||||
_, version = check_magic_and_version(rfilename)
|
||||
dest = f"{model_id.replace('/', '_')}_{rfilename}"
|
||||
download_file(url, dest)
|
||||
_, version = check_magic_and_version(dest)
|
||||
if version != args.version:
|
||||
print(f"Warning: Expected version {args.version}, but found different version in the file.")
|
||||
else:
|
||||
print("Error - model choice was None")
|
||||
exit(1)
|
||||
exit(2)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
28
docker/open_llama/start.sh
Executable file
28
docker/open_llama/start.sh
Executable file
|
@ -0,0 +1,28 @@
|
|||
#!/bin/sh
# start.sh - launch the Open Llama 3B container and smoke-test it with a
# single completion request ("capital of France" -> expect "Paris").
#
# Fixes vs. original: test the curl|grep pipeline directly in the `if`
# instead of the fragile `[ $? -eq 0 ]` idiom; replace deprecated
# `egrep` with `grep -E`; quote "$MODEL" expansions.

MODEL="open_llama_3b"

# Start Docker container in the background, then give the server a
# moment to come up before probing it.
docker run --cap-add SYS_RESOURCE -p 8000:8000 -t "$MODEL" &
sleep 10
echo
docker ps | grep -E "(^CONTAINER|$MODEL)"

# Test the model works
echo
if curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
    "stop": [
        "\n",
        "###"
    ]
}' | grep Paris
then
    echo
    echo "$MODEL is working!!"
else
    echo
    echo "ERROR: $MODEL not replying."
    exit 1
fi
|
|
@ -1,6 +1,6 @@
|
|||
#!/bin/sh
|
||||
|
||||
# For mmap support
|
||||
# For mlock support
|
||||
ulimit -l unlimited
|
||||
|
||||
if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
|
Loading…
Reference in a new issue