From 483b6ba53af349050458d3223e41aa71829f1391 Mon Sep 17 00:00:00 2001
From: Gary Mulder
Date: Wed, 31 May 2023 15:16:32 +0000
Subject: [PATCH 1/6] Updated README.md instructions on how to use
 *_simple/Dockerfiles

---
 docker/README.md                         | 21 ++++++++++++-----
 docker/{ => auto_docker}/Dockerfile      |  0
 docker/{ => auto_docker}/hug_model.py    |  0
 docker/{ => auto_docker}/start_server.sh |  0
 .../Dockerfile}                          |  4 ++--
 .../Dockerfile}                          |  2 +-
 6 files changed, 19 insertions(+), 8 deletions(-)
 rename docker/{ => auto_docker}/Dockerfile (100%)
 rename docker/{ => auto_docker}/hug_model.py (100%)
 rename docker/{ => auto_docker}/start_server.sh (100%)
 rename docker/{Dockerfile.cuda_simple => cuda_simple/Dockerfile} (82%)
 rename docker/{Dockerfile.openblas_simple => openblas_simple/Dockerfile} (86%)

diff --git a/docker/README.md b/docker/README.md
index 100bcbd..130d180 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -1,10 +1,21 @@
-# Dockerfiles for building the llama-cpp-python server
-- `Dockerfile.openblas_simple` - a simple Dockerfile for non-GPU OpenBLAS
-- `Dockerfile.cuda_simple` - a simple Dockerfile for CUDA accelerated CuBLAS
-- `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke](https://huggingface.co/TheBloke)
+# Simple Dockerfiles for building the llama-cpp-python server with external model bin files
+- `./openblas_simple/Dockerfile` - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image
+  - `cd ./openblas_simple`
+  - `docker build -t openblas_simple .`
+  - `docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t openblas_simple`
+    where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
+- `./cuda_simple/Dockerfile` - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image
+  - `cd ./cuda_simple`
+  - `docker build -t cuda_simple .`
+  - `docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t cuda_simple`
+    where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
+
+# "Bot-in-a-box" - a method to build a Docker image with a chosen model downloaded and installed inside it
+ - `cd ./auto_docker`:
+ - `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke](https://huggingface.co/TheBloke)
 - `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`
 
-# Get model from Hugging Face
+## Get model from Hugging Face
 `python3 ./hug_model.py`
 
 You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
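For concreteness, a hypothetical invocation of the `openblas_simple` commands above — the model filename and host directory here are illustrative assumptions, not values from this patch (8000 is the server's default port):

```
cd ./openblas_simple
docker build -t openblas_simple .
# /srv/models/open-llama-3b-q5_1.bin is an assumed host path
docker run -e USE_MLOCK=0 -e MODEL=/var/model/open-llama-3b-q5_1.bin \
    -v /srv/models:/var/model -p 8000:8000 -t openblas_simple
```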
diff --git a/docker/Dockerfile b/docker/auto_docker/Dockerfile
similarity index 100%
rename from docker/Dockerfile
rename to docker/auto_docker/Dockerfile
diff --git a/docker/hug_model.py b/docker/auto_docker/hug_model.py
similarity index 100%
rename from docker/hug_model.py
rename to docker/auto_docker/hug_model.py
diff --git a/docker/start_server.sh b/docker/auto_docker/start_server.sh
similarity index 100%
rename from docker/start_server.sh
rename to docker/auto_docker/start_server.sh
diff --git a/docker/Dockerfile.cuda_simple b/docker/cuda_simple/Dockerfile
similarity index 82%
rename from docker/Dockerfile.cuda_simple
rename to docker/cuda_simple/Dockerfile
index dda7a9f..24906d5 100644
--- a/docker/Dockerfile.cuda_simple
+++ b/docker/cuda_simple/Dockerfile
@@ -1,5 +1,5 @@
 ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
-FROM ${CUDA_IMAGE}
+FROM nvidia/cuda:${CUDA_IMAGE}
 
 # We need to set the host to 0.0.0.0 to allow outside access
 ENV HOST 0.0.0.0
@@ -10,7 +10,7 @@ COPY . .
 RUN apt update && apt install -y python3 python3-pip
 RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette
 
-RUN LLAMA_CUBLAS=1 python3 setup.py develop
+RUN LLAMA_CUBLAS=1 pip install llama-cpp-python
 
 # Run the server
 CMD python3 -m llama_cpp.server
diff --git a/docker/Dockerfile.openblas_simple b/docker/openblas_simple/Dockerfile
similarity index 86%
rename from docker/Dockerfile.openblas_simple
rename to docker/openblas_simple/Dockerfile
index f58506f..1a95cae 100644
--- a/docker/Dockerfile.openblas_simple
+++ b/docker/openblas_simple/Dockerfile
@@ -9,7 +9,7 @@ COPY . .
 RUN apt update && apt install -y libopenblas-dev ninja-build build-essential
 RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette
 
-RUN LLAMA_OPENBLAS=1 python3 setup.py develop
+RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose
 
 # Run the server
 CMD python3 -m llama_cpp.server
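A note on the `ARG CUDA_IMAGE` used above: the base tag can be overridden at build time without editing the Dockerfile. A minimal sketch — the alternative tag is an assumption, and it must be a `-devel` variant so `nvcc` is available to compile llama-cpp-python:

```
# hypothetical: build cuda_simple against a different CUDA devel image
docker build --build-arg CUDA_IMAGE=12.2.0-devel-ubuntu22.04 -t cuda_simple ./cuda_simple
```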
From 217d78320fb6096e0696182816df0bf3ae5b961a Mon Sep 17 00:00:00 2001
From: Gary Mulder
Date: Wed, 31 May 2023 16:00:31 +0000
Subject: [PATCH 2/6] Added parameterised search and d/l for Hugging Face.
 Updated README.md

---
 .gitignore                      |  3 +++
 docker/README.md                | 41 +++++++++++++++++----------------
 docker/auto_docker/hug_model.py | 30 ++++++++++++++++++------
 3 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/.gitignore b/.gitignore
index fd64c09..8db9bcb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -164,3 +164,6 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+
+# model .bin files
+docker/auto_docker/*.bin
diff --git a/docker/README.md b/docker/README.md
index 130d180..e61095f 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -1,3 +1,11 @@
+# Install Docker Server
+
+**Note #1:** This was tested with Docker running on Linux. If you can get it working on Windows or macOS, please update this `README.md` with a PR!
+
+[Install Docker Engine](https://docs.docker.com/engine/install)
+
+**Note #2:** NVidia GPU CuBLAS support requires an NVidia GPU with sufficient VRAM (approximately as much as the size above) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
+
 # Simple Dockerfiles for building the llama-cpp-python server with external model bin files
 - `./openblas_simple/Dockerfile` - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image
   - `cd ./openblas_simple`
@@ -15,14 +23,14 @@
  - `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke](https://huggingface.co/TheBloke)
 - `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`
 
-## Get model from Hugging Face
-`python3 ./hug_model.py`
-
-You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
+## Download a Llama Model from Hugging Face
+- To download an MIT licensed Llama model run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml`
+- To select and install a restricted license Llama model run: `python3 ./hug_model.py -a TheBloke -t llama`
+- You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
 ```
 docker $ ls -lh *.bin
--rw-rw-r-- 1 user user 4.8G May 23 18:30 .q5_1.bin
-lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> .q5_1.bin
+-rw-rw-r-- 1 user user 4.8G May 23 18:30 q5_1.bin
+lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> q5_1.bin
 ```
 
 **Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least **TWICE** as much disk space as the size of the model:
@@ -36,22 +44,15 @@ lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> .q5
 **Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
 
-# Install Docker Server
-
-**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
-
-[Install Docker Engine](https://docs.docker.com/engine/install)
-
-# Use OpenBLAS
+## Use OpenBLAS
 Use if you don't have an NVidia GPU.
 Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS:
-## Build:
-`docker build --build-arg -t openblas .`
+### Build:
+`docker build -t openblas .`
-## Run:
+### Run:
 `docker run --cap-add SYS_RESOURCE -t openblas`
 
-# Use CuBLAS
-Requires an NVidia GPU with sufficient VRAM (approximately as much as the size above) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
-## Build:
+## Use CuBLAS
+### Build:
 `docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .`
-## Run:
+### Run:
 `docker run --cap-add SYS_RESOURCE -t cublas`
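One caveat on the CuBLAS run command above: depending on the local Docker and NVIDIA container toolkit setup, the container usually also has to be granted GPU access explicitly. A hedged sketch — `--gpus all` is standard Docker syntax, but whether it is needed here is an assumption about the host configuration:

```
# hypothetical: expose all host GPUs to the CuBLAS container
docker run --cap-add SYS_RESOURCE --gpus all -p 8000:8000 -t cublas
```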
diff --git a/docker/auto_docker/hug_model.py b/docker/auto_docker/hug_model.py
index 848a1aa..86a8214 100644
--- a/docker/auto_docker/hug_model.py
+++ b/docker/auto_docker/hug_model.py
@@ -2,6 +2,7 @@ import requests
 import json
 import os
 import struct
+import argparse
 
 def make_request(url, params=None):
     print(f"Making request to {url}...")
@@ -69,21 +70,28 @@ def get_user_choice(model_list):
 
     return None
 
-import argparse
-
 def main():
     # Create an argument parser
-    parser = argparse.ArgumentParser(description='Process the model version.')
+    parser = argparse.ArgumentParser(description='Process some parameters.')
+
+    # Arguments
     parser.add_argument('-v', '--version', type=int, default=0x0003,
                         help='an integer for the version to be used')
+    parser.add_argument('-a', '--author', type=str, default='TheBloke',
+                        help='an author to be filtered')
+    parser.add_argument('-t', '--tags', type=str, default='llama',
+                        help='tags for the content')
+    parser.add_argument('-s', '--search', type=str, default='',
+                        help='search term')
 
     # Parse the arguments
     args = parser.parse_args()
 
     # Define the parameters
     params = {
-        "author": "TheBloke",  # Filter by author
-        "tags": "llama"
+        "author": args.author,
+        "tags": args.tags,
+        "search": args.search
     }
 
     models = make_request('https://huggingface.co/api/models', params=params)
@@ -103,14 +111,22 @@ def main():
             if rfilename and 'q5_1' in rfilename:
                 model_list.append((model_id, rfilename))
 
-    model_choice = get_user_choice(model_list)
+    # Choose the model
+    if len(model_list) == 1:
+        model_choice = model_list[0]
+    else:
+        model_choice = get_user_choice(model_list)
+
     if model_choice is not None:
         model_id, rfilename = model_choice
         url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
         download_file(url, rfilename)
         _, version = check_magic_and_version(rfilename)
         if version != args.version:
-            print(f"Warning: Expected version {args.version}, but found different version in the file.")
+            print(f"Warning: Expected version {args.version}, but found different version in the file.")
+    else:
+        print("Error - model choice was None")
+        exit(1)
 
 if __name__ == '__main__':
     main()
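The parameterised search added in this patch maps directly onto the Hugging Face models API that `make_request()` calls. One way to preview what the script will be choosing from is to issue the same query by hand; a sketch, where the search term `wizard` is an arbitrary assumption:

```
# mirrors make_request('https://huggingface.co/api/models', params={author, tags, search})
curl -s 'https://huggingface.co/api/models?author=TheBloke&tags=llama&search=wizard'
```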
From cf4931a4006a1d701f2c4ea5b2ce3cb02350d57d Mon Sep 17 00:00:00 2001
From: Gary Mulder
Date: Fri, 2 Jun 2023 08:48:54 +0000
Subject: [PATCH 3/6] Working Open Llama 3B in a box

---
 docker/README.md                              |  5 ++--
 docker/{auto_docker => open_llama}/Dockerfile |  0
 docker/open_llama/build.sh                    | 14 +++++++++
 .../{auto_docker => open_llama}/hug_model.py  | 29 ++++++++++++-------
 docker/open_llama/start.sh                    | 28 ++++++++++++++++++
 .../start_server.sh                           |  2 +-
 6 files changed, 64 insertions(+), 14 deletions(-)
 rename docker/{auto_docker => open_llama}/Dockerfile (100%)
 create mode 100755 docker/open_llama/build.sh
 rename docker/{auto_docker => open_llama}/hug_model.py (83%)
 create mode 100755 docker/open_llama/start.sh
 rename docker/{auto_docker => open_llama}/start_server.sh (94%)

diff --git a/docker/README.md b/docker/README.md
index e61095f..2fb7ef8 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -24,7 +24,7 @@
 - `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`
 
 ## Download a Llama Model from Hugging Face
-- To download an MIT licensed Llama model run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml`
+- To download an MIT licensed Llama model you can run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml -f ggml-model-q5_1.bin`
 - To select and install a restricted license Llama model run: `python3 ./hug_model.py -a TheBloke -t llama`
 - You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
 ```
@@ -37,9 +37,10 @@ lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> q5_
 
 | Model | Quantized size |
 |------:|----------------:|
+|    3B |           3 GB |
 |    7B |           5 GB |
 |   13B |          10 GB |
-|   30B |          25 GB |
+|   33B |          25 GB |
 |   65B |          50 GB |
 
 **Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
diff --git a/docker/auto_docker/Dockerfile b/docker/open_llama/Dockerfile
similarity index 100%
rename from docker/auto_docker/Dockerfile
rename to docker/open_llama/Dockerfile
diff --git a/docker/open_llama/build.sh b/docker/open_llama/build.sh
new file mode 100755
index 0000000..3a6457d
--- /dev/null
+++ b/docker/open_llama/build.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+MODEL="open_llama_3b"
+# Get open_llama_3b_ggml q5_1 quantization
+python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1"
+ls -lh *.bin
+
+# Build the default OpenBLAS image
+docker build -t $MODEL .
+docker images | egrep "^(REPOSITORY|$MODEL)"
+
+echo
+echo "To start the docker container run:"
+echo "docker run -t -p 8000:8000 $MODEL"
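Together with the README, the intended flow for the boxed build is a two-step run from the new directory. A brief usage sketch — the image name comes from the `MODEL` variable in `build.sh`:

```
cd docker/open_llama
./build.sh                                # downloads the q5_1 GGML file and builds the image
docker run -t -p 8000:8000 open_llama_3b  # the run command build.sh echoes at the end
```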
diff --git a/docker/auto_docker/hug_model.py b/docker/open_llama/hug_model.py
similarity index 83%
rename from docker/auto_docker/hug_model.py
rename to docker/open_llama/hug_model.py
index 86a8214..13c5b6b 100644
--- a/docker/auto_docker/hug_model.py
+++ b/docker/open_llama/hug_model.py
@@ -76,13 +76,15 @@ def main():
 
     # Arguments
     parser.add_argument('-v', '--version', type=int, default=0x0003,
-                        help='an integer for the version to be used')
+                        help='hexadecimal version number of ggml file')
     parser.add_argument('-a', '--author', type=str, default='TheBloke',
-                        help='an author to be filtered')
-    parser.add_argument('-t', '--tags', type=str, default='llama',
-                        help='tags for the content')
+                        help='HuggingFace author filter')
+    parser.add_argument('-t', '--tag', type=str, default='llama',
+                        help='HuggingFace tag filter')
     parser.add_argument('-s', '--search', type=str, default='',
-                        help='search term')
+                        help='HuggingFace search filter')
+    parser.add_argument('-f', '--filename', type=str, default='q5_1',
+                        help='HuggingFace model repository filename substring match')
 
     # Parse the arguments
     args = parser.parse_args()
@@ -90,7 +92,7 @@ def main():
     # Define the parameters
     params = {
         "author": args.author,
-        "tags": args.tags,
+        "tags": args.tag,
         "search": args.search
     }
 
@@ -108,11 +110,15 @@ def main():
 
     for sibling in model_info.get('siblings', []):
         rfilename = sibling.get('rfilename')
-        if rfilename and 'q5_1' in rfilename:
+        if rfilename and args.filename in rfilename:
             model_list.append((model_id, rfilename))
 
     # Choose the model
-    if len(model_list) == 1:
+    model_list.sort(key=lambda x: x[0])
+    if len(model_list) == 0:
+        print("No models found")
+        exit(1)
+    elif len(model_list) == 1:
         model_choice = model_list[0]
     else:
         model_choice = get_user_choice(model_list)
@@ -120,13 +126,14 @@ def main():
     if model_choice is not None:
         model_id, rfilename = model_choice
         url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
-        download_file(url, rfilename)
-        _, version = check_magic_and_version(rfilename)
+        dest = f"{model_id.replace('/', '_')}_{rfilename}"
+        download_file(url, dest)
+        _, version = check_magic_and_version(dest)
         if version != args.version:
             print(f"Warning: Expected version {args.version}, but found different version in the file.")
     else:
         print("Error - model choice was None")
-        exit(1)
+        exit(2)
 
 if __name__ == '__main__':
     main()
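The new `dest` naming is worth a quick illustration, since it prefixes the flattened repository id onto the downloaded file name. An equivalent one-liner in shell, where the repo id and file name are assumptions chosen only for illustration:

```
# mirrors: dest = f"{model_id.replace('/', '_')}_{rfilename}" in hug_model.py
model_id="SlyEcho/open_llama_3b_ggml"
rfilename="open-llama-3b-q5_1.bin"
echo "$(echo "$model_id" | tr '/' '_')_${rfilename}"
# prints: SlyEcho_open_llama_3b_ggml_open-llama-3b-q5_1.bin
```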
diff --git a/docker/open_llama/start.sh b/docker/open_llama/start.sh
new file mode 100755
index 0000000..7ee8f74
--- /dev/null
+++ b/docker/open_llama/start.sh
@@ -0,0 +1,28 @@
+#!/bin/sh
+
+MODEL="open_llama_3b"
+
+# Start Docker container
+docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL &
+sleep 10
+echo
+docker ps | egrep "(^CONTAINER|$MODEL)"
+
+# Test the model works
+echo
+curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
+    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
+    "stop": [
+        "\n",
+        "###"
+    ]
+}' | grep Paris
+if [ $? -eq 0 ]
+then
+    echo
+    echo "$MODEL is working!!"
+else
+    echo
+    echo "ERROR: $MODEL not replying."
+    exit 1
+fi
diff --git a/docker/auto_docker/start_server.sh b/docker/open_llama/start_server.sh
similarity index 94%
rename from docker/auto_docker/start_server.sh
rename to docker/open_llama/start_server.sh
index 176bd87..d3329ee 100755
--- a/docker/auto_docker/start_server.sh
+++ b/docker/open_llama/start_server.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 
-# For mmap support
+# For mlock support
 ulimit -l unlimited
 
 if [ "$IMAGE" = "python:3-slim-bullseye" ]; then

From f24e7a7e5229448ba64ab819287d07887567840d Mon Sep 17 00:00:00 2001
From: Gary Mulder
Date: Fri, 2 Jun 2023 10:44:52 +0000
Subject: [PATCH 4/6] Updated instructions

---
 docker/README.md | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/docker/README.md b/docker/README.md
index 2fb7ef8..f4954d1 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -18,14 +18,15 @@
   - `docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t cuda_simple`
     where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
 
-# "Bot-in-a-box" - a method to build a Docker image with a chosen model downloaded and installed inside it
- - `cd ./auto_docker`:
- - `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke](https://huggingface.co/TheBloke)
-- `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`
-
-## Download a Llama Model from Hugging Face
-- To download an MIT licensed Llama model you can run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml -f ggml-model-q5_1.bin`
-- To select and install a restricted license Llama model run: `python3 ./hug_model.py -a TheBloke -t llama`
+# "Open-Llama-in-a-box" - Download an MIT licensed Open Llama model and install it into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server
+```
+cd ./open_llama
+./build.sh
+./start.sh
+```
+
+# Manually choose your own Llama model from Hugging Face
+- `python3 ./hug_model.py -a TheBloke -t llama`
 - You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
 ```
 docker $ ls -lh *.bin
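Once `start.sh` reports the model is working, the server can be exercised directly with the same completion endpoint the test script uses. A sketch — the prompt is arbitrary, and the `/docs` path is the interactive OpenAPI UI that the FastAPI-based server exposes:

```
curl -X POST 'http://localhost:8000/v1/completions' \
     -H 'Content-Type: application/json' \
     -d '{"prompt": "\n\n### Instructions:\nName the capital of Spain.\n\n### Response:\n", "stop": ["\n", "###"]}'
# interactive API docs: http://localhost:8000/docs
```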
From d4eef735d9d70cf1d8a9e098914b16ccf70f06fe Mon Sep 17 00:00:00 2001
From: Gary Mulder
Date: Fri, 2 Jun 2023 11:03:19 +0000
Subject: [PATCH 5/6] Fixed .gitignore to ignore any downloaded model .bin
 files. Cleaned up README.md again

---
 .gitignore       |  4 ++--
 docker/README.md | 25 +++++++++++++++----------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8db9bcb..79093b4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -165,5 +165,5 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
 
-# model .bin files
-docker/auto_docker/*.bin
+# downloaded model .bin files
+docker/open_llama/*.bin
diff --git a/docker/README.md b/docker/README.md
index f4954d1..c7e92d0 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -7,16 +7,21 @@
 **Note #2:** NVidia GPU CuBLAS support requires an NVidia GPU with sufficient VRAM (approximately as much as the size above) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
 
 # Simple Dockerfiles for building the llama-cpp-python server with external model bin files
-- `./openblas_simple/Dockerfile` - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image
-  - `cd ./openblas_simple`
-  - `docker build -t openblas_simple .`
-  - `docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t openblas_simple`
-    where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
-- `./cuda_simple/Dockerfile` - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image
-  - `cd ./cuda_simple`
-  - `docker build -t cuda_simple .`
-  - `docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t cuda_simple`
-    where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
+## openblas_simple - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image
+```
+cd ./openblas_simple
+docker build -t openblas_simple .
+docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t openblas_simple
+```
+where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
+
+## cuda_simple - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image
+```
+cd ./cuda_simple
+docker build -t cuda_simple .
+docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t cuda_simple
+```
+where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
 
 # "Open-Llama-in-a-box" - Download an MIT licensed Open Llama model and install it into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server
 ```
 cd ./open_llama
 ./build.sh
 ./start.sh
 ```
-# "Open-Llama-in-a-box" - Download a MIT licensed Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server +# "Open-Llama-in-a-box" +## Download an Apache V2.0 licensed 3B paramter Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server ``` $ cd ./open_llama ./build.sh @@ -31,8 +32,8 @@ $ cd ./open_llama ``` # Manually choose your own Llama model from Hugging Face -- `python3 ./hug_model.py -a TheBloke -t llama` -- You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. +`python3 ./hug_model.py -a TheBloke -t llama` +You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. ``` docker $ ls -lh *.bin -rw-rw-r-- 1 user user 4.8G May 23 18:30 q5_1.bin