Working Open Llama 3B in a box

Gary Mulder 2023-06-02 08:48:54 +00:00
parent 217d78320f
commit cf4931a400
6 changed files with 64 additions and 14 deletions

README.md

@@ -24,7 +24,7 @@
 - `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`

 ## Download a Llama Model from Hugging Face
-- To download a MIT licensed Llama model run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml`
+- To download a MIT licensed Llama model you can run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml -f ggml-model-q5_1.bin`
 - To select and install a restricted license Llama model run: `python3 ./hug_model.py -a TheBloke -t llama`
 - You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
 ```
@@ -37,9 +37,10 @@ lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> <downloaded-model-file>q5_
 | Model | Quantized size |
 |------:|----------------:|
+| 3B    |  3 GB           |
 | 7B    |  5 GB           |
 | 13B   | 10 GB           |
-| 30B   | 25 GB           |
+| 33B   | 25 GB           |
 | 65B   | 50 GB           |

 **Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
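Putting the README pieces together, the end-to-end flow it describes looks roughly like this (a sketch only; the image name `open_llama_3b` and the model arguments are taken from the scripts added below):

```
# 1. Download a quantized model; hug_model.py leaves model.bin symlinked to it
python3 ./hug_model.py -a SlyEcho -s open_llama_3b -f q5_1

# 2. Build the image with the model copied in
docker build -t open_llama_3b .

# 3. Start the server on port 8000
docker run -t -p 8000:8000 open_llama_3b
```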

docker/open_llama/build.sh Executable file

@@ -0,0 +1,14 @@
#!/bin/sh
MODEL="open_llama_3b"
# Get open_llama_3b_ggml q5_1 quantization
python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1"
ls -lh *.bin
# Build the default OpenBLAS image
docker build -t $MODEL .
docker images | egrep "^(REPOSITORY|$MODEL)"
echo
echo "To start the docker container run:"
echo "docker run -t -p 8000:8000 $MODEL"

hug_model.py

@@ -76,13 +76,15 @@ def main():
     # Arguments
     parser.add_argument('-v', '--version', type=int, default=0x0003,
-                        help='an integer for the version to be used')
+                        help='hexadecimal version number of ggml file')
     parser.add_argument('-a', '--author', type=str, default='TheBloke',
-                        help='an author to be filtered')
+                        help='HuggingFace author filter')
-    parser.add_argument('-t', '--tags', type=str, default='llama',
-                        help='tags for the content')
+    parser.add_argument('-t', '--tag', type=str, default='llama',
+                        help='HuggingFace tag filter')
     parser.add_argument('-s', '--search', type=str, default='',
-                        help='search term')
+                        help='HuggingFace search filter')
+    parser.add_argument('-f', '--filename', type=str, default='q5_1',
+                        help='HuggingFace model repository filename substring match')

     # Parse the arguments
     args = parser.parse_args()
@@ -90,7 +92,7 @@ def main():
     # Define the parameters
     params = {
         "author": args.author,
-        "tags": args.tags,
+        "tags": args.tag,
         "search": args.search
     }
@@ -108,11 +110,15 @@ def main():
         for sibling in model_info.get('siblings', []):
             rfilename = sibling.get('rfilename')
-            if rfilename and 'q5_1' in rfilename:
+            if rfilename and args.filename in rfilename:
                 model_list.append((model_id, rfilename))

     # Choose the model
-    if len(model_list) == 1:
+    model_list.sort(key=lambda x: x[0])
+    if len(model_list) == 0:
+        print("No models found")
+        exit(1)
+    elif len(model_list) == 1:
         model_choice = model_list[0]
     else:
         model_choice = get_user_choice(model_list)
@@ -120,13 +126,14 @@ def main():
     if model_choice is not None:
         model_id, rfilename = model_choice
         url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
-        download_file(url, rfilename)
-        _, version = check_magic_and_version(rfilename)
+        dest = f"{model_id.replace('/', '_')}_{rfilename}"
+        download_file(url, dest)
+        _, version = check_magic_and_version(dest)
         if version != args.version:
             print(f"Warning: Expected version {args.version}, but found different version in the file.")
     else:
         print("Error - model choice was None")
-        exit(1)
+        exit(2)

 if __name__ == '__main__':
     main()
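For reference, a hypothetical invocation of the updated script (the author and search values are only examples; the defaults are `-a TheBloke -t llama -f q5_1`):

```
# Filter by author, tag and filename substring
python3 ./hug_model.py -a SlyEcho -s open_llama_3b -f q5_1

# The download is now saved as <author>_<model>_<original-filename> to avoid
# name clashes; the script exits 1 if no model matches and 2 if no model was chosen.
```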

docker/open_llama/start.sh Executable file

@@ -0,0 +1,28 @@
#!/bin/sh
MODEL="open_llama_3b"
# Start Docker container
docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL &
sleep 10
echo
docker ps | egrep "(^CONTAINER|$MODEL)"
# Test the model works
echo
curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
    "stop": [
        "\n",
        "###"
    ]
}' | grep Paris
if [ $? -eq 0 ]
then
    echo
    echo "$MODEL is working!!"
else
    echo
    echo "ERROR: $MODEL not replying."
    exit 1
fi
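start.sh leaves the container running in the background; the usual Docker commands (not part of this commit) can be used to inspect and stop it, for example:

```
# List containers started from the open_llama_3b image
docker ps --filter ancestor=open_llama_3b

# Follow the server log, then stop the container; <container-id> is the ID reported by docker ps
docker logs -f <container-id>
docker stop <container-id>
```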

start_server.sh

@@ -1,6 +1,6 @@
 #!/bin/sh

-# For mmap support
+# For mlock support
 ulimit -l unlimited

 if [ "$IMAGE" = "python:3-slim-bullseye" ]; then