Working Open Llama 3B in a box
This commit is contained in:
parent
217d78320f
commit
cf4931a400
6 changed files with 64 additions and 14 deletions
|
@ -24,7 +24,7 @@
|
|||
- `Dockerfile` - a single OpenBLAS and cuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`
|
||||
|
||||
## Download a Llama Model from Hugging Face
|
||||
- To download an MIT-licensed Llama model run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml`
|
||||
- To download an MIT-licensed Llama model you can run: `python3 ./hug_model.py -a vihangd -s open_llama_7b_700bt_ggml -f ggml-model-q5_1.bin`
|
||||
- To select and install a restricted license Llama model run: `python3 ./hug_model.py -a TheBloke -t llama`
|
||||
- You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
|
||||
```
|
||||
|
@ -37,9 +37,10 @@ lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> <downloaded-model-file>q5_
|
|||
|
||||
| Model | Quantized size |
|
||||
|------:|----------------:|
|
||||
| 3B | 3 GB |
|
||||
| 7B | 5 GB |
|
||||
| 13B | 10 GB |
|
||||
| 30B | 25 GB |
|
||||
| 33B | 25 GB |
|
||||
| 65B | 50 GB |
|
||||
|
||||
**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
|
||||
|
|
14
docker/open_llama/build.sh
Executable file
14
docker/open_llama/build.sh
Executable file
|
@ -0,0 +1,14 @@
|
|||
#!/bin/sh
# build.sh - download the Open Llama 3B q5_1 model and build the default
# OpenBLAS Docker image for it.
#
# Fixes vs. original: abort on the first failed command (`set -e`) so a
# failed download no longer leads to building an image without a model;
# quote "$MODEL" expansions; replace the deprecated `egrep` alias with
# the POSIX-specified `grep -E`.

set -e

MODEL="open_llama_3b"

# Get open_llama_3b_ggml q5_1 quantization
python3 ./hug_model.py -a SlyEcho -s "${MODEL}" -f "q5_1"
ls -lh *.bin

# Build the default OpenBLAS image
docker build -t "$MODEL" .
docker images | grep -E "^(REPOSITORY|$MODEL)"

echo
echo "To start the docker container run:"
echo "docker run -t -p 8000:8000 $MODEL"
|
|
@ -76,13 +76,15 @@ def main():
|
|||
|
||||
# Arguments
|
||||
parser.add_argument('-v', '--version', type=int, default=0x0003,
|
||||
help='an integer for the version to be used')
|
||||
help='hexadecimal version number of ggml file')
|
||||
parser.add_argument('-a', '--author', type=str, default='TheBloke',
|
||||
help='an author to be filtered')
|
||||
parser.add_argument('-t', '--tags', type=str, default='llama',
|
||||
help='tags for the content')
|
||||
help='HuggingFace author filter')
|
||||
parser.add_argument('-t', '--tag', type=str, default='llama',
|
||||
help='HuggingFace tag filter')
|
||||
parser.add_argument('-s', '--search', type=str, default='',
|
||||
help='search term')
|
||||
help='HuggingFace search filter')
|
||||
parser.add_argument('-f', '--filename', type=str, default='q5_1',
|
||||
help='HuggingFace model repository filename substring match')
|
||||
|
||||
# Parse the arguments
|
||||
args = parser.parse_args()
|
||||
|
@ -90,7 +92,7 @@ def main():
|
|||
# Define the parameters
|
||||
params = {
|
||||
"author": args.author,
|
||||
"tags": args.tags,
|
||||
"tags": args.tag,
|
||||
"search": args.search
|
||||
}
|
||||
|
||||
|
@ -108,11 +110,15 @@ def main():
|
|||
|
||||
for sibling in model_info.get('siblings', []):
|
||||
rfilename = sibling.get('rfilename')
|
||||
if rfilename and 'q5_1' in rfilename:
|
||||
if rfilename and args.filename in rfilename:
|
||||
model_list.append((model_id, rfilename))
|
||||
|
||||
# Choose the model
|
||||
if len(model_list) == 1:
|
||||
model_list.sort(key=lambda x: x[0])
|
||||
if len(model_list) == 0:
|
||||
print("No models found")
|
||||
exit(1)
|
||||
elif len(model_list) == 1:
|
||||
model_choice = model_list[0]
|
||||
else:
|
||||
model_choice = get_user_choice(model_list)
|
||||
|
@ -120,13 +126,14 @@ def main():
|
|||
if model_choice is not None:
|
||||
model_id, rfilename = model_choice
|
||||
url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
|
||||
download_file(url, rfilename)
|
||||
_, version = check_magic_and_version(rfilename)
|
||||
dest = f"{model_id.replace('/', '_')}_{rfilename}"
|
||||
download_file(url, dest)
|
||||
_, version = check_magic_and_version(dest)
|
||||
if version != args.version:
|
||||
print(f"Warning: Expected version {args.version}, but found different version in the file.")
|
||||
else:
|
||||
print("Error - model choice was None")
|
||||
exit(1)
|
||||
exit(2)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
28
docker/open_llama/start.sh
Executable file
28
docker/open_llama/start.sh
Executable file
|
@ -0,0 +1,28 @@
|
|||
#!/bin/sh
# start.sh - launch the Open Llama 3B container and smoke-test it with a
# single completion request ("capital of France" -> expect "Paris").
#
# Fixes vs. original: test the curl|grep pipeline directly in the `if`
# instead of the fragile `[ $? -eq 0 ]` idiom; replace deprecated
# `egrep` with `grep -E`; quote "$MODEL" expansions.

MODEL="open_llama_3b"

# Start Docker container in the background, then give the server a
# moment to come up before probing it.
docker run --cap-add SYS_RESOURCE -p 8000:8000 -t "$MODEL" &
sleep 10
echo
docker ps | grep -E "(^CONTAINER|$MODEL)"

# Test the model works
echo
if curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
    "stop": [
        "\n",
        "###"
    ]
}' | grep Paris
then
    echo
    echo "$MODEL is working!!"
else
    echo
    echo "ERROR: $MODEL not replying."
    exit 1
fi
|
|
@ -1,6 +1,6 @@
|
|||
#!/bin/sh
|
||||
|
||||
# For mmap support
|
||||
# For mlock support
|
||||
ulimit -l unlimited
|
||||
|
||||
if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
|
Loading…
Reference in a new issue