Merge branch 'main' of github.com:abetlen/llama_cpp_python into main

commit 8ff83db3ee

9 changed files with 128 additions and 40 deletions
.gitignore (vendored, 3 changes)

````diff
@@ -164,3 +164,6 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+
+# downloaded model .bin files
+docker/open_llama/*.bin
````
README.md

````diff
@@ -1,46 +1,66 @@
-# Dockerfiles for building the llama-cpp-python server
-- `Dockerfile.openblas_simple` - a simple Dockerfile for non-GPU OpenBLAS
-- `Dockerfile.cuda_simple` - a simple Dockerfile for CUDA accelerated CuBLAS
-- `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke](https://huggingface.co/TheBloke)
-- `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`
-
-# Get model from Hugging Face
-`python3 ./hug_model.py`
+# Install Docker Server
+
+**Note #1:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
+
+[Install Docker Engine](https://docs.docker.com/engine/install)
+
+**Note #2:** NVidia GPU CuBLAS support requires a NVidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
+
+# Simple Dockerfiles for building the llama-cpp-python server with external model bin files
+
+## openblas_simple - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image
+```
+cd ./openblas_simple
+docker build -t openblas_simple .
+docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t openblas_simple
+```
+where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
+
+## cuda_simple - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image
+```
+cd ./cuda_simple
+docker build -t cuda_simple .
+docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t cuda_simple
+```
+where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
+
````
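To make the placeholders above concrete, here is a hypothetical invocation of the `openblas_simple` image; `/srv/models` and the model filename are illustrative stand-ins for `<model-root-path>` and `<model-path>`, not part of this commit:

```shell
# Hypothetical host layout: model at /srv/models/open-llama-7b-q5_1.bin
docker build -t openblas_simple ./openblas_simple
docker run -e USE_MLOCK=0 \
    -e MODEL=/var/model/open-llama-7b-q5_1.bin \
    -v /srv/models:/var/model \
    -t openblas_simple
```

The bind mount keeps the multi-gigabyte model file out of the image itself; only the path inside the container (`/var/model/...`) is baked into the `MODEL` environment variable.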
````diff
+# "Open-Llama-in-a-box"
+## Download an Apache V2.0 licensed 3B parameter Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server
+```
+$ cd ./open_llama
+./build.sh
+./start.sh
+```
+
+# Manually choose your own Llama model from Hugging Face
+`python3 ./hug_model.py -a TheBloke -t llama`
 You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
 ```
 docker $ ls -lh *.bin
--rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>.q5_1.bin
-lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> <downloaded-model-file>.q5_1.bin
+-rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>q5_1.bin
+lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> <downloaded-model-file>q5_1.bin
 ```
 **Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least
 **TWICE** as much disk space as the size of the model:
 
 | Model | Quantized size |
 |------:|----------------:|
+| 3B | 3 GB |
 | 7B | 5 GB |
 | 13B | 10 GB |
-| 30B | 25 GB |
+| 33B | 25 GB |
 | 65B | 50 GB |
 
 **Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
 
````
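A quick pre-flight check for the disk-space note above (a sketch using GNU `df`; the 10 GB threshold is an assumption based on the 7B model at roughly 5 GB, doubled because the file is copied into the image during `docker build`):

```shell
# Need roughly 2x the model size free, since docker build copies the file into the image
REQUIRED_GB=10
AVAIL_GB=$(df --output=avail -BG . | tail -1 | tr -dc '0-9')
if [ "$AVAIL_GB" -ge "$REQUIRED_GB" ]; then
    echo "enough space: ${AVAIL_GB}G available"
else
    echo "need more space: only ${AVAIL_GB}G available"
fi
```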
````diff
-# Install Docker Server
-
-**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
-
-[Install Docker Engine](https://docs.docker.com/engine/install)
-
-# Use OpenBLAS
+## Use OpenBLAS
 Use if you don't have a NVidia GPU. Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS:
-## Build:
-`docker build --build-arg -t openblas .`
+### Build:
+`docker build -t openblas .`
-## Run:
+### Run:
 `docker run --cap-add SYS_RESOURCE -t openblas`
 
-# Use CuBLAS
-Requires a NVidia GPU with sufficient VRAM (approximately as much as the size above) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
-## Build:
+## Use CuBLAS
+### Build:
 `docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .`
-## Run:
+### Run:
 `docker run --cap-add SYS_RESOURCE -t cublas`
````
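Neither `docker run` line above publishes the server's port; assuming you add `-p 8000:8000`, the running server can be queried with a completion request like the one `start.sh` in this commit uses:

```shell
# Assumes the container was started with -p 8000:8000 added to the run command
curl -X 'POST' 'http://localhost:8000/v1/completions' \
    -H 'accept: application/json' -H 'Content-Type: application/json' \
    -d '{
    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
    "stop": ["\n", "###"]
}'
```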
````diff
@@ -1,5 +1,5 @@
 ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
-FROM ${CUDA_IMAGE}
+FROM nvidia/cuda:${CUDA_IMAGE}
 
 # We need to set the host to 0.0.0.0 to allow outside access
 ENV HOST 0.0.0.0
@@ -10,7 +10,7 @@ COPY . .
 RUN apt update && apt install -y python3 python3-pip
 RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette
 
-RUN LLAMA_CUBLAS=1 python3 setup.py develop
+RUN LLAMA_CUBLAS=1 pip install llama-cpp-python
 
 # Run the server
 CMD python3 -m llama_cpp.server
````
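With the `FROM nvidia/cuda:${CUDA_IMAGE}` change above, the CUDA base tag becomes a build-time knob; a sketch of how it could be used (the 11.8.0 tag is an illustrative assumption, not part of this commit):

```shell
# Default base image: nvidia/cuda:12.1.1-devel-ubuntu22.04
docker build -t cublas .
# Hypothetical: target an older CUDA toolchain instead
docker build --build-arg CUDA_IMAGE=11.8.0-devel-ubuntu22.04 -t cublas-11.8 .
```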
docker/open_llama/build.sh (new executable file, 14 additions)

````diff
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+MODEL="open_llama_3b"
+# Get open_llama_3b_ggml q5_1 quantization
+python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1"
+ls -lh *.bin
+
+# Build the default OpenBLAS image
+docker build -t $MODEL .
+docker images | egrep "^(REPOSITORY|$MODEL)"
+
+echo
+echo "To start the docker container run:"
+echo "docker run -t -p 8000:8000 $MODEL"
````
hug_model.py

````diff
@@ -2,6 +2,7 @@ import requests
 import json
 import os
 import struct
+import argparse
 
 def make_request(url, params=None):
     print(f"Making request to {url}...")
@@ -69,21 +70,30 @@ def get_user_choice(model_list):
 
     return None
 
-import argparse
-
 def main():
     # Create an argument parser
-    parser = argparse.ArgumentParser(description='Process the model version.')
+    parser = argparse.ArgumentParser(description='Process some parameters.')
 
+    # Arguments
     parser.add_argument('-v', '--version', type=int, default=0x0003,
-                        help='an integer for the version to be used')
+                        help='hexadecimal version number of ggml file')
+    parser.add_argument('-a', '--author', type=str, default='TheBloke',
+                        help='HuggingFace author filter')
+    parser.add_argument('-t', '--tag', type=str, default='llama',
+                        help='HuggingFace tag filter')
+    parser.add_argument('-s', '--search', type=str, default='',
+                        help='HuggingFace search filter')
+    parser.add_argument('-f', '--filename', type=str, default='q5_1',
+                        help='HuggingFace model repository filename substring match')
 
     # Parse the arguments
     args = parser.parse_args()
 
     # Define the parameters
     params = {
-        "author": "TheBloke", # Filter by author
-        "tags": "llama"
+        "author": args.author,
+        "tags": args.tag,
+        "search": args.search
     }
 
     models = make_request('https://huggingface.co/api/models', params=params)
@@ -100,17 +110,30 @@ def main():
 
     for sibling in model_info.get('siblings', []):
         rfilename = sibling.get('rfilename')
-        if rfilename and 'q5_1' in rfilename:
+        if rfilename and args.filename in rfilename:
             model_list.append((model_id, rfilename))
 
-    model_choice = get_user_choice(model_list)
+    # Choose the model
+    model_list.sort(key=lambda x: x[0])
+    if len(model_list) == 0:
+        print("No models found")
+        exit(1)
+    elif len(model_list) == 1:
+        model_choice = model_list[0]
+    else:
+        model_choice = get_user_choice(model_list)
 
     if model_choice is not None:
         model_id, rfilename = model_choice
         url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
-        download_file(url, rfilename)
-        _, version = check_magic_and_version(rfilename)
+        dest = f"{model_id.replace('/', '_')}_{rfilename}"
+        download_file(url, dest)
+        _, version = check_magic_and_version(dest)
         if version != args.version:
             print(f"Warning: Expected version {args.version}, but found different version in the file.")
+    else:
+        print("Error - model choice was None")
+        exit(2)
 
 if __name__ == '__main__':
     main()
````
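The flags added to `hug_model.py` above compose on the command line; for example (the first invocation mirrors `build.sh` from this commit, the second relies on the defaults `-a TheBloke -t llama -f q5_1` and is only illustrative):

```shell
# Pin author, search string, and quantization substring (as build.sh does)
python3 ./hug_model.py -a SlyEcho -s open_llama_3b -f q5_1
# Or keep the defaults and just narrow the search
python3 ./hug_model.py -s llama-7b
```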
docker/open_llama/start.sh (new executable file, 28 additions)

````diff
@@ -0,0 +1,28 @@
+#!/bin/sh
+
+MODEL="open_llama_3b"
+
+# Start Docker container
+docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL &
+sleep 10
+echo
+docker ps | egrep "(^CONTAINER|$MODEL)"
+
+# Test the model works
+echo
+curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
+    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
+    "stop": [
+        "\n",
+        "###"
+    ]
+}' | grep Paris
+if [ $? -eq 0 ]
+then
+    echo
+    echo "$MODEL is working!!"
+else
+    echo
+    echo "ERROR: $MODEL not replying."
+    exit 1
+fi
````
start_server.sh

````diff
@@ -1,6 +1,6 @@
 #!/bin/sh
 
-# For mmap support
+# For mlock support
 ulimit -l unlimited
 
 if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
````

````diff
@@ -9,7 +9,7 @@ COPY . .
 RUN apt update && apt install -y libopenblas-dev ninja-build build-essential
 RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette
 
-RUN LLAMA_OPENBLAS=1 python3 setup.py develop
+RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose
 
 # Run the server
 CMD python3 -m llama_cpp.server
````