From 03f171e81069932fda296c85ebafc6fdf11025d0 Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov
Date: Fri, 17 May 2024 20:27:26 +0300
Subject: [PATCH] example: LLM inference with Ray Serve (#1465)

---
 examples/ray/README.md        | 19 +++++++++++++++++++
 examples/ray/llm.py           | 20 ++++++++++++++++++++
 examples/ray/requirements.txt |  3 +++
 3 files changed, 42 insertions(+)
 create mode 100644 examples/ray/README.md
 create mode 100755 examples/ray/llm.py
 create mode 100644 examples/ray/requirements.txt

diff --git a/examples/ray/README.md b/examples/ray/README.md
new file mode 100644
index 0000000..6e338ba
--- /dev/null
+++ b/examples/ray/README.md
@@ -0,0 +1,19 @@
+This is an example of doing LLM inference with [Ray](https://docs.ray.io/en/latest/index.html) and [Ray Serve](https://docs.ray.io/en/latest/serve/index.html).
+
+First, install the requirements:
+
+```bash
+$ pip install -r requirements.txt
+```
+
+Deploy a GGUF model to Ray Serve with the following command:
+
+```bash
+$ serve run llm:llm_builder model_path='../models/mistral-7b-instruct-v0.2.Q4_K_M.gguf'
+```
+
+This will start an API endpoint at `http://localhost:8000/`. You can query the model like this:
+
+```bash
+$ curl -k -d '{"prompt": "tell me a joke", "max_tokens": 128}' -X POST http://localhost:8000
+```
diff --git a/examples/ray/llm.py b/examples/ray/llm.py
new file mode 100755
index 0000000..27b15d2
--- /dev/null
+++ b/examples/ray/llm.py
@@ -0,0 +1,20 @@
+from starlette.requests import Request
+from typing import Dict
+from ray import serve
+from ray.serve import Application
+from llama_cpp import Llama
+
+@serve.deployment
+class LlamaDeployment:
+    def __init__(self, model_path: str):
+        self._llm = Llama(model_path=model_path)
+
+    async def __call__(self, http_request: Request) -> Dict:
+        input_json = await http_request.json()
+        prompt = input_json["prompt"]
+        max_tokens = input_json.get("max_tokens", 64)
+        return self._llm(prompt, max_tokens=max_tokens)
+
+
+def llm_builder(args: Dict[str, str]) -> Application:
+    return LlamaDeployment.bind(args["model_path"])
diff --git a/examples/ray/requirements.txt b/examples/ray/requirements.txt
new file mode 100644
index 0000000..a409fb8
--- /dev/null
+++ b/examples/ray/requirements.txt
@@ -0,0 +1,3 @@
+ray[serve]
+--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+llama-cpp-python
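
As a companion to the `curl` call in the README above, the sketch below makes the same request from Python. It assumes the `requests` package is installed, that the Serve application from `llm.py` is already running on the default `http://localhost:8000/` address, and the file name `client.py` is only illustrative.

```python
# client.py -- minimal client for the Ray Serve endpoint deployed above (illustrative).
# Assumes `pip install requests` and a running `serve run llm:llm_builder ...`.
import requests

payload = {"prompt": "tell me a joke", "max_tokens": 128}
resp = requests.post("http://localhost:8000/", json=payload, timeout=120)
resp.raise_for_status()

# llama-cpp-python returns an OpenAI-style completion dict;
# the generated text is under choices[0]["text"].
completion = resp.json()
print(completion["choices"][0]["text"])
```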
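
If one replica cannot keep up with traffic, Ray Serve can run several copies of the deployment behind the same endpoint. The sketch below is a variant of `llm.py` with explicit scaling options; `num_replicas` and `ray_actor_options` are standard Ray Serve deployment parameters, but the concrete values are placeholders to tune for your hardware and are not part of the original example.

```python
# llm_scaled.py -- variant of llm.py with explicit Serve scaling options (illustrative).
from typing import Dict

from starlette.requests import Request
from ray import serve
from ray.serve import Application
from llama_cpp import Llama


# num_replicas / ray_actor_options are standard Ray Serve options;
# the values below are placeholders, not recommendations.
@serve.deployment(num_replicas=2, ray_actor_options={"num_cpus": 4})
class LlamaDeployment:
    def __init__(self, model_path: str):
        # Each replica loads its own copy of the GGUF model.
        self._llm = Llama(model_path=model_path)

    async def __call__(self, http_request: Request) -> Dict:
        input_json = await http_request.json()
        prompt = input_json["prompt"]
        max_tokens = input_json.get("max_tokens", 64)
        return self._llm(prompt, max_tokens=max_tokens)


def llm_builder(args: Dict[str, str]) -> Application:
    return LlamaDeployment.bind(args["model_path"])
```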