From 03f171e81069932fda296c85ebafc6fdf11025d0 Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov
Date: Fri, 17 May 2024 20:27:26 +0300
Subject: [PATCH] example: LLM inference with Ray Serve (#1465)

---
 examples/ray/README.md        | 19 +++++++++++++++++++
 examples/ray/llm.py           | 20 ++++++++++++++++++++
 examples/ray/requirements.txt |  3 +++
 3 files changed, 42 insertions(+)
 create mode 100644 examples/ray/README.md
 create mode 100755 examples/ray/llm.py
 create mode 100644 examples/ray/requirements.txt

diff --git a/examples/ray/README.md b/examples/ray/README.md
new file mode 100644
index 0000000..6e338ba
--- /dev/null
+++ b/examples/ray/README.md
@@ -0,0 +1,19 @@
+This is an example of doing LLM inference with [Ray](https://docs.ray.io/en/latest/index.html) and [Ray Serve](https://docs.ray.io/en/latest/serve/index.html).
+
+First, install the requirements:
+
+```bash
+$ pip install -r requirements.txt
+```
+
+Deploy a GGUF model to Ray Serve with the following command:
+
+```bash
+$ serve run llm:llm_builder model_path='../models/mistral-7b-instruct-v0.2.Q4_K_M.gguf'
+```
+
+This will start an API endpoint at `http://localhost:8000/`. You can query the model like this:
+
+```bash
+$ curl -k -d '{"prompt": "tell me a joke", "max_tokens": 128}' -X POST http://localhost:8000
+```
diff --git a/examples/ray/llm.py b/examples/ray/llm.py
new file mode 100755
index 0000000..27b15d2
--- /dev/null
+++ b/examples/ray/llm.py
@@ -0,0 +1,20 @@
+from starlette.requests import Request
+from typing import Dict
+from ray import serve
+from ray.serve import Application
+from llama_cpp import Llama
+
+@serve.deployment
+class LlamaDeployment:
+    def __init__(self, model_path: str):
+        self._llm = Llama(model_path=model_path)
+
+    async def __call__(self, http_request: Request) -> Dict:
+        input_json = await http_request.json()
+        prompt = input_json["prompt"]
+        max_tokens = input_json.get("max_tokens", 64)
+        return self._llm(prompt, max_tokens=max_tokens)
+
+
+def llm_builder(args: Dict[str, str]) -> Application:
+    return LlamaDeployment.bind(args["model_path"])
diff --git a/examples/ray/requirements.txt b/examples/ray/requirements.txt
new file mode 100644
index 0000000..a409fb8
--- /dev/null
+++ b/examples/ray/requirements.txt
@@ -0,0 +1,3 @@
+ray[serve]
+--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+llama-cpp-python
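
As a companion to the `curl` call in the README above, the sketch below makes the same request from Python. It assumes the `requests` package is installed, that the Serve application from `llm.py` is already running on the default `http://localhost:8000/` address, and the file name `client.py` is only illustrative.

```python
# client.py -- minimal client for the Ray Serve endpoint deployed above (illustrative).
# Assumes `pip install requests` and a running `serve run llm:llm_builder ...`.
import requests

payload = {"prompt": "tell me a joke", "max_tokens": 128}
resp = requests.post("http://localhost:8000/", json=payload, timeout=120)
resp.raise_for_status()

# llama-cpp-python returns an OpenAI-style completion dict;
# the generated text is under choices[0]["text"].
completion = resp.json()
print(completion["choices"][0]["text"])
```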
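
If one replica cannot keep up with traffic, Ray Serve can run several copies of the deployment behind the same endpoint. The sketch below is a variant of `llm.py` with explicit scaling options; `num_replicas` and `ray_actor_options` are standard Ray Serve deployment parameters, but the concrete values are placeholders to tune for your hardware and are not part of the original example.

```python
# llm_scaled.py -- variant of llm.py with explicit Serve scaling options (illustrative).
from typing import Dict

from starlette.requests import Request
from ray import serve
from ray.serve import Application
from llama_cpp import Llama


# num_replicas / ray_actor_options are standard Ray Serve options;
# the values below are placeholders, not recommendations.
@serve.deployment(num_replicas=2, ray_actor_options={"num_cpus": 4})
class LlamaDeployment:
    def __init__(self, model_path: str):
        # Each replica loads its own copy of the GGUF model.
        self._llm = Llama(model_path=model_path)

    async def __call__(self, http_request: Request) -> Dict:
        input_json = await http_request.json()
        prompt = input_json["prompt"]
        max_tokens = input_json.get("max_tokens", 64)
        return self._llm(prompt, max_tokens=max_tokens)


def llm_builder(args: Dict[str, str]) -> Application:
    return LlamaDeployment.bind(args["model_path"])
```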