Merge branch 'main' of https://github.com/abetlen/llama-cpp-python into main
commit 3dbfec74e7
3 changed files with 42 additions and 0 deletions
examples/ray/README.md (new file, +19)
@@ -0,0 +1,19 @@
This is an example of doing LLM inference with [Ray](https://docs.ray.io/en/latest/index.html) and [Ray Serve](https://docs.ray.io/en/latest/serve/index.html).

First, install the requirements:

```bash
$ pip install -r requirements.txt
```

Deploy a GGUF model to Ray Serve with the following command:

```bash
$ serve run llm:llm_builder model_path='../models/mistral-7b-instruct-v0.2.Q4_K_M.gguf'
```

This will start an API endpoint at `http://localhost:8000/`. You can query the model like this:

```bash
$ curl -k -d '{"prompt": "tell me a joke", "max_tokens": 128}' -X POST http://localhost:8000
```
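The same request can also be issued from Python. The sketch below is not part of this commit; it assumes the server started by `serve run` above is listening on `localhost:8000` and that the third-party `requests` package is installed.

```python
# Sketch only (not part of this diff): query the Ray Serve endpoint from Python.
# Assumes the deployment from the README is running on localhost:8000 and that
# the `requests` package is available.
import requests

payload = {"prompt": "tell me a joke", "max_tokens": 128}
response = requests.post("http://localhost:8000", json=payload)
print(response.json())  # the llama-cpp-python completion dict, returned as JSON
```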
examples/ray/llm.py (new executable file, +20)
@@ -0,0 +1,20 @@
from starlette.requests import Request
from typing import Dict
from ray import serve
from ray.serve import Application
from llama_cpp import Llama

@serve.deployment
class LlamaDeployment:
    def __init__(self, model_path: str):
        self._llm = Llama(model_path=model_path)

    async def __call__(self, http_request: Request) -> Dict:
        input_json = await http_request.json()
        prompt = input_json["prompt"]
        max_tokens = input_json.get("max_tokens", 64)
        return self._llm(prompt, max_tokens=max_tokens)


def llm_builder(args: Dict[str, str]) -> Application:
    return LlamaDeployment.bind(args["model_path"])
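As an aside (not part of this commit): the `args` dict passed to `llm_builder` comes from the keyword arguments on the `serve run` command line, e.g. `model_path=...`. The same application can also be built and served programmatically, roughly as in this sketch; the model path below is a placeholder.

```python
# Sketch only (not part of this diff): run the application from Python instead of
# `serve run llm:llm_builder model_path=...`. The model path is a placeholder.
from ray import serve

from llm import llm_builder

app = llm_builder({"model_path": "../models/mistral-7b-instruct-v0.2.Q4_K_M.gguf"})
serve.run(app)  # serves the deployment at http://localhost:8000/, like the CLI command
```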
examples/ray/requirements.txt (new file, +3)
@@ -0,0 +1,3 @@
ray[serve]
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
llama-cpp-python