"""Ray Serve deployment that serves a llama.cpp model over HTTP."""
from typing import Dict

from llama_cpp import Llama
from starlette.requests import Request

from ray import serve
from ray.serve import Application
@serve.deployment
class LlamaDeployment:
    """Serve replica wrapping a llama.cpp model.

    Loads the weights once at replica startup and answers HTTP requests
    whose JSON body looks like ``{"prompt": ..., "max_tokens": ...}``.
    """

    def __init__(self, model_path: str):
        # Loading here means each replica pays the model-load cost exactly once.
        self._llm = Llama(model_path=model_path)

    async def __call__(self, http_request: Request) -> Dict:
        """Run a completion for the prompt in the request body.

        The body must be a JSON object with a required ``"prompt"`` key;
        ``"max_tokens"`` is optional and defaults to 64.
        """
        payload = await http_request.json()
        limit = payload.get("max_tokens", 64)
        # llama_cpp returns a plain dict, which Serve serializes back as JSON.
        return self._llm(payload["prompt"], max_tokens=limit)
|
def llm_builder(args: Dict[str, str]) -> Application:
    """Build the Serve application from deployment arguments.

    Expects ``args["model_path"]`` to point at the model file on disk.
    """
    model_path = args["model_path"]
    return LlamaDeployment.bind(model_path)