21 lines
644 B
Python
21 lines
644 B
Python
|
from starlette.requests import Request
|
||
|
from typing import Dict
|
||
|
from ray import serve
|
||
|
from ray.serve import Application
|
||
|
from llama_cpp import Llama
|
||
|
|
||
|
@serve.deployment
class LlamaDeployment:
    """Ray Serve deployment that serves completions from a llama.cpp model.

    Expects a JSON request body of the form
    ``{"prompt": str, "max_tokens": int (optional, default 64)}`` and
    returns the raw completion dict produced by ``llama_cpp.Llama``.
    """

    def __init__(self, model_path: str):
        """Load the model file at *model_path* into this replica."""
        # One Llama instance lives for the lifetime of the replica.
        self._llm = Llama(model_path=model_path)

    async def __call__(self, http_request: Request) -> Dict:
        """Handle one HTTP inference request.

        Raises:
            KeyError: if the JSON body has no "prompt" key (surfaces as a
                server error to the client — presumably acceptable for this
                minimal example; verify against callers).
        """
        payload = await http_request.json()
        # "prompt" is mandatory; "max_tokens" falls back to 64.
        return self._llm(
            payload["prompt"],
            max_tokens=payload.get("max_tokens", 64),
        )
|
||
|
|
||
|
|
||
|
def llm_builder(args: Dict[str, str]) -> Application:
    """Application builder used by `serve run` / config deployment.

    Reads the required "model_path" entry from *args* and binds it as the
    constructor argument of :class:`LlamaDeployment`.
    """
    model_path = args["model_path"]
    return LlamaDeployment.bind(model_path)
|