Merge branch 'main' of https://github.com/abetlen/llama-cpp-python into main
This commit is contained in:
commit
3dbfec74e7
3 changed files with 42 additions and 0 deletions
19
examples/ray/README.md
Normal file
19
examples/ray/README.md
Normal file
|
@ -0,0 +1,19 @@
|
|||
This is an example of doing LLM inference with [Ray](https://docs.ray.io/en/latest/index.html) and [Ray Serve](https://docs.ray.io/en/latest/serve/index.html).
|
||||
|
||||
First, install the requirements:
|
||||
|
||||
```bash
|
||||
$ pip install -r requirements.txt
|
||||
```
|
||||
|
||||
Deploy a GGUF model to Ray Serve with the following command:
|
||||
|
||||
```bash
|
||||
$ serve run llm:llm_builder model_path='../models/mistral-7b-instruct-v0.2.Q4_K_M.gguf'
|
||||
```
|
||||
|
||||
This will start an API endpoint at `http://localhost:8000/`. You can query the model like this:
|
||||
|
||||
```bash
|
||||
$ curl -d '{"prompt": "tell me a joke", "max_tokens": 128}' -X POST http://localhost:8000
|
||||
```
|
20
examples/ray/llm.py
Executable file
20
examples/ray/llm.py
Executable file
|
@ -0,0 +1,20 @@
|
|||
from starlette.requests import Request
|
||||
from typing import Dict
|
||||
from ray import serve
|
||||
from ray.serve import Application
|
||||
from llama_cpp import Llama
|
||||
|
||||
@serve.deployment
class LlamaDeployment:
    """Serve deployment that runs a ``Llama`` model on incoming HTTP requests.

    The request body is parsed as JSON. It must contain a ``"prompt"`` entry
    and may contain ``"max_tokens"`` (64 is used when it is absent).
    """

    def __init__(self, model_path: str):
        """Create the ``Llama`` instance from the model file at *model_path*."""
        # Built once here and reused by every ``__call__``.
        self._llm = Llama(model_path=model_path)

    async def __call__(self, http_request: Request) -> Dict:
        """Run the model on the prompt carried by *http_request*.

        Args:
            http_request: Incoming request whose body is a JSON object.

        Returns:
            Whatever ``self._llm(...)`` returns for the prompt, unmodified.

        Raises:
            KeyError: If the parsed JSON object has no ``"prompt"`` key.
        """
        payload = await http_request.json()
        # "prompt" is looked up first so a missing key fails before the
        # optional "max_tokens" lookup, as a plain two-step read would.
        return self._llm(
            payload["prompt"],
            max_tokens=payload.get("max_tokens", 64),
        )
|
||||
|
||||
|
||||
def llm_builder(args: Dict[str, str]) -> Application:
    """Build the application by binding ``LlamaDeployment`` to a model path.

    Args:
        args: String key/value arguments; ``args["model_path"]`` is the path
            handed to ``LlamaDeployment``.

    Returns:
        The result of ``LlamaDeployment.bind(...)``.

    Raises:
        KeyError: If ``"model_path"`` is not present in *args*.
    """
    model_path = args["model_path"]
    return LlamaDeployment.bind(model_path)
|
3
examples/ray/requirements.txt
Normal file
3
examples/ray/requirements.txt
Normal file
|
@ -0,0 +1,3 @@
|
|||
ray[serve]
|
||||
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
|
||||
llama-cpp-python
|
Loading…
Reference in a new issue