diff --git a/llama_cpp/server/__init__.py b/llama_cpp/server/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index af6cc38..dd4767f 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -5,283 +5,28 @@ To run this example: ```bash pip install fastapi uvicorn sse-starlette export MODEL=../models/7B/... -uvicorn fastapi_server_chat:app --reload +``` + +Then run: +``` +uvicorn llama_cpp.server.app:app --reload +``` + +or + +``` +python3 -m llama_cpp.server ``` Then visit http://localhost:8000/docs to see the interactive API docs. """ import os -import json -from threading import Lock -from typing import List, Optional, Literal, Union, Iterator, Dict -from typing_extensions import TypedDict - -import llama_cpp - -from fastapi import Depends, FastAPI -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict -from sse_starlette.sse import EventSourceResponse - - -class Settings(BaseSettings): - model: str - n_ctx: int = 2048 - n_batch: int = 512 - n_threads: int = max((os.cpu_count() or 2) // 2, 1) - f16_kv: bool = True - use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... - use_mmap: bool = True - embedding: bool = True - last_n_tokens_size: int = 64 - logits_all: bool = False - cache: bool = False # WARNING: This is an experimental feature - - -app = FastAPI( - title="🦙 llama.cpp Python API", - version="0.0.1", -) -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) -settings = Settings() -llama = llama_cpp.Llama( - settings.model, - f16_kv=settings.f16_kv, - use_mlock=settings.use_mlock, - use_mmap=settings.use_mmap, - embedding=settings.embedding, - logits_all=settings.logits_all, - n_threads=settings.n_threads, - n_batch=settings.n_batch, - n_ctx=settings.n_ctx, - last_n_tokens_size=settings.last_n_tokens_size, -) -if settings.cache: - cache = llama_cpp.LlamaCache() - llama.set_cache(cache) -llama_lock = Lock() - - -def get_llama(): - with llama_lock: - yield llama - - -class CreateCompletionRequest(BaseModel): - prompt: Union[str, List[str]] - suffix: Optional[str] = Field(None) - max_tokens: int = 16 - temperature: float = 0.8 - top_p: float = 0.95 - echo: bool = False - stop: Optional[List[str]] = [] - stream: bool = False - - # ignored or currently unsupported - model: Optional[str] = Field(None) - n: Optional[int] = 1 - logprobs: Optional[int] = Field(None) - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - best_of: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) - - # llama.cpp specific parameters - top_k: int = 40 - repeat_penalty: float = 1.1 - - class Config: - schema_extra = { - "example": { - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": ["\n", "###"], - } - } - - -CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) - - -@app.post( - "/v1/completions", - response_model=CreateCompletionResponse, -) -def create_completion( - request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama) -): - if isinstance(request.prompt, list): - request.prompt = "".join(request.prompt) - - completion_or_chunks = llama( - **request.dict( - exclude={ - "model", - "n", - "frequency_penalty", - "presence_penalty", - "best_of", - "logit_bias", - "user", - } - ) - ) - if request.stream: - chunks: Iterator[llama_cpp.CompletionChunk] = completion_or_chunks # type: ignore - return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks) - completion: llama_cpp.Completion = completion_or_chunks # type: ignore - return completion - - -class CreateEmbeddingRequest(BaseModel): - model: Optional[str] - input: str - user: Optional[str] - - class Config: - schema_extra = { - "example": { - "input": "The food was delicious and the waiter...", - } - } - - -CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) - - -@app.post( - "/v1/embeddings", - response_model=CreateEmbeddingResponse, -) -def create_embedding( - request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) -): - return llama.create_embedding(**request.dict(exclude={"model", "user"})) - - -class ChatCompletionRequestMessage(BaseModel): - role: Union[Literal["system"], Literal["user"], Literal["assistant"]] - content: str - user: Optional[str] = None - - -class CreateChatCompletionRequest(BaseModel): - model: Optional[str] - messages: List[ChatCompletionRequestMessage] - temperature: float = 0.8 - top_p: float = 0.95 - stream: bool = False - stop: Optional[List[str]] = [] - max_tokens: int = 128 - - # ignored or currently unsupported - model: Optional[str] = Field(None) - n: Optional[int] = 1 - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) - - # llama.cpp specific parameters - repeat_penalty: float = 1.1 - - class Config: - schema_extra = { - "example": { - "messages": [ - ChatCompletionRequestMessage( - role="system", content="You are a helpful assistant." - ), - ChatCompletionRequestMessage( - role="user", content="What is the capital of France?" - ), - ] - } - } - - -CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) - - -@app.post( - "/v1/chat/completions", - response_model=CreateChatCompletionResponse, -) -def create_chat_completion( - request: CreateChatCompletionRequest, - llama: llama_cpp.Llama = Depends(get_llama), -) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: - completion_or_chunks = llama.create_chat_completion( - **request.dict( - exclude={ - "model", - "n", - "presence_penalty", - "frequency_penalty", - "logit_bias", - "user", - } - ), - ) - - if request.stream: - - async def server_sent_events( - chat_chunks: Iterator[llama_cpp.ChatCompletionChunk], - ): - for chat_chunk in chat_chunks: - yield dict(data=json.dumps(chat_chunk)) - yield dict(data="[DONE]") - - chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore - - return EventSourceResponse( - server_sent_events(chunks), - ) - completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore - return completion - - -class ModelData(TypedDict): - id: str - object: Literal["model"] - owned_by: str - permissions: List[str] - - -class ModelList(TypedDict): - object: Literal["list"] - data: List[ModelData] - - -GetModelResponse = create_model_from_typeddict(ModelList) - - -@app.get("/v1/models", response_model=GetModelResponse) -def get_models() -> ModelList: - return { - "object": "list", - "data": [ - { - "id": llama.model_path, - "object": "model", - "owned_by": "me", - "permissions": [], - } - ], - } +import uvicorn +from llama_cpp.server.app import app if __name__ == "__main__": - import os - import uvicorn uvicorn.run( app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py new file mode 100644 index 0000000..d296e14 --- /dev/null +++ b/llama_cpp/server/app.py @@ -0,0 +1,266 @@ +import os +import json +from threading import Lock +from typing import List, Optional, Literal, Union, Iterator, Dict +from typing_extensions import TypedDict + +import llama_cpp + +from fastapi import Depends, FastAPI +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict +from sse_starlette.sse import EventSourceResponse + + +class Settings(BaseSettings): + model: str = os.environ["MODEL"] + n_ctx: int = 2048 + n_batch: int = 512 + n_threads: int = max((os.cpu_count() or 2) // 2, 1) + f16_kv: bool = True + use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... + use_mmap: bool = True + embedding: bool = True + last_n_tokens_size: int = 64 + logits_all: bool = False + cache: bool = False # WARNING: This is an experimental feature + + +app = FastAPI( + title="🦙 llama.cpp Python API", + version="0.0.1", +) +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) +settings = Settings() +llama = llama_cpp.Llama( + settings.model, + f16_kv=settings.f16_kv, + use_mlock=settings.use_mlock, + use_mmap=settings.use_mmap, + embedding=settings.embedding, + logits_all=settings.logits_all, + n_threads=settings.n_threads, + n_batch=settings.n_batch, + n_ctx=settings.n_ctx, + last_n_tokens_size=settings.last_n_tokens_size, +) +if settings.cache: + cache = llama_cpp.LlamaCache() + llama.set_cache(cache) +llama_lock = Lock() + + +def get_llama(): + with llama_lock: + yield llama + + +class CreateCompletionRequest(BaseModel): + prompt: Union[str, List[str]] + suffix: Optional[str] = Field(None) + max_tokens: int = 16 + temperature: float = 0.8 + top_p: float = 0.95 + echo: bool = False + stop: Optional[List[str]] = [] + stream: bool = False + + # ignored or currently unsupported + model: Optional[str] = Field(None) + n: Optional[int] = 1 + logprobs: Optional[int] = Field(None) + presence_penalty: Optional[float] = 0 + frequency_penalty: Optional[float] = 0 + best_of: Optional[int] = 1 + logit_bias: Optional[Dict[str, float]] = Field(None) + user: Optional[str] = Field(None) + + # llama.cpp specific parameters + top_k: int = 40 + repeat_penalty: float = 1.1 + + class Config: + schema_extra = { + "example": { + "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", + "stop": ["\n", "###"], + } + } + + +CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) + + +@app.post( + "/v1/completions", + response_model=CreateCompletionResponse, +) +def create_completion( + request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama) +): + if isinstance(request.prompt, list): + request.prompt = "".join(request.prompt) + + completion_or_chunks = llama( + **request.dict( + exclude={ + "model", + "n", + "frequency_penalty", + "presence_penalty", + "best_of", + "logit_bias", + "user", + } + ) + ) + if request.stream: + chunks: Iterator[llama_cpp.CompletionChunk] = completion_or_chunks # type: ignore + return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks) + completion: llama_cpp.Completion = completion_or_chunks # type: ignore + return completion + + +class CreateEmbeddingRequest(BaseModel): + model: Optional[str] + input: str + user: Optional[str] + + class Config: + schema_extra = { + "example": { + "input": "The food was delicious and the waiter...", + } + } + + +CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) + + +@app.post( + "/v1/embeddings", + response_model=CreateEmbeddingResponse, +) +def create_embedding( + request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) +): + return llama.create_embedding(**request.dict(exclude={"model", "user"})) + + +class ChatCompletionRequestMessage(BaseModel): + role: Union[Literal["system"], Literal["user"], Literal["assistant"]] + content: str + user: Optional[str] = None + + +class CreateChatCompletionRequest(BaseModel): + model: Optional[str] + messages: List[ChatCompletionRequestMessage] + temperature: float = 0.8 + top_p: float = 0.95 + stream: bool = False + stop: Optional[List[str]] = [] + max_tokens: int = 128 + + # ignored or currently unsupported + model: Optional[str] = Field(None) + n: Optional[int] = 1 + presence_penalty: Optional[float] = 0 + frequency_penalty: Optional[float] = 0 + logit_bias: Optional[Dict[str, float]] = Field(None) + user: Optional[str] = Field(None) + + # llama.cpp specific parameters + repeat_penalty: float = 1.1 + + class Config: + schema_extra = { + "example": { + "messages": [ + ChatCompletionRequestMessage( + role="system", content="You are a helpful assistant." + ), + ChatCompletionRequestMessage( + role="user", content="What is the capital of France?" + ), + ] + } + } + + +CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) + + +@app.post( + "/v1/chat/completions", + response_model=CreateChatCompletionResponse, +) +def create_chat_completion( + request: CreateChatCompletionRequest, + llama: llama_cpp.Llama = Depends(get_llama), +) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: + completion_or_chunks = llama.create_chat_completion( + **request.dict( + exclude={ + "model", + "n", + "presence_penalty", + "frequency_penalty", + "logit_bias", + "user", + } + ), + ) + + if request.stream: + + async def server_sent_events( + chat_chunks: Iterator[llama_cpp.ChatCompletionChunk], + ): + for chat_chunk in chat_chunks: + yield dict(data=json.dumps(chat_chunk)) + yield dict(data="[DONE]") + + chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore + + return EventSourceResponse( + server_sent_events(chunks), + ) + completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore + return completion + + +class ModelData(TypedDict): + id: str + object: Literal["model"] + owned_by: str + permissions: List[str] + + +class ModelList(TypedDict): + object: Literal["list"] + data: List[ModelData] + + +GetModelResponse = create_model_from_typeddict(ModelList) + + +@app.get("/v1/models", response_model=GetModelResponse) +def get_models() -> ModelList: + return { + "object": "list", + "data": [ + { + "id": llama.model_path, + "object": "model", + "owned_by": "me", + "permissions": [], + } + ], + }