llama.cpp/llama_cpp/server/app.py

import os
import json
from threading import Lock
from typing import List, Optional, Union, Iterator, Dict
from typing_extensions import TypedDict, Literal

import llama_cpp

from fastapi import Depends, FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict
from sse_starlette.sse import EventSourceResponse


# Server configuration: the model path plus the llama_cpp.Llama constructor
# options that init_llama() forwards when it loads the model.
class Settings(BaseSettings):
    # Path of the model to load. The default is captured from the MODEL
    # environment variable when this class body is evaluated, falling back to
    # the literal string "null" when the variable is unset.
    model: str = os.environ.get("MODEL", "null")
    n_ctx: int = 2048
    n_batch: int = 512
    # Half of the reported CPU count (treated as 2 when os.cpu_count() returns
    # None), but never fewer than one thread.
    n_threads: int = max((os.cpu_count() or 2) // 2, 1)
    f16_kv: bool = True
    use_mlock: bool = False  # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out...
    use_mmap: bool = True
    embedding: bool = True
    last_n_tokens_size: int = 64
    logits_all: bool = False
    # When true, init_llama() attaches a llama_cpp.LlamaCache to the model.
    cache: bool = False  # WARNING: This is an experimental feature
    vocab_only: bool = False


# FastAPI application object; the route decorators in this module register
# their endpoints on it.
app = FastAPI(
    title="🦙 llama.cpp Python API",
    version="0.0.1",
)
# CORS is configured fully open: every origin, method and header is allowed,
# and credentials are allowed as well.
# NOTE(review): presumably meant for local/development use -- confirm before
# exposing this server on an untrusted network.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Module-wide model instance; stays None until init_llama() has been called.
llama: Optional[llama_cpp.Llama] = None


def init_llama(settings: Optional[Settings] = None):
    """Load the model and publish it as the module-level ``llama``.

    Args:
        settings: Configuration to load the model with. When omitted, a
            default ``Settings()`` instance is created.

    The resulting ``llama_cpp.Llama`` replaces whatever the global ``llama``
    previously held. If ``settings.cache`` is true, a new
    ``llama_cpp.LlamaCache`` is attached to the model.
    """
    global llama

    cfg = Settings() if settings is None else settings

    llama = llama_cpp.Llama(
        cfg.model,
        n_ctx=cfg.n_ctx,
        n_batch=cfg.n_batch,
        n_threads=cfg.n_threads,
        f16_kv=cfg.f16_kv,
        use_mlock=cfg.use_mlock,
        use_mmap=cfg.use_mmap,
        embedding=cfg.embedding,
        logits_all=cfg.logits_all,
        last_n_tokens_size=cfg.last_n_tokens_size,
        vocab_only=cfg.vocab_only,
    )

    if cfg.cache:
        llama.set_cache(llama_cpp.LlamaCache())

# Guards the shared model instance handed out by get_llama().
llama_lock = Lock()


def get_llama():
    """Yield the module-level ``llama`` while holding ``llama_lock``.

    Written as a generator so it can be used with ``Depends``: the lock is
    taken before the model is yielded and released once the generator is
    resumed or closed, including when an exception is raised.
    """
    llama_lock.acquire()
    try:
        yield llama
    finally:
        llama_lock.release()

# Shared definition for the `model` attribute of the request models in this
# module. Only a description is supplied; no default value is given.
# NOTE(review): presumably the missing default is what makes `model` a
# required field under pydantic -- confirm.
model_field = Field(
    description="The model to use for generating completions."
)

# Request body for POST /v1/completions. create_completion() forwards every
# field except `model` to the llama call as keyword arguments.
class CreateCompletionRequest(BaseModel):
    # May be a single string or a list of strings; create_completion() joins a
    # list into one string before calling the model.
    prompt: Union[str, List[str]] = Field(
        default="",
        description="The prompt to generate completions for."
    )
    suffix: Optional[str] = Field(
        default=None,
        description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots."
    )
    # Bounded to the range 1..2048 by the ge/le constraints.
    max_tokens: int = Field(
        default=16,
        ge=1,
        le=2048,
        description="The maximum number of tokens to generate."
    )
    temperature: float = Field(
        default=0.8,
        ge=0.0,
        le=2.0,
        description="Adjust the randomness of the generated text.\n\n" +
        "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run."
    )
    top_p: float = Field(
        default=0.95,
        ge=0.0,
        le=1.0,
        description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" +
        "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text."
    )
    echo: bool = Field(
        default=False,
        description="Whether to echo the prompt in the generated text. Useful for chatbots."
    )
    stop: Optional[List[str]] = Field(
        default=None,
        description="A list of tokens at which to stop generation. If None, no stop tokens are used."
    )
    # When true, create_completion() returns a server-sent-event stream
    # instead of a single completion object.
    stream: bool = Field(
        default=False,
        description="Whether to stream the results as they are generated. Useful for chatbots."
    )
    logprobs: Optional[int] = Field(
        default=None,
        ge=0,
        description="The number of logprobs to generate. If None, no logprobs are generated."
    )


    # ignored, but marked as required for the sake of compatibility with openai's api
    model: str = model_field

    # llama.cpp specific parameters
    top_k: int = Field(
        default=40,
        ge=0,
        description="Limit the next token selection to the K most probable tokens.\n\n" +
        "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text."
    )
    repeat_penalty: float = Field(
        default=1.0,
        ge=0.0,
        description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" +
        "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient."
    )

    # Example payload attached to the generated schema for this model.
    class Config:
        schema_extra = {
            "example": {
                "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
                "stop": ["\n", "###"],
            }
        }


# Response model derived from the llama_cpp.Completion TypedDict.
CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)


@app.post(
    "/v1/completions",
    response_model=CreateCompletionResponse,
)
def create_completion(
    request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama)
):
    # Text-completion endpoint. Returns the completion object directly, or a
    # server-sent-event stream of JSON-encoded chunks when `request.stream`
    # is set.

    # A list prompt is collapsed into one string before it reaches the model.
    if isinstance(request.prompt, list):
        request.prompt = "".join(request.prompt)

    # `model` is accepted for API compatibility only and is not forwarded.
    llama_kwargs = request.dict(exclude={"model"})
    result = llama(**llama_kwargs)

    if not request.stream:
        return result

    # Streaming: `result` is an iterator of chunks; each becomes one SSE event.
    sse_events = ({"data": json.dumps(piece)} for piece in result)
    return EventSourceResponse(sse_events)


# Request body for POST /v1/embeddings. create_embedding() forwards every
# field except `model` to llama.create_embedding().
class CreateEmbeddingRequest(BaseModel):
    # ignored, but marked as required for the sake of compatibility with openai's api
    model: str = model_field
    # Text to embed; a single string.
    input: str

    # Example payload attached to the generated schema for this model.
    class Config:
        schema_extra = {
            "example": {
                "input": "The food was delicious and the waiter...",
            }
        }


# Response model derived from the llama_cpp.Embedding TypedDict.
CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)


@app.post(
    "/v1/embeddings",
    response_model=CreateEmbeddingResponse,
)
def create_embedding(
    request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
):
    # Embedding endpoint: passes the request fields (minus the ignored
    # `model`) to the model and returns its result unchanged.
    embedding_kwargs = request.dict(exclude={"model"})
    return llama.create_embedding(**embedding_kwargs)


# One message of a chat conversation, as sent in the `messages` list of a
# chat completion request.
class ChatCompletionRequestMessage(BaseModel):
    # Speaker of the message; restricted to exactly these three values.
    role: Union[Literal["system"], Literal["user"], Literal["assistant"]]
    # Text of the message.
    content: str
    # Optional per-message user identifier; defaults to None.
    user: Optional[str] = None


# Request body for POST /v1/chat/completions. create_chat_completion()
# forwards every field except `model` to llama.create_chat_completion() as
# keyword arguments.
class CreateChatCompletionRequest(BaseModel):
    # ignored, but marked as required for the sake of compatibility with openai's api
    # (Declared once, in first position, so the field order of the model is
    # the same as before the duplicate `model: Optional[str]` was removed.)
    model: str = model_field
    # Conversation so far, oldest message first as supplied by the client.
    messages: List[ChatCompletionRequestMessage]
    temperature: float = 0.8
    top_p: float = 0.95
    # When true, the endpoint answers with a server-sent-event stream.
    stream: bool = False
    stop: Optional[List[str]] = []
    max_tokens: int = 128

    # llama.cpp specific parameters
    # NOTE: the default must be the plain integer 40. A trailing comma here
    # (`40,`) turns the default into the tuple `(40,)`, which is then handed
    # to the model whenever the client omits `top_k`.
    top_k: int = 40
    repeat_penalty: float = 1.1

    # Example payload attached to the generated schema for this model.
    class Config:
        schema_extra = {
            "example": {
                "messages": [
                    ChatCompletionRequestMessage(
                        role="system", content="You are a helpful assistant."
                    ),
                    ChatCompletionRequestMessage(
                        role="user", content="What is the capital of France?"
                    ),
                ]
            }
        }


# Response model derived from the llama_cpp.ChatCompletion TypedDict.
CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion)


@app.post(
    "/v1/chat/completions",
    response_model=CreateChatCompletionResponse,
)
def create_chat_completion(
    request: CreateChatCompletionRequest,
    llama: llama_cpp.Llama = Depends(get_llama),
) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
    # Chat-completion endpoint. Returns the chat completion object directly,
    # or, when `request.stream` is set, a server-sent-event stream of
    # JSON-encoded chunks terminated by a literal "[DONE]" event.

    # `model` is accepted for API compatibility only and is not forwarded.
    completion_or_chunks = llama.create_chat_completion(
        **request.dict(
            exclude={
                "model"
            }
        ),
    )

    if request.stream:

        # Deliberately a plain (synchronous) generator: pulling the next chunk
        # blocks while the model generates, and doing that inside an
        # `async def` generator would stall the event loop for every token.
        # NOTE(review): relies on EventSourceResponse iterating non-async
        # iterables off the event loop, as the generator expression used by
        # the /v1/completions endpoint already does -- confirm against the
        # installed sse-starlette version.
        def server_sent_events(
            chat_chunks: Iterator[llama_cpp.ChatCompletionChunk],
        ):
            for chat_chunk in chat_chunks:
                yield dict(data=json.dumps(chat_chunk))
            # End-of-stream marker expected by OpenAI-style streaming clients.
            yield dict(data="[DONE]")

        chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks  # type: ignore

        return EventSourceResponse(
            server_sent_events(chunks),
        )
    completion: llama_cpp.ChatCompletion = completion_or_chunks  # type: ignore
    return completion


# Shape of one entry in the model listing returned by get_models().
class ModelData(TypedDict):
    # Model identifier; get_models() fills this with the loaded model's path.
    id: str
    # Always the literal string "model".
    object: Literal["model"]
    owned_by: str
    permissions: List[str]


# Shape of the GET /v1/models response: a list wrapper around ModelData entries.
class ModelList(TypedDict):
    # Always the literal string "list".
    object: Literal["list"]
    data: List[ModelData]


# Response model derived from the ModelList TypedDict.
GetModelResponse = create_model_from_typeddict(ModelList)


@app.get("/v1/models", response_model=GetModelResponse)
def get_models() -> ModelList:
    # Lists the single model this server has loaded. The module-level `llama`
    # is read directly here (not through get_llama), so llama_lock is not
    # taken; `llama` is None until init_llama() has run.
    loaded_model: ModelData = {
        "id": llama.model_path,
        "object": "model",
        "owned_by": "me",
        "permissions": [],
    }
    return {"object": "list", "data": [loaded_model]}
llama_cpp server: app is now importable, still runnable as a module 2023-04-29 05:43:37 +00:00			`import os`
			`import json`
			`from threading import Lock`
Fix import error 2023-05-01 19:11:15 +00:00			`from typing import List, Optional, Union, Iterator, Dict`
			`from typing_extensions import TypedDict, Literal`
llama_cpp server: app is now importable, still runnable as a module 2023-04-29 05:43:37 +00:00
			`import llama_cpp`

			`from fastapi import Depends, FastAPI`
			`from fastapi.middleware.cors import CORSMiddleware`
			`from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict`
			`from sse_starlette.sse import EventSourceResponse`


			`class Settings(BaseSettings):`
llama_cpp server: slight refactor to init_llama function Define an init_llama function that starts llama with supplied settings instead of just doing it in the global context of app.py This allows the test to be less brittle by not needing to mess with os.environ, then importing the app 2023-04-29 06:47:36 +00:00			`model: str = os.environ.get("MODEL", "null")`
llama_cpp server: app is now importable, still runnable as a module 2023-04-29 05:43:37 +00:00			`n_ctx: int = 2048`
			`n_batch: int = 512`
			`n_threads: int = max((os.cpu_count() or 2) // 2, 1)`
			`f16_kv: bool = True`
			`use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out...`
			`use_mmap: bool = True`
			`embedding: bool = True`
			`last_n_tokens_size: int = 64`
			`logits_all: bool = False`
			`cache: bool = False # WARNING: This is an experimental feature`
tests: simple test for server module 2023-04-29 06:26:07 +00:00			`vocab_only: bool = False`
llama_cpp server: app is now importable, still runnable as a module 2023-04-29 05:43:37 +00:00

			`app = FastAPI(`
			`title="🦙 llama.cpp Python API",`
			`version="0.0.1",`
			`)`
			`app.add_middleware(`
			`CORSMiddleware,`
			`allow_origins=["*"],`
			`allow_credentials=True,`
			`allow_methods=["*"],`
			`allow_headers=["*"],`
			`)`

llama_cpp server: slight refactor to init_llama function Define an init_llama function that starts llama with supplied settings instead of just doing it in the global context of app.py This allows the test to be less brittle by not needing to mess with os.environ, then importing the app 2023-04-29 06:47:36 +00:00			`llama: llama_cpp.Llama = None`
			`def init_llama(settings: Settings = None):`
			`if settings is None:`
			`settings = Settings()`
			`global llama`
			`llama = llama_cpp.Llama(`
			`settings.model,`
			`f16_kv=settings.f16_kv,`
			`use_mlock=settings.use_mlock,`
			`use_mmap=settings.use_mmap,`
			`embedding=settings.embedding,`
			`logits_all=settings.logits_all,`
			`n_threads=settings.n_threads,`
			`n_batch=settings.n_batch,`
			`n_ctx=settings.n_ctx,`
			`last_n_tokens_size=settings.last_n_tokens_size,`
			`vocab_only=settings.vocab_only,`
			`)`
			`if settings.cache:`
			`cache = llama_cpp.LlamaCache()`
			`llama.set_cache(cache)`
llama_cpp server: app is now importable, still runnable as a module 2023-04-29 05:43:37 +00:00
llama_cpp server: slight refactor to init_llama function Define an init_llama function that starts llama with supplied settings instead of just doing it in the global context of app.py This allows the test to be less brittle by not needing to mess with os.environ, then importing the app 2023-04-29 06:47:36 +00:00			`llama_lock = Lock()`
llama_cpp server: app is now importable, still runnable as a module 2023-04-29 05:43:37 +00:00			`def get_llama():`
			`with llama_lock:`
			`yield llama`

llama_cpp server: mark model as required `model` is ignored, but currently marked "optional"... on the one hand could mark "required" to make it explicit in case the server supports multiple llama's at the same time, but also could delete it since its ignored. decision: mark it required for the sake of openai api compatibility. I think out of all parameters, `model` is probably the most important one for people to keep using even if its ignored for now. 2023-04-29 07:47:35 +00:00			`model_field = Field(`
			`description="The model to use for generating completions."`
			`)`

llama_cpp server: app is now importable, still runnable as a module 2023-04-29 05:43:37 +00:00			`class CreateCompletionRequest(BaseModel):`
llama_cpp server: add some more information to fields for completions 2023-04-29 21:37:36 +00:00			`prompt: Union[str, List[str]] = Field(`
			`default="",`
			`description="The prompt to generate completions for."`
			`)`
			`suffix: Optional[str] = Field(`
			`default=None,`
			`description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots."`
			`)`
			`max_tokens: int = Field(`
			`default=16,`
			`ge=1,`
			`le=2048,`
			`description="The maximum number of tokens to generate."`
			`)`
			`temperature: float = Field(`
			`default=0.8,`
			`ge=0.0,`
			`le=2.0,`
			`description="Adjust the randomness of the generated text.\n\n" +`
			"Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run."
			`)`
			`top_p: float = Field(`
			`default=0.95,`
			`ge=0.0,`
			`le=1.0,`
			`description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" +`
			`"Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text."`
			`)`
			`echo: bool = Field(`
			`default=False,`
			`description="Whether to echo the prompt in the generated text. Useful for chatbots."`
			`)`
			`stop: Optional[List[str]] = Field(`
			`default=None,`
			`description="A list of tokens at which to stop generation. If None, no stop tokens are used."`
			`)`
			`stream: bool = Field(`
			`default=False,`
			`description="Whether to stream the results as they are generated. Useful for chatbots."`
			`)`
			`logprobs: Optional[int] = Field(`
			`default=None,`
			`ge=0,`
			`description="The number of logprobs to generate. If None, no logprobs are generated."`
			`)`


llama_cpp server: app is now importable, still runnable as a module 2023-04-29 05:43:37 +00:00
llama_cpp server: mark model as required `model` is ignored, but currently marked "optional"... on the one hand could mark "required" to make it explicit in case the server supports multiple llama's at the same time, but also could delete it since its ignored. decision: mark it required for the sake of openai api compatibility. I think out of all parameters, `model` is probably the most important one for people to keep using even if its ignored for now. 2023-04-29 07:47:35 +00:00			`# ignored, but marked as required for the sake of compatibility with openai's api`
			`model: str = model_field`

llama_cpp server: app is now importable, still runnable as a module 2023-04-29 05:43:37 +00:00			`# llama.cpp specific parameters`
llama_cpp server: add some more information to fields for completions 2023-04-29 21:37:36 +00:00			`top_k: int = Field(`
			`default=40,`
			`ge=0,`
			`description="Limit the next token selection to the K most probable tokens.\n\n" +`
			`"Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text."`
			`)`
			`repeat_penalty: float = Field(`
			`default=1.0,`
			`ge=0.0,`
			`description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" +`
			`"Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient."`
			`)`
llama_cpp server: app is now importable, still runnable as a module 2023-04-29 05:43:37 +00:00
			`class Config:`
			`schema_extra = {`
			`"example": {`
			`"prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",`
			`"stop": ["\n", "###"],`
			`}`
			`}`


			`CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)`


			`@app.post(`
			`"/v1/completions",`
			`response_model=CreateCompletionResponse,`
			`)`
			`def create_completion(`
			`request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama)`
			`):`
			`if isinstance(request.prompt, list):`
			`request.prompt = "".join(request.prompt)`

			`completion_or_chunks = llama(`
			`**request.dict(`
			`exclude={`
llama_cpp server: delete some ignored / unused parameters `n`, `presence_penalty`, `frequency_penalty`, `best_of`, `logit_bias`, `user`: not supported, excluded from the calls into llama. decision: delete it 2023-04-29 08:19:30 +00:00			`"model"`
llama_cpp server: app is now importable, still runnable as a module 2023-04-29 05:43:37 +00:00			`}`
			`)`
			`)`
			`if request.stream:`
			`chunks: Iterator[llama_cpp.CompletionChunk] = completion_or_chunks # type: ignore`
			`return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks)`
			`completion: llama_cpp.Completion = completion_or_chunks # type: ignore`
			`return completion`


			`class CreateEmbeddingRequest(BaseModel):`
llama_cpp server: mark model as required `model` is ignored, but currently marked "optional"... on the one hand could mark "required" to make it explicit in case the server supports multiple llama's at the same time, but also could delete it since its ignored. decision: mark it required for the sake of openai api compatibility. I think out of all parameters, `model` is probably the most important one for people to keep using even if its ignored for now. 2023-04-29 07:47:35 +00:00			`# ignored, but marked as required for the sake of compatibility with openai's api`
			`model: str = model_field`
llama_cpp server: app is now importable, still runnable as a module 2023-04-29 05:43:37 +00:00			`input: str`

			`class Config:`
			`schema_extra = {`
			`"example": {`
			`"input": "The food was delicious and the waiter...",`
			`}`
			`}`


			`CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)`


			`@app.post(`
			`"/v1/embeddings",`
			`response_model=CreateEmbeddingResponse,`
			`)`
			`def create_embedding(`
			`request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)`
			`):`
llama_cpp server: delete some ignored / unused parameters `n`, `presence_penalty`, `frequency_penalty`, `best_of`, `logit_bias`, `user`: not supported, excluded from the calls into llama. decision: delete it 2023-04-29 08:19:30 +00:00			`return llama.create_embedding(**request.dict(exclude={"model"}))`
llama_cpp server: app is now importable, still runnable as a module 2023-04-29 05:43:37 +00:00

			`class ChatCompletionRequestMessage(BaseModel):`
			`role: Union[Literal["system"], Literal["user"], Literal["assistant"]]`
			`content: str`
			`user: Optional[str] = None`


			`class CreateChatCompletionRequest(BaseModel):`
			`model: Optional[str]`
			`messages: List[ChatCompletionRequestMessage]`
			`temperature: float = 0.8`
			`top_p: float = 0.95`
			`stream: bool = False`
			`stop: Optional[List[str]] = []`
			`max_tokens: int = 128`

llama_cpp server: mark model as required `model` is ignored, but currently marked "optional"... on the one hand could mark "required" to make it explicit in case the server supports multiple llama's at the same time, but also could delete it since its ignored. decision: mark it required for the sake of openai api compatibility. I think out of all parameters, `model` is probably the most important one for people to keep using even if its ignored for now. 2023-04-29 07:47:35 +00:00			`# ignored, but marked as required for the sake of compatibility with openai's api`
			`model: str = model_field`
llama_cpp server: app is now importable, still runnable as a module 2023-04-29 05:43:37 +00:00
			`# llama.cpp specific parameters`
llama_cpp server: add missing top_k param to CreateChatCompletionRequest `llama.create_chat_completion` definitely has a `top_k` argument, but its missing from `CreateChatCompletionRequest`. decision: add it 2023-04-29 18:52:20 +00:00			`top_k: int = 40,`
llama_cpp server: app is now importable, still runnable as a module 2023-04-29 05:43:37 +00:00			`repeat_penalty: float = 1.1`

			`class Config:`
			`schema_extra = {`
			`"example": {`
			`"messages": [`
			`ChatCompletionRequestMessage(`
			`role="system", content="You are a helpful assistant."`
			`),`
			`ChatCompletionRequestMessage(`
			`role="user", content="What is the capital of France?"`
			`),`
			`]`
			`}`
			`}`


			`CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion)`


			`@app.post(`
			`"/v1/chat/completions",`
			`response_model=CreateChatCompletionResponse,`
			`)`
			`def create_chat_completion(`
			`request: CreateChatCompletionRequest,`
			`llama: llama_cpp.Llama = Depends(get_llama),`
			`) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:`
			`completion_or_chunks = llama.create_chat_completion(`
			`**request.dict(`
			`exclude={`
llama_cpp server: delete some ignored / unused parameters `n`, `presence_penalty`, `frequency_penalty`, `best_of`, `logit_bias`, `user`: not supported, excluded from the calls into llama. decision: delete it 2023-04-29 08:19:30 +00:00			`"model"`
llama_cpp server: app is now importable, still runnable as a module 2023-04-29 05:43:37 +00:00			`}`
			`),`
			`)`

			`if request.stream:`

			`async def server_sent_events(`
			`chat_chunks: Iterator[llama_cpp.ChatCompletionChunk],`
			`):`
			`for chat_chunk in chat_chunks:`
			`yield dict(data=json.dumps(chat_chunk))`
			`yield dict(data="[DONE]")`

			`chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore`

			`return EventSourceResponse(`
			`server_sent_events(chunks),`
			`)`
			`completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore`
			`return completion`


			`class ModelData(TypedDict):`
			`id: str`
			`object: Literal["model"]`
			`owned_by: str`
			`permissions: List[str]`


			`class ModelList(TypedDict):`
			`object: Literal["list"]`
			`data: List[ModelData]`


			`GetModelResponse = create_model_from_typeddict(ModelList)`


			`@app.get("/v1/models", response_model=GetModelResponse)`
			`def get_models() -> ModelList:`
			`return {`
			`"object": "list",`
			`"data": [`
			`{`
			`"id": llama.model_path,`
			`"object": "model",`
			`"owned_by": "me",`
			`"permissions": [],`
			`}`
			`],`
			`}`