From e40fcb05754d0ec9c65359e245a436794cbfefdb Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 00:47:35 -0700 Subject: [PATCH 01/61] llama_cpp server: mark model as required `model` is ignored, but currently marked "optional"... on the one hand could mark "required" to make it explicit in case the server supports multiple llama's at the same time, but also could delete it since its ignored. decision: mark it required for the sake of openai api compatibility. I think out of all parameters, `model` is probably the most important one for people to keep using even if its ignored for now. --- llama_cpp/server/app.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 640dd3f..5d87e78 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -66,6 +66,10 @@ def get_llama(): with llama_lock: yield llama +model_field = Field( + description="The model to use for generating completions." +) + class CreateCompletionRequest(BaseModel): prompt: Union[str, List[str]] suffix: Optional[str] = Field(None) @@ -76,8 +80,9 @@ class CreateCompletionRequest(BaseModel): stop: Optional[List[str]] = [] stream: bool = False - # ignored or currently unsupported - model: Optional[str] = Field(None) + # ignored, but marked as required for the sake of compatibility with openai's api + model: str = model_field + n: Optional[int] = 1 logprobs: Optional[int] = Field(None) presence_penalty: Optional[float] = 0 @@ -133,7 +138,8 @@ def create_completion( class CreateEmbeddingRequest(BaseModel): - model: Optional[str] + # ignored, but marked as required for the sake of compatibility with openai's api + model: str = model_field input: str user: Optional[str] @@ -173,8 +179,9 @@ class CreateChatCompletionRequest(BaseModel): stop: Optional[List[str]] = [] max_tokens: int = 128 - # ignored or currently unsupported - model: Optional[str] = Field(None) + # ignored, but marked as required for the sake of compatibility with openai's api + model: str = model_field + n: Optional[int] = 1 presence_penalty: Optional[float] = 0 frequency_penalty: Optional[float] = 0 From b47b9549d57f146a00ee19cd7d2bb294111abb67 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 01:19:30 -0700 Subject: [PATCH 02/61] llama_cpp server: delete some ignored / unused parameters `n`, `presence_penalty`, `frequency_penalty`, `best_of`, `logit_bias`, `user`: not supported, excluded from the calls into llama. 
decision: delete it --- llama_cpp/llama_types.py | 2 -- llama_cpp/server/app.py | 30 +++--------------------------- 2 files changed, 3 insertions(+), 29 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index b62ff1b..b8bdb08 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -60,8 +60,6 @@ class Completion(TypedDict): class ChatCompletionMessage(TypedDict): role: Union[Literal["assistant"], Literal["user"], Literal["system"]] content: str - user: NotRequired[str] - class ChatCompletionChoice(TypedDict): index: int diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 5d87e78..cc467db 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -83,13 +83,7 @@ class CreateCompletionRequest(BaseModel): # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field - n: Optional[int] = 1 logprobs: Optional[int] = Field(None) - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - best_of: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) # llama.cpp specific parameters top_k: int = 40 @@ -120,13 +114,7 @@ def create_completion( completion_or_chunks = llama( **request.dict( exclude={ - "model", - "n", - "frequency_penalty", - "presence_penalty", - "best_of", - "logit_bias", - "user", + "model" } ) ) @@ -141,7 +129,6 @@ class CreateEmbeddingRequest(BaseModel): # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field input: str - user: Optional[str] class Config: schema_extra = { @@ -161,7 +148,7 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): - return llama.create_embedding(**request.dict(exclude={"model", "user"})) + return llama.create_embedding(**request.dict(exclude={"model"})) class ChatCompletionRequestMessage(BaseModel): @@ -181,12 +168,6 @@ class CreateChatCompletionRequest(BaseModel): # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field - - n: Optional[int] = 1 - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) # llama.cpp specific parameters repeat_penalty: float = 1.1 @@ -220,12 +201,7 @@ def create_chat_completion( completion_or_chunks = llama.create_chat_completion( **request.dict( exclude={ - "model", - "n", - "presence_penalty", - "frequency_penalty", - "logit_bias", - "user", + "model" } ), ) From 1e429135993f4e1298d8c801f2628bae3d8f18a9 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 11:48:49 -0700 Subject: [PATCH 03/61] llama_cpp server: move logprobs to supported I think this is actually supported (its in the arguments of `LLama.__call__`, which is how the completion is invoked). 
decision: mark as supported --- llama_cpp/server/app.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index cc467db..2d20f37 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -79,12 +79,11 @@ class CreateCompletionRequest(BaseModel): echo: bool = False stop: Optional[List[str]] = [] stream: bool = False + logprobs: Optional[int] = Field(None) # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field - logprobs: Optional[int] = Field(None) - # llama.cpp specific parameters top_k: int = 40 repeat_penalty: float = 1.1 From a5aa6c1478de7cc16b654df533be3dee6519c42a Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 11:52:20 -0700 Subject: [PATCH 04/61] llama_cpp server: add missing top_k param to CreateChatCompletionRequest `llama.create_chat_completion` definitely has a `top_k` argument, but its missing from `CreateChatCompletionRequest`. decision: add it --- llama_cpp/server/app.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 2d20f37..e1045af 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -169,6 +169,7 @@ class CreateChatCompletionRequest(BaseModel): model: str = model_field # llama.cpp specific parameters + top_k: int = 40, repeat_penalty: float = 1.1 class Config: From 978b6daf9313a11367d0a9393226379173fdb688 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 14:37:36 -0700 Subject: [PATCH 05/61] llama_cpp server: add some more information to fields for completions --- llama_cpp/server/app.py | 70 ++++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index e1045af..e168485 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -71,22 +71,70 @@ model_field = Field( ) class CreateCompletionRequest(BaseModel): - prompt: Union[str, List[str]] - suffix: Optional[str] = Field(None) - max_tokens: int = 16 - temperature: float = 0.8 - top_p: float = 0.95 - echo: bool = False - stop: Optional[List[str]] = [] - stream: bool = False - logprobs: Optional[int] = Field(None) + prompt: Union[str, List[str]] = Field( + default="", + description="The prompt to generate completions for." + ) + suffix: Optional[str] = Field( + default=None, + description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots." + ) + max_tokens: int = Field( + default=16, + ge=1, + le=2048, + description="The maximum number of tokens to generate." + ) + temperature: float = Field( + default=0.8, + ge=0.0, + le=2.0, + description="Adjust the randomness of the generated text.\n\n" + + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run." 
+ ) + top_p: float = Field( + default=0.95, + ge=0.0, + le=1.0, + description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text." + ) + echo: bool = Field( + default=False, + description="Whether to echo the prompt in the generated text. Useful for chatbots." + ) + stop: Optional[List[str]] = Field( + default=None, + description="A list of tokens at which to stop generation. If None, no stop tokens are used." + ) + stream: bool = Field( + default=False, + description="Whether to stream the results as they are generated. Useful for chatbots." + ) + logprobs: Optional[int] = Field( + default=None, + ge=0, + description="The number of logprobs to generate. If None, no logprobs are generated." + ) + + # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field # llama.cpp specific parameters - top_k: int = 40 - repeat_penalty: float = 1.1 + top_k: int = Field( + default=40, + ge=0, + description="Limit the next token selection to the K most probable tokens.\n\n" + + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text." + ) + repeat_penalty: float = Field( + default=1.0, + ge=0.0, + description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient." + ) class Config: schema_extra = { From 8dcbf65a45d729eedb4363f4e92247e6325d5b7d Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 18:37:43 -0700 Subject: [PATCH 06/61] llama_cpp server: define fields for chat completions Slight refactor for common fields shared between completion and chat completion --- llama_cpp/server/app.py | 125 +++++++++++++++++++++++----------------- 1 file changed, 71 insertions(+), 54 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index e168485..ec5dbd3 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -70,6 +70,55 @@ model_field = Field( description="The model to use for generating completions." ) +max_tokens_field = Field( + default=16, + ge=1, + le=2048, + description="The maximum number of tokens to generate." 
+) + +temperature_field = Field( + default=0.8, + ge=0.0, + le=2.0, + description="Adjust the randomness of the generated text.\n\n" + + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run." +) + +top_p_field = Field( + default=0.95, + ge=0.0, + le=1.0, + description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text." +) + +stop_field = Field( + default=None, + description="A list of tokens at which to stop generation. If None, no stop tokens are used." +) + +stream_field = Field( + default=False, + description="Whether to stream the results as they are generated. Useful for chatbots." +) + +top_k_field = Field( + default=40, + ge=0, + description="Limit the next token selection to the K most probable tokens.\n\n" + + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text." +) + +repeat_penalty_field = Field( + default=1.0, + ge=0.0, + description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient." +) + + + class CreateCompletionRequest(BaseModel): prompt: Union[str, List[str]] = Field( default="", @@ -79,62 +128,27 @@ class CreateCompletionRequest(BaseModel): default=None, description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots." ) - max_tokens: int = Field( - default=16, - ge=1, - le=2048, - description="The maximum number of tokens to generate." - ) - temperature: float = Field( - default=0.8, - ge=0.0, - le=2.0, - description="Adjust the randomness of the generated text.\n\n" + - "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. 
A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run." - ) - top_p: float = Field( - default=0.95, - ge=0.0, - le=1.0, - description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + - "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text." - ) + max_tokens: int = max_tokens_field + temperature: float = temperature_field + top_p: float = top_p_field echo: bool = Field( default=False, description="Whether to echo the prompt in the generated text. Useful for chatbots." ) - stop: Optional[List[str]] = Field( - default=None, - description="A list of tokens at which to stop generation. If None, no stop tokens are used." - ) - stream: bool = Field( - default=False, - description="Whether to stream the results as they are generated. Useful for chatbots." - ) + stop: Optional[List[str]] = stop_field + stream: bool = stream_field logprobs: Optional[int] = Field( default=None, ge=0, description="The number of logprobs to generate. If None, no logprobs are generated." ) - - # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field # llama.cpp specific parameters - top_k: int = Field( - default=40, - ge=0, - description="Limit the next token selection to the K most probable tokens.\n\n" + - "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text." - ) - repeat_penalty: float = Field( - default=1.0, - ge=0.0, - description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + - "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient." - ) + top_k: int = top_k_field + repeat_penalty: float = repeat_penalty_field class Config: schema_extra = { @@ -199,26 +213,29 @@ def create_embedding( class ChatCompletionRequestMessage(BaseModel): - role: Union[Literal["system"], Literal["user"], Literal["assistant"]] - content: str - user: Optional[str] = None + role: Union[Literal["system"], Literal["user"], Literal["assistant"]] = Field( + default=Literal["user"], description="The role of the message." 
+ ) + content: str = Field(default="", description="The content of the message.") class CreateChatCompletionRequest(BaseModel): - model: Optional[str] - messages: List[ChatCompletionRequestMessage] - temperature: float = 0.8 - top_p: float = 0.95 - stream: bool = False - stop: Optional[List[str]] = [] - max_tokens: int = 128 + messages: List[ChatCompletionRequestMessage] = Field( + default=[], + description="A list of messages to generate completions for." + ) + max_tokens: int = max_tokens_field + temperature: float = temperature_field + top_p: float = top_p_field + stop: Optional[List[str]] = stop_field + stream: bool = stream_field # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field # llama.cpp specific parameters - top_k: int = 40, - repeat_penalty: float = 1.1 + top_k: int = top_k_field + repeat_penalty: float = repeat_penalty_field class Config: schema_extra = { From fa2a61e06569bb600d36d7ea5fee2ab456b3434d Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 18:46:01 -0700 Subject: [PATCH 07/61] llama_cpp server: fields for the embedding endpoint --- llama_cpp/server/app.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ec5dbd3..9adddcd 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -189,7 +189,9 @@ def create_completion( class CreateEmbeddingRequest(BaseModel): # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field - input: str + input: str = Field( + description="The input to embed." + ) class Config: schema_extra = { From dbbfc4ba2f8460e130dc268096f5906d3d22347b Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Mon, 1 May 2023 11:48:37 -0700 Subject: [PATCH 08/61] llama_cpp server: fix to ChatCompletionRequestMessage When I generate a client, it breaks because it fails to process the schema of ChatCompletionRequestMessage These fix that: - I think `Union[Literal["user"], Literal["channel"], ...]` is the same as Literal["user", "channel", ...] - Turns out default value `Literal["user"]` isn't JSON serializable, so replace with "user" --- llama_cpp/llama_types.py | 2 +- llama_cpp/server/app.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index b8bdb08..b770a01 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -58,7 +58,7 @@ class Completion(TypedDict): class ChatCompletionMessage(TypedDict): - role: Union[Literal["assistant"], Literal["user"], Literal["system"]] + role: Literal["assistant", "user", "system"] content: str class ChatCompletionChoice(TypedDict): diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 9adddcd..886ee6d 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -215,8 +215,8 @@ def create_embedding( class ChatCompletionRequestMessage(BaseModel): - role: Union[Literal["system"], Literal["user"], Literal["assistant"]] = Field( - default=Literal["user"], description="The role of the message." + role: Literal["system", "user", "assistant"] = Field( + default="user", description="The role of the message." 
) content: str = Field(default="", description="The content of the message.") From b9098b0ef7309b63ebff99cdfadf641223c15025 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Tue, 2 May 2023 14:08:51 -0700 Subject: [PATCH 09/61] llama_cpp server: prompt is a string Not sure why this union type was here but taking a look at llama.py, prompt is only ever processed as a string for completion This was breaking types when generating an openapi client --- llama_cpp/server/app.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ef8aa4e..595476f 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -126,7 +126,7 @@ repeat_penalty_field = Field( ) class CreateCompletionRequest(BaseModel): - prompt: Union[str, List[str]] = Field( + prompt: Optional[str] = Field( default="", description="The prompt to generate completions for." ) @@ -175,9 +175,6 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) def create_completion( request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama) ): - if isinstance(request.prompt, list): - request.prompt = "".join(request.prompt) - completion_or_chunks = llama( **request.dict( exclude={ From 6d3c20e39dbae1a0c89e1ce6d5bec076b102f2e6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 May 2023 22:20:53 -0400 Subject: [PATCH 10/61] Add CUDA docker image build to github actions --- .github/workflows/build-docker.yaml | 38 +++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 44196f1..8ffa45f 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -37,3 +37,41 @@ jobs: pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 tags: ghcr.io/abetlen/llama-cpp-python:latest + + docker-cuda: + name: Build and push Docker CUDA image + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: "true" + + - name: Setup CUDA 12.1 + uses: Jimver/cuda-toolkit@v0.2.10 + id: cuda-toolkit + with: + cuda: '12.1.0' + + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v4 + with: + file: Dockerfile.cuda + context: . 
+ push: true # push to registry + pull: true # always fetch the latest base images + platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 + tags: ghcr.io/abetlen/llama-cpp-python-cuda:latest From 0607f6578efe03c7b8894d2ed5f71eaf03473c55 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 May 2023 23:22:16 -0400 Subject: [PATCH 11/61] Use network installer for cuda --- .github/workflows/build-docker.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 8ffa45f..2ec5c0d 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -52,6 +52,7 @@ jobs: id: cuda-toolkit with: cuda: '12.1.0' + method: network - name: Set up QEMU uses: docker/setup-qemu-action@v2 From d594892fd425cb41b30e4cb31e3aa5ef1c16e681 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 00:02:46 -0400 Subject: [PATCH 12/61] Remove Docker CUDA build job --- .github/workflows/build-docker.yaml | 41 +---------------------------- 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 2ec5c0d..16b00a2 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -36,43 +36,4 @@ jobs: push: true # push to registry pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 - tags: ghcr.io/abetlen/llama-cpp-python:latest - - docker-cuda: - name: Build and push Docker CUDA image - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: "true" - - - name: Setup CUDA 12.1 - uses: Jimver/cuda-toolkit@v0.2.10 - id: cuda-toolkit - with: - cuda: '12.1.0' - method: network - - - name: Set up QEMU - uses: docker/setup-qemu-action@v2 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build and push - uses: docker/build-push-action@v4 - with: - file: Dockerfile.cuda - context: . 
- push: true # push to registry - pull: true # always fetch the latest base images - platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 - tags: ghcr.io/abetlen/llama-cpp-python-cuda:latest + tags: ghcr.io/abetlen/llama-cpp-python:latest \ No newline at end of file From 329297fafb4916951cf1c3146505a9501e986d95 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 12:18:40 -0400 Subject: [PATCH 13/61] Bugfix: Missing logits_to_logprobs --- llama_cpp/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index fef7b3e..8cd77ee 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -639,7 +639,7 @@ class Llama: self.detokenize([token]).decode("utf-8", errors="ignore") for token in all_tokens ] - all_logprobs = [Llama._logits_to_logprobs(row) for row in self.eval_logits] + all_logprobs = [Llama.logits_to_logprobs(list(map(float, row))) for row in self.eval_logits] for token, token_str, logprobs_token in zip( all_tokens, all_token_strs, all_logprobs ): @@ -985,7 +985,7 @@ class Llama: return llama_cpp.llama_token_bos() @staticmethod - def logits_to_logprobs(logits: List[llama_cpp.c_float]) -> List[llama_cpp.c_float]: + def logits_to_logprobs(logits: List[float]) -> List[float]: exps = [math.exp(float(x)) for x in logits] sum_exps = sum(exps) - return [llama_cpp.c_float(math.log(x / sum_exps)) for x in exps] + return [math.log(x / sum_exps) for x in exps] From d78cec67df876221471782e7e1fbe62abf48ee25 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 12:20:25 -0400 Subject: [PATCH 14/61] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e216aa0..2edbdb0 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e216aa04633892b972d013719e38b59fd4917341 +Subproject commit 2edbdb0f99336cb41f0995061c7602ed54beb863 From cabd8b8ed1ee45a19baa9436668898bbe9471492 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 12:21:20 -0400 Subject: [PATCH 15/61] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 64f7a0d..2dab374 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.41" +version = "0.1.42" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index f7f0fa4..0a52826 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.41", + version="0.1.42", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 501321875f449594c249cdbbc9b48208fbce4bde Mon Sep 17 00:00:00 2001 From: Thomas Neu <81517187+th-neu@users.noreply.github.com> Date: Thu, 4 May 2023 21:03:19 +0200 Subject: [PATCH 16/61] Slim-Bullseye based docker image ends up at ~669MB --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 14fb3be..f58506f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3-bullseye +FROM python:3-slim-bullseye # We need to set the host to 0.0.0.0 to allow outside access ENV HOST 0.0.0.0 @@ -6,10 +6,10 @@ ENV HOST 0.0.0.0 COPY . . 
# Install the package -RUN apt update && apt install -y libopenblas-dev +RUN apt update && apt install -y libopenblas-dev ninja-build build-essential RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette RUN LLAMA_OPENBLAS=1 python3 setup.py develop # Run the server -CMD python3 -m llama_cpp.server \ No newline at end of file +CMD python3 -m llama_cpp.server From 97c6372350c57a4fffb6072cb299e5a9bd8b38dc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 21:58:27 -0400 Subject: [PATCH 17/61] Rewind model to longest prefix. --- llama_cpp/llama.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 8cd77ee..7a8c25b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -390,18 +390,28 @@ class Llama: """ assert self.ctx is not None - if ( - reset - and len(self.eval_tokens) > 0 - and tuple(self.eval_tokens) == tuple(tokens[: len(self.eval_tokens)]) - ): - if self.verbose: - print("Llama.generate: cache hit", file=sys.stderr) - reset = False - tokens = tokens[len(self.eval_tokens) :] + if reset and len(self.eval_tokens) > 0: + longest_prefix = 0 + for a, b in zip(self.eval_tokens, tokens[:-1]): + if a == b: + longest_prefix += 1 + else: + break + if longest_prefix > 0: + if self.verbose: + print("Llama.generate: prefix-match hit", file=sys.stderr) + reset = False + tokens = tokens[longest_prefix:] + for _ in range(len(self.eval_tokens) - longest_prefix): + self.eval_tokens.pop() + try: + self.eval_logits.pop() + except IndexError: + pass if reset: self.reset() + while True: self.eval(tokens) token = self.sample( From 853dc711cc5507ca119cb822f459cd16c9021f15 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 21:58:36 -0400 Subject: [PATCH 18/61] Format --- llama_cpp/llama.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7a8c25b..32d5424 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -649,7 +649,10 @@ class Llama: self.detokenize([token]).decode("utf-8", errors="ignore") for token in all_tokens ] - all_logprobs = [Llama.logits_to_logprobs(list(map(float, row))) for row in self.eval_logits] + all_logprobs = [ + Llama.logits_to_logprobs(list(map(float, row))) + for row in self.eval_logits + ] for token, token_str, logprobs_token in zip( all_tokens, all_token_strs, all_logprobs ): @@ -968,7 +971,10 @@ class Llama: llama_state_compact = (llama_cpp.c_uint8 * int(n_bytes))() llama_cpp.ctypes.memmove(llama_state_compact, llama_state, int(n_bytes)) if self.verbose: - print(f"Llama.save_state: saving {n_bytes} bytes of llama state", file=sys.stderr) + print( + f"Llama.save_state: saving {n_bytes} bytes of llama state", + file=sys.stderr, + ) return LlamaState( eval_tokens=self.eval_tokens.copy(), eval_logits=self.eval_logits.copy(), From 5c165a85da5a340aca85a44e2282db2e5f729463 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 21:59:37 -0400 Subject: [PATCH 19/61] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2dab374..ca0346f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.42" +version = "0.1.43" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 
0a52826..405886a 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.42", + version="0.1.43", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 952ba9ecaf7a78be1844a1c533d6f6f580b92833 Mon Sep 17 00:00:00 2001 From: Thomas Neu <81517187+th-neu@users.noreply.github.com> Date: Fri, 5 May 2023 14:21:57 +0200 Subject: [PATCH 20/61] Update README.md add windows server commad --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index a8afa67..ee6ec2d 100644 --- a/README.md +++ b/README.md @@ -64,12 +64,20 @@ This allows you to use llama.cpp compatible models with any OpenAI compatible cl To install the server package and get started: +Linux ```bash pip install llama-cpp-python[server] export MODEL=./models/7B/ggml-model.bin python3 -m llama_cpp.server ``` +Windows +```cmd +pip install llama-cpp-python[server] +SET MODEL=\models\7B\ggml-model.bin +python3 -m llama_cpp.server +``` + Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. ## Docker image From eb54e30f343251767ec0a2cb10da2684b896718f Mon Sep 17 00:00:00 2001 From: Thomas Neu <81517187+th-neu@users.noreply.github.com> Date: Fri, 5 May 2023 14:22:41 +0200 Subject: [PATCH 21/61] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ee6ec2d..d24bad5 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ python3 -m llama_cpp.server Windows ```cmd pip install llama-cpp-python[server] -SET MODEL=\models\7B\ggml-model.bin +SET MODEL=..\models\7B\ggml-model.bin python3 -m llama_cpp.server ``` From 24fc38754b6da802ae5b32fb301e957868ec5e86 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 12:08:28 -0400 Subject: [PATCH 22/61] Add cli options to server. Closes #37 --- llama_cpp/server/__main__.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 4fbee37..5c9598a 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -22,12 +22,26 @@ Then visit http://localhost:8000/docs to see the interactive API docs. 
""" import os +import argparse + import uvicorn -from llama_cpp.server.app import create_app +from llama_cpp.server.app import create_app, Settings if __name__ == "__main__": - app = create_app() + parser = argparse.ArgumentParser() + for name, field in Settings.__fields__.items(): + parser.add_argument( + f"--{name}", + dest=name, + type=field.type_, + default=field.default, + help=field.field_info.description, + ) + + args = parser.parse_args() + settings = Settings(**vars(args)) + app = create_app(settings=settings) uvicorn.run( app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) From 5be0efa5f8f98f4b889ca9869e5005ecb5f195d2 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 12:21:49 -0400 Subject: [PATCH 23/61] Cache should raise KeyError when key is missing --- llama_cpp/llama.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 32d5424..4e03ed4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -33,12 +33,10 @@ class LlamaCache: return k return None - def __getitem__( - self, key: Sequence[llama_cpp.llama_token] - ) -> Optional["LlamaState"]: + def __getitem__(self, key: Sequence[llama_cpp.llama_token]) -> "LlamaState": _key = self._find_key(tuple(key)) if _key is None: - return None + raise KeyError(f"Key not found: {key}") return self.cache_state[_key] def __contains__(self, key: Sequence[llama_cpp.llama_token]) -> bool: From b6a9a0b6ba74c8b539e98ec31fc6558563b20c96 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 12:22:27 -0400 Subject: [PATCH 24/61] Add types for all low-level api functions --- llama_cpp/llama.py | 2 +- llama_cpp/llama_cpp.py | 81 +++++++++++++++++++++++++++++++----------- 2 files changed, 62 insertions(+), 21 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4e03ed4..c1c8847 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -52,7 +52,7 @@ class LlamaState: self, eval_tokens: Deque[llama_cpp.llama_token], eval_logits: Deque[List[llama_cpp.c_float]], - llama_state, + llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] llama_state_size: llama_cpp.c_size_t, ): self.eval_tokens = eval_tokens diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 30414f5..0a35445 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -17,7 +17,7 @@ import pathlib # Load the library -def _load_shared_library(lib_base_name): +def _load_shared_library(lib_base_name: str): # Determine the file extension based on the platform if sys.platform.startswith("linux"): lib_ext = ".so" @@ -252,7 +252,9 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. 
# Returns the number of bytes copied -def llama_copy_state_data(ctx: llama_context_p, dest) -> c_size_t: +def llama_copy_state_data( + ctx: llama_context_p, dest # type: Array[c_uint8] +) -> c_size_t: return _lib.llama_copy_state_data(ctx, dest) @@ -262,7 +264,9 @@ _lib.llama_copy_state_data.restype = c_size_t # Set the state reading from the specified address # Returns the number of bytes read -def llama_set_state_data(ctx: llama_context_p, src) -> c_size_t: +def llama_set_state_data( + ctx: llama_context_p, src # type: Array[c_uint8] +) -> c_size_t: return _lib.llama_set_state_data(ctx, src) @@ -274,9 +278,9 @@ _lib.llama_set_state_data.restype = c_size_t def llama_load_session_file( ctx: llama_context_p, path_session: bytes, - tokens_out, + tokens_out, # type: Array[llama_token] n_token_capacity: c_size_t, - n_token_count_out, + n_token_count_out, # type: Array[c_size_t] ) -> c_size_t: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out @@ -294,7 +298,10 @@ _lib.llama_load_session_file.restype = c_size_t def llama_save_session_file( - ctx: llama_context_p, path_session: bytes, tokens, n_token_count: c_size_t + ctx: llama_context_p, + path_session: bytes, + tokens, # type: Array[llama_token] + n_token_count: c_size_t, ) -> c_size_t: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -433,8 +440,8 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates, - last_tokens_data, + candidates, # type: Array[llama_token_data] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, penalty: c_float, ): @@ -456,8 +463,8 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates, - last_tokens_data, + candidates, # type: Array[llama_token_data] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, @@ -484,7 +491,10 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
-def llama_sample_softmax(ctx: llama_context_p, candidates): +def llama_sample_softmax( + ctx: llama_context_p, + candidates # type: Array[llama_token_data] +): return _lib.llama_sample_softmax(ctx, candidates) @@ -497,7 +507,10 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( - ctx: llama_context_p, candidates, k: c_int, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + k: c_int, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -513,7 +526,10 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( - ctx: llama_context_p, candidates, p: c_float, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + p: c_float, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -529,7 +545,10 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( - ctx: llama_context_p, candidates, z: c_float, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + z: c_float, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -545,7 +564,10 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( - ctx: llama_context_p, candidates, p: c_float, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + p: c_float, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) @@ -559,7 +581,11 @@ _lib.llama_sample_typical.argtypes = [ _lib.llama_sample_typical.restype = None -def llama_sample_temperature(ctx: llama_context_p, candidates, temp: c_float): +def llama_sample_temperature( + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + temp: c_float +): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -578,7 +604,12 @@ _lib.llama_sample_temperature.restype = None # @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
def llama_sample_token_mirostat( - ctx: llama_context_p, candidates, tau: c_float, eta: c_float, m: c_int, mu + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + tau: c_float, + eta: c_float, + m: c_int, + mu # type: Array[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -600,7 +631,11 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( - ctx: llama_context_p, candidates, tau: c_float, eta: c_float, mu + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + tau: c_float, + eta: c_float, + mu # type: Array[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -616,7 +651,10 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. -def llama_sample_token_greedy(ctx: llama_context_p, candidates) -> llama_token: +def llama_sample_token_greedy( + ctx: llama_context_p, + candidates # type: Array[llama_token_data] +) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -628,7 +666,10 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. -def llama_sample_token(ctx: llama_context_p, candidates) -> llama_token: +def llama_sample_token( + ctx: llama_context_p, + candidates # type: Array[llama_token_data] +) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From 22c3056b2a8d19f2c5ce9ab817e312da21e66d9c Mon Sep 17 00:00:00 2001 From: Thomas Neu <81517187+th-neu@users.noreply.github.com> Date: Fri, 5 May 2023 18:40:00 +0200 Subject: [PATCH 25/61] Update README.md added MacOS --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d24bad5..c46fa11 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ This allows you to use llama.cpp compatible models with any OpenAI compatible cl To install the server package and get started: -Linux +Linux/MacOS ```bash pip install llama-cpp-python[server] export MODEL=./models/7B/ggml-model.bin From 5e7ddfc3d6933471ba503477c0513a8987db4d9a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 13:54:22 -0400 Subject: [PATCH 26/61] Fix llama_cpp types --- llama_cpp/llama_cpp.py | 74 +++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 41 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 0a35445..87d9249 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -8,6 +8,7 @@ from ctypes import ( c_void_p, c_bool, POINTER, + _Pointer, # type: ignore Structure, Array, c_uint8, @@ -252,9 +253,7 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. 
# Returns the number of bytes copied -def llama_copy_state_data( - ctx: llama_context_p, dest # type: Array[c_uint8] -) -> c_size_t: +def llama_copy_state_data(ctx: llama_context_p, dest: Array[c_uint8]) -> c_size_t: return _lib.llama_copy_state_data(ctx, dest) @@ -278,9 +277,9 @@ _lib.llama_set_state_data.restype = c_size_t def llama_load_session_file( ctx: llama_context_p, path_session: bytes, - tokens_out, # type: Array[llama_token] + tokens_out: Array[llama_token], n_token_capacity: c_size_t, - n_token_count_out, # type: Array[c_size_t] + n_token_count_out: _Pointer[c_size_t], ) -> c_size_t: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out @@ -300,7 +299,7 @@ _lib.llama_load_session_file.restype = c_size_t def llama_save_session_file( ctx: llama_context_p, path_session: bytes, - tokens, # type: Array[llama_token] + tokens: Array[llama_token], n_token_count: c_size_t, ) -> c_size_t: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -321,7 +320,7 @@ _lib.llama_save_session_file.restype = c_size_t # Returns 0 on success def llama_eval( ctx: llama_context_p, - tokens, # type: Array[llama_token] + tokens: Array[llama_token], n_tokens: c_int, n_past: c_int, n_threads: c_int, @@ -440,8 +439,8 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - last_tokens_data, # type: Array[llama_token] + candidates: _Pointer[llama_token_data], + last_tokens_data: Array[llama_token], last_tokens_size: c_int, penalty: c_float, ): @@ -463,8 +462,8 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - last_tokens_data, # type: Array[llama_token] + candidates: _Pointer[llama_token_data], + last_tokens_data: Array[llama_token], last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, @@ -491,10 +490,7 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
-def llama_sample_softmax( - ctx: llama_context_p, - candidates # type: Array[llama_token_data] -): +def llama_sample_softmax(ctx: llama_context_p, candidates: _Pointer[llama_token_data]): return _lib.llama_sample_softmax(ctx, candidates) @@ -507,10 +503,10 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( - ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - k: c_int, - min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates: _Pointer[llama_token_data], + k: c_int, + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -526,10 +522,10 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( - ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - p: c_float, - min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates: _Pointer[llama_token_data], + p: c_float, + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -546,9 +542,9 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], z: c_float, - min_keep: c_size_t = c_size_t(1) + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -565,9 +561,9 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - p: c_float, - min_keep: c_size_t = c_size_t(1) + candidates: _Pointer[llama_token_data], + p: c_float, + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) @@ -582,9 +578,7 @@ _lib.llama_sample_typical.restype = None def llama_sample_temperature( - ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - temp: c_float + ctx: llama_context_p, candidates: _Pointer[llama_token_data], temp: c_float ): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -605,11 +599,11 @@ _lib.llama_sample_temperature.restype = None # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], tau: c_float, - eta: c_float, + eta: c_float, m: c_int, - mu # type: Array[c_float] + mu: _Pointer[c_float], ) -> llama_token: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -632,10 +626,10 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - tau: c_float, + candidates: _Pointer[llama_token_data], + tau: c_float, eta: c_float, - mu # type: Array[c_float] + mu: _Pointer[c_float], ) -> llama_token: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -652,8 +646,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. def llama_sample_token_greedy( - ctx: llama_context_p, - candidates # type: Array[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data] ) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -667,8 +660,7 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. def llama_sample_token( - ctx: llama_context_p, - candidates # type: Array[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data] ) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From 6702d2abfdc313873931baa470b8b547dd825727 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:00:30 -0400 Subject: [PATCH 27/61] Fix candidates type --- llama_cpp/llama_cpp.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 87d9249..61b40f8 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -439,7 +439,7 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], last_tokens_data: Array[llama_token], last_tokens_size: c_int, penalty: c_float, @@ -462,7 +462,7 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], last_tokens_data: Array[llama_token], last_tokens_size: c_int, alpha_frequency: c_float, @@ -504,7 +504,7 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], k: c_int, min_keep: c_size_t = c_size_t(1), ): @@ -523,7 +523,7 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -542,7 +542,7 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. 
def llama_sample_tail_free( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], z: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -561,7 +561,7 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -578,7 +578,7 @@ _lib.llama_sample_typical.restype = None def llama_sample_temperature( - ctx: llama_context_p, candidates: _Pointer[llama_token_data], temp: c_float + ctx: llama_context_p, candidates: _Pointer[llama_token_data_array], temp: c_float ): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -599,7 +599,7 @@ _lib.llama_sample_temperature.restype = None # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], tau: c_float, eta: c_float, m: c_int, @@ -626,7 +626,7 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], tau: c_float, eta: c_float, mu: _Pointer[c_float], @@ -646,7 +646,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. def llama_sample_token_greedy( - ctx: llama_context_p, candidates: _Pointer[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -660,7 +660,7 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. 
def llama_sample_token( - ctx: llama_context_p, candidates: _Pointer[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From 66e28eb548974fe50aa80b8593f77cff651959c6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:00:41 -0400 Subject: [PATCH 28/61] Fix temperature bug --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index c1c8847..6cd65a4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -287,7 +287,7 @@ class Llama: candidates=llama_cpp.ctypes.pointer(candidates), penalty=repeat_penalty, ) - if temp == 0.0: + if float(temp) == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), From 40501435c12578fc0bc696c2bdc0bf63d0e15650 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:04:12 -0400 Subject: [PATCH 29/61] Fix: types --- llama_cpp/llama_cpp.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 61b40f8..8ce3c89 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -141,6 +141,11 @@ LLAMA_FTYPE_MOSTLY_Q8_0 = ctypes.c_int(7) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = ctypes.c_int(8) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_1 = ctypes.c_int(9) # except 1d tensors +# Misc +c_float_p = POINTER(c_float) +c_uint8_p = POINTER(c_uint8) +c_size_t_p = POINTER(c_size_t) + # Functions @@ -257,7 +262,7 @@ def llama_copy_state_data(ctx: llama_context_p, dest: Array[c_uint8]) -> c_size_ return _lib.llama_copy_state_data(ctx, dest) -_lib.llama_copy_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] +_lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] _lib.llama_copy_state_data.restype = c_size_t @@ -269,7 +274,7 @@ def llama_set_state_data( return _lib.llama_set_state_data(ctx, src) -_lib.llama_set_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] +_lib.llama_set_state_data.argtypes = [llama_context_p, c_uint8_p] _lib.llama_set_state_data.restype = c_size_t @@ -291,7 +296,7 @@ _lib.llama_load_session_file.argtypes = [ c_char_p, llama_token_p, c_size_t, - POINTER(c_size_t), + c_size_t_p, ] _lib.llama_load_session_file.restype = c_size_t @@ -340,7 +345,7 @@ _lib.llama_eval.restype = c_int def llama_tokenize( ctx: llama_context_p, text: bytes, - tokens, # type: Array[llama_token] + tokens: Array[llama_token], n_max_tokens: c_int, add_bos: c_bool, ) -> c_int: @@ -385,7 +390,7 @@ def llama_get_logits(ctx: llama_context_p): _lib.llama_get_logits.argtypes = [llama_context_p] -_lib.llama_get_logits.restype = POINTER(c_float) +_lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input @@ -395,7 +400,7 @@ def llama_get_embeddings(ctx: llama_context_p): _lib.llama_get_embeddings.argtypes = [llama_context_p] -_lib.llama_get_embeddings.restype = POINTER(c_float) +_lib.llama_get_embeddings.restype = c_float_p # Token Id -> String. 
Uses the vocabulary in the provided context @@ -614,7 +619,7 @@ _lib.llama_sample_token_mirostat.argtypes = [ c_float, c_float, c_int, - POINTER(c_float), + c_float_p, ] _lib.llama_sample_token_mirostat.restype = llama_token @@ -639,7 +644,7 @@ _lib.llama_sample_token_mirostat_v2.argtypes = [ llama_token_data_array_p, c_float, c_float, - POINTER(c_float), + c_float_p, ] _lib.llama_sample_token_mirostat_v2.restype = llama_token From e24c3d7447e158164397686bbecac2d22d8a75a1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:05:31 -0400 Subject: [PATCH 30/61] Prefer explicit imports --- llama_cpp/llama_cpp.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 8ce3c89..f6a71fa 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -68,11 +68,11 @@ _lib_base_name = "llama" _lib = _load_shared_library(_lib_base_name) # C types -LLAMA_FILE_VERSION = ctypes.c_int(1) +LLAMA_FILE_VERSION = c_int(1) LLAMA_FILE_MAGIC = b"ggjt" LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" LLAMA_SESSION_MAGIC = b"ggsn" -LLAMA_SESSION_VERSION = ctypes.c_int(1) +LLAMA_SESSION_VERSION = c_int(1) llama_context_p = c_void_p @@ -128,18 +128,18 @@ class llama_context_params(Structure): llama_context_params_p = POINTER(llama_context_params) -LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0) -LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( +LLAMA_FTYPE_ALL_F32 = c_int(0) +LLAMA_FTYPE_MOSTLY_F16 = c_int(1) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int( 4 ) # tok_embeddings.weight and output.weight are F16 -LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors -# LLAMA_FTYPE_MOSTYL_Q4_3 = ctypes.c_int(6) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q8_0 = ctypes.c_int(7) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_0 = ctypes.c_int(8) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_1 = ctypes.c_int(9) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors +# LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) # except 1d tensors # Misc c_float_p = POINTER(c_float) @@ -216,8 +216,8 @@ _lib.llama_model_quantize.restype = c_int # Returns 0 on success def llama_apply_lora_from_file( ctx: llama_context_p, - path_lora: ctypes.c_char_p, - path_base_model: ctypes.c_char_p, + path_lora: c_char_p, + path_base_model: c_char_p, n_threads: c_int, ) -> c_int: return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) From 3e28e0e50ccd7b579ae99b0fbe163fbed8888167 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:12:26 -0400 Subject: [PATCH 31/61] Fix: runtime type errors --- llama_cpp/llama_cpp.py | 52 ++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index f6a71fa..3b1ac1e 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -258,7 +258,9 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. 
# Returns the number of bytes copied -def llama_copy_state_data(ctx: llama_context_p, dest: Array[c_uint8]) -> c_size_t: +def llama_copy_state_data( + ctx: llama_context_p, dest # type: Array[c_uint8] +) -> c_size_t: return _lib.llama_copy_state_data(ctx, dest) @@ -282,9 +284,9 @@ _lib.llama_set_state_data.restype = c_size_t def llama_load_session_file( ctx: llama_context_p, path_session: bytes, - tokens_out: Array[llama_token], + tokens_out, # type: Array[llama_token] n_token_capacity: c_size_t, - n_token_count_out: _Pointer[c_size_t], + n_token_count_out, # type: _Pointer[c_size_t] ) -> c_size_t: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out @@ -304,7 +306,7 @@ _lib.llama_load_session_file.restype = c_size_t def llama_save_session_file( ctx: llama_context_p, path_session: bytes, - tokens: Array[llama_token], + tokens, # type: Array[llama_token] n_token_count: c_size_t, ) -> c_size_t: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -325,7 +327,7 @@ _lib.llama_save_session_file.restype = c_size_t # Returns 0 on success def llama_eval( ctx: llama_context_p, - tokens: Array[llama_token], + tokens, # type: Array[llama_token] n_tokens: c_int, n_past: c_int, n_threads: c_int, @@ -345,7 +347,7 @@ _lib.llama_eval.restype = c_int def llama_tokenize( ctx: llama_context_p, text: bytes, - tokens: Array[llama_token], + tokens, # type: Array[llama_token] n_max_tokens: c_int, add_bos: c_bool, ) -> c_int: @@ -444,8 +446,8 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], - last_tokens_data: Array[llama_token], + candidates, # type: _Pointer[llama_token_data_array] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, penalty: c_float, ): @@ -467,8 +469,8 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], - last_tokens_data: Array[llama_token], + candidates, # type: _Pointer[llama_token_data_array] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, @@ -495,7 +497,9 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
-def llama_sample_softmax(ctx: llama_context_p, candidates: _Pointer[llama_token_data]): +def llama_sample_softmax( + ctx: llama_context_p, candidates # type: _Pointer[llama_token_data] +): return _lib.llama_sample_softmax(ctx, candidates) @@ -509,7 +513,7 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] k: c_int, min_keep: c_size_t = c_size_t(1), ): @@ -528,7 +532,7 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -547,7 +551,7 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] z: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -566,7 +570,7 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -583,7 +587,9 @@ _lib.llama_sample_typical.restype = None def llama_sample_temperature( - ctx: llama_context_p, candidates: _Pointer[llama_token_data_array], temp: c_float + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + temp: c_float, ): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -604,11 +610,11 @@ _lib.llama_sample_temperature.restype = None # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] tau: c_float, eta: c_float, m: c_int, - mu: _Pointer[c_float], + mu, # type: _Pointer[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -631,10 +637,10 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] tau: c_float, eta: c_float, - mu: _Pointer[c_float], + mu, # type: _Pointer[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -651,7 +657,8 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. 
def llama_sample_token_greedy( - ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -665,7 +672,8 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. def llama_sample_token( - ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From b5f3e746275bf231df544c60f30b80f537195af7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:22:55 -0400 Subject: [PATCH 32/61] Add return type annotations for embeddings and logits --- llama_cpp/llama_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 3b1ac1e..ccec12c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -387,7 +387,7 @@ _lib.llama_n_embd.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab -def llama_get_logits(ctx: llama_context_p): +def llama_get_logits(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore return _lib.llama_get_logits(ctx) @@ -397,7 +397,7 @@ _lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input # shape: [n_embd] (1-dimensional) -def llama_get_embeddings(ctx: llama_context_p): +def llama_get_embeddings(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore return _lib.llama_get_embeddings(ctx) From 98bbd1c6a8ea1f86c010583f6b1ab74996a1c751 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:23:14 -0400 Subject: [PATCH 33/61] Fix eval logits type --- llama_cpp/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6cd65a4..a643f51 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -127,7 +127,7 @@ class Llama: self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) self.eval_tokens: Deque[llama_cpp.llama_token] = deque(maxlen=n_ctx) - self.eval_logits: Deque[List[llama_cpp.c_float]] = deque( + self.eval_logits: Deque[List[float]] = deque( maxlen=n_ctx if logits_all else 1 ) @@ -245,7 +245,7 @@ class Llama: n_vocab = llama_cpp.llama_n_vocab(self.ctx) cols = int(n_vocab) logits_view = llama_cpp.llama_get_logits(self.ctx) - logits: List[List[llama_cpp.c_float]] = [ + logits: List[List[float]] = [ [logits_view[i * cols + j] for j in range(cols)] for i in range(rows) ] self.eval_logits.extend(logits) @@ -287,7 +287,7 @@ class Llama: candidates=llama_cpp.ctypes.pointer(candidates), penalty=repeat_penalty, ) - if float(temp) == 0.0: + if float(temp.value) == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), From 79d50a29f40c4b14cd56a329ee50f269e673f277 Mon Sep 17 00:00:00 2001 From: Thomas Neu <81517187+th-neu@users.noreply.github.com> Date: Sat, 6 May 2023 01:02:59 +0200 Subject: [PATCH 34/61] Create dependabot.yml --- .github/dependabot.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..91abb11 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +# To get started 
with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" # See documentation for possible values + directory: "/" # Location of package manifests + schedule: + interval: "weekly" From c9bb602b2682ae12c5690829fee1635fcdfc707c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 5 May 2023 23:25:53 +0000 Subject: [PATCH 35/61] Bump black from 23.1.0 to 23.3.0 Bumps [black](https://github.com/psf/black) from 23.1.0 to 23.3.0. - [Release notes](https://github.com/psf/black/releases) - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) - [Commits](https://github.com/psf/black/compare/23.1.0...23.3.0) --- updated-dependencies: - dependency-name: black dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- poetry.lock | 56 +++++++++++++++++++++++++------------------------- pyproject.toml | 2 +- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/poetry.lock b/poetry.lock index a505168..129f923 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. [[package]] name = "anyio" @@ -42,37 +42,37 @@ tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy [[package]] name = "black" -version = "23.1.0" +version = "23.3.0" description = "The uncompromising code formatter." 
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "black-23.1.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:b6a92a41ee34b883b359998f0c8e6eb8e99803aa8bf3123bf2b2e6fec505a221"}, - {file = "black-23.1.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:57c18c5165c1dbe291d5306e53fb3988122890e57bd9b3dcb75f967f13411a26"}, - {file = "black-23.1.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:9880d7d419bb7e709b37e28deb5e68a49227713b623c72b2b931028ea65f619b"}, - {file = "black-23.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6663f91b6feca5d06f2ccd49a10f254f9298cc1f7f49c46e498a0771b507104"}, - {file = "black-23.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9afd3f493666a0cd8f8df9a0200c6359ac53940cbde049dcb1a7eb6ee2dd7074"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:bfffba28dc52a58f04492181392ee380e95262af14ee01d4bc7bb1b1c6ca8d27"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c1c476bc7b7d021321e7d93dc2cbd78ce103b84d5a4cf97ed535fbc0d6660648"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:382998821f58e5c8238d3166c492139573325287820963d2f7de4d518bd76958"}, - {file = "black-23.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bf649fda611c8550ca9d7592b69f0637218c2369b7744694c5e4902873b2f3a"}, - {file = "black-23.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:121ca7f10b4a01fd99951234abdbd97728e1240be89fde18480ffac16503d481"}, - {file = "black-23.1.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:a8471939da5e824b891b25751955be52ee7f8a30a916d570a5ba8e0f2eb2ecad"}, - {file = "black-23.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8178318cb74f98bc571eef19068f6ab5613b3e59d4f47771582f04e175570ed8"}, - {file = "black-23.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a436e7881d33acaf2536c46a454bb964a50eff59b21b51c6ccf5a40601fbef24"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:a59db0a2094d2259c554676403fa2fac3473ccf1354c1c63eccf7ae65aac8ab6"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:0052dba51dec07ed029ed61b18183942043e00008ec65d5028814afaab9a22fd"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:49f7b39e30f326a34b5c9a4213213a6b221d7ae9d58ec70df1c4a307cf2a1580"}, - {file = "black-23.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:162e37d49e93bd6eb6f1afc3e17a3d23a823042530c37c3c42eeeaf026f38468"}, - {file = "black-23.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b70eb40a78dfac24842458476135f9b99ab952dd3f2dab738c1881a9b38b753"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:a29650759a6a0944e7cca036674655c2f0f63806ddecc45ed40b7b8aa314b651"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:bb460c8561c8c1bec7824ecbc3ce085eb50005883a6203dcfb0122e95797ee06"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c91dfc2c2a4e50df0026f88d2215e166616e0c80e86004d0003ece0488db2739"}, - {file = "black-23.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a951cc83ab535d248c89f300eccbd625e80ab880fbcfb5ac8afb5f01a258ac9"}, - {file = "black-23.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0680d4380db3719ebcfb2613f34e86c8e6d15ffeabcf8ec59355c5e7b85bb555"}, - {file = "black-23.1.0-py3-none-any.whl", hash = 
"sha256:7a0f701d314cfa0896b9001df70a530eb2472babb76086344e688829efd97d32"}, - {file = "black-23.1.0.tar.gz", hash = "sha256:b0bd97bea8903f5a2ba7219257a44e3f1f9d00073d6cc1add68f0beec69692ac"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"}, + {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"}, + {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"}, + {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"}, + {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"}, + {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"}, + {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"}, + {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"}, + {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"}, + {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"}, + {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"}, + {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = 
"sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"}, + {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"}, + {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"}, ] [package.dependencies] @@ -1458,4 +1458,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "aa15e57300668bd23c051b4cd87bec4c1a58dcccd2f2b4767579fea7f2c5fa41" +content-hash = "714083b7f30a677f9a358a9633970fb88b8198d50558a0b50bf311d4a209ed4c" diff --git a/pyproject.toml b/pyproject.toml index ca0346f..a164ef7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ typing-extensions = "^4.5.0" [tool.poetry.group.dev.dependencies] -black = "^23.1.0" +black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.2" mkdocstrings = {extras = ["python"], version = "^0.20.0"} From fdcab2286c8d9e91779590d6facb3aee34456169 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 6 May 2023 21:11:57 +0000 Subject: [PATCH 36/61] Bump mkdocs-material from 9.1.4 to 9.1.9 Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.1.4 to 9.1.9. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.1.4...9.1.9) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 129f923..287d05e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -792,14 +792,14 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.4" +version = "9.1.9" description = "Documentation that simply works" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.4-py3-none-any.whl", hash = "sha256:4c92dcf9365068259bef3eed8e0dd5410056b6f7187bdea2d52848c0f94cd94c"}, - {file = "mkdocs_material-9.1.4.tar.gz", hash = "sha256:c3a8943e9e4a7d2624291da365bbccf0b9f88688aa6947a46260d8c165cd4389"}, + {file = "mkdocs_material-9.1.9-py3-none-any.whl", hash = "sha256:7db24261cb17400e132c46d17eea712bfe71056d892a9beba32cf68210297141"}, + {file = "mkdocs_material-9.1.9.tar.gz", hash = "sha256:74d8da1371ab3a326868fe47bae3cbc4aa22e93c048b4ca5117e6817b88bd734"}, ] [package.dependencies] @@ -1458,4 +1458,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "714083b7f30a677f9a358a9633970fb88b8198d50558a0b50bf311d4a209ed4c" +content-hash = "a921481e74f47e925f7ec2814fa0bc2e07707cb36fd12d9b33ecc6b0402a27c8" diff --git a/pyproject.toml b/pyproject.toml index a164ef7..55ca8ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.2" mkdocstrings = {extras = ["python"], version = "^0.20.0"} -mkdocs-material = "^9.1.4" +mkdocs-material = "^9.1.9" pytest = "^7.2.2" httpx = "^0.24.0" From 2a21b8f69e7049f03a4ab3e0b5ec51d81456a796 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: 
Sat, 6 May 2023 21:16:08 +0000 Subject: [PATCH 37/61] Bump mkdocs from 1.4.2 to 1.4.3 Bumps [mkdocs](https://github.com/mkdocs/mkdocs) from 1.4.2 to 1.4.3. - [Release notes](https://github.com/mkdocs/mkdocs/releases) - [Commits](https://github.com/mkdocs/mkdocs/compare/1.4.2...1.4.3) --- updated-dependencies: - dependency-name: mkdocs dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 287d05e..d30dc8f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -747,14 +747,14 @@ files = [ [[package]] name = "mkdocs" -version = "1.4.2" +version = "1.4.3" description = "Project documentation with Markdown." category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs-1.4.2-py3-none-any.whl", hash = "sha256:c8856a832c1e56702577023cd64cc5f84948280c1c0fcc6af4cd39006ea6aa8c"}, - {file = "mkdocs-1.4.2.tar.gz", hash = "sha256:8947af423a6d0facf41ea1195b8e1e8c85ad94ac95ae307fe11232e0424b11c5"}, + {file = "mkdocs-1.4.3-py3-none-any.whl", hash = "sha256:6ee46d309bda331aac915cd24aab882c179a933bd9e77b80ce7d2eaaa3f689dd"}, + {file = "mkdocs-1.4.3.tar.gz", hash = "sha256:5955093bbd4dd2e9403c5afaf57324ad8b04f16886512a3ee6ef828956481c57"}, ] [package.dependencies] @@ -1458,4 +1458,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "a921481e74f47e925f7ec2814fa0bc2e07707cb36fd12d9b33ecc6b0402a27c8" +content-hash = "f2de41d10587a7f21e4891584de2c7152dfa6f75809144778b2dc34d93395abe" diff --git a/pyproject.toml b/pyproject.toml index 55ca8ce..1f79b74 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ typing-extensions = "^4.5.0" [tool.poetry.group.dev.dependencies] black = "^23.3.0" twine = "^4.0.2" -mkdocs = "^1.4.2" +mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.20.0"} mkdocs-material = "^9.1.9" pytest = "^7.2.2" From 33d41fb8f3f949e29d4038fdf542ee8445af190a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 7 May 2023 00:07:39 +0000 Subject: [PATCH 38/61] Bump pytest from 7.2.2 to 7.3.1 Bumps [pytest](https://github.com/pytest-dev/pytest) from 7.2.2 to 7.3.1. - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/7.2.2...7.3.1) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:development update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 30 +++++------------------------- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/poetry.lock b/poetry.lock index d30dc8f..0bd08d5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -21,25 +21,6 @@ doc = ["packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] test = ["contextlib2", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (<0.15)", "uvloop (>=0.15)"] trio = ["trio (>=0.16,<0.22)"] -[[package]] -name = "attrs" -version = "22.2.0" -description = "Classes Without Boilerplate" -category = "dev" -optional = false -python-versions = ">=3.6" -files = [ - {file = "attrs-22.2.0-py3-none-any.whl", hash = "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836"}, - {file = "attrs-22.2.0.tar.gz", hash = "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"}, -] - -[package.extras] -cov = ["attrs[tests]", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] -dev = ["attrs[docs,tests]"] -docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope.interface"] -tests = ["attrs[tests-no-zope]", "zope.interface"] -tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy (>=0.971,<0.990)", "mypy (>=0.971,<0.990)", "pympler", "pympler", "pytest (>=4.3.0)", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-mypy-plugins", "pytest-xdist[psutil]", "pytest-xdist[psutil]"] - [[package]] name = "black" version = "23.3.0" @@ -1007,18 +988,17 @@ pyyaml = "*" [[package]] name = "pytest" -version = "7.2.2" +version = "7.3.1" description = "pytest: simple powerful testing with Python" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.2.2-py3-none-any.whl", hash = "sha256:130328f552dcfac0b1cec75c12e3f005619dc5f874f0a06e8ff7263f0ee6225e"}, - {file = "pytest-7.2.2.tar.gz", hash = "sha256:c99ab0c73aceb050f68929bc93af19ab6db0558791c6a0715723abe9d0ade9d4"}, + {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, + {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, ] [package.dependencies] -attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" @@ -1027,7 +1007,7 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] [[package]] name = "python-dateutil" @@ -1458,4 +1438,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "f2de41d10587a7f21e4891584de2c7152dfa6f75809144778b2dc34d93395abe" +content-hash = "e02fff3d4a50fbc9a89f6f001409a5f066c26a341c2d5f2dfbfb32f07e711eca" diff --git a/pyproject.toml b/pyproject.toml index 1f79b74..6f83611 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.20.0"} mkdocs-material = "^9.1.9" -pytest = "^7.2.2" 
+pytest = "^7.3.1" httpx = "^0.24.0" [build-system] From ae3c639764359890e692776cfb87ff84b911532f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 7 May 2023 00:16:31 +0000 Subject: [PATCH 39/61] Bump mkdocstrings from 0.20.0 to 0.21.2 Bumps [mkdocstrings](https://github.com/mkdocstrings/mkdocstrings) from 0.20.0 to 0.21.2. - [Release notes](https://github.com/mkdocstrings/mkdocstrings/releases) - [Changelog](https://github.com/mkdocstrings/mkdocstrings/blob/master/CHANGELOG.md) - [Commits](https://github.com/mkdocstrings/mkdocstrings/compare/0.20.0...0.21.2) --- updated-dependencies: - dependency-name: mkdocstrings dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- poetry.lock | 9 +++++---- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 0bd08d5..5b364a7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -808,14 +808,14 @@ files = [ [[package]] name = "mkdocstrings" -version = "0.20.0" +version = "0.21.2" description = "Automatic documentation from sources, for MkDocs." category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocstrings-0.20.0-py3-none-any.whl", hash = "sha256:f17fc2c4f760ec302b069075ef9e31045aa6372ca91d2f35ded3adba8e25a472"}, - {file = "mkdocstrings-0.20.0.tar.gz", hash = "sha256:c757f4f646d4f939491d6bc9256bfe33e36c5f8026392f49eaa351d241c838e5"}, + {file = "mkdocstrings-0.21.2-py3-none-any.whl", hash = "sha256:949ef8da92df9d692ca07be50616459a6b536083a25520fd54b00e8814ce019b"}, + {file = "mkdocstrings-0.21.2.tar.gz", hash = "sha256:304e56a2e90595708a38a13a278e538a67ad82052dd5c8b71f77a604a4f3d911"}, ] [package.dependencies] @@ -826,6 +826,7 @@ mkdocs = ">=1.2" mkdocs-autorefs = ">=0.3.1" mkdocstrings-python = {version = ">=0.5.2", optional = true, markers = "extra == \"python\""} pymdown-extensions = ">=6.3" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.10\""} [package.extras] crystal = ["mkdocstrings-crystal (>=0.3.4)"] @@ -1438,4 +1439,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "e02fff3d4a50fbc9a89f6f001409a5f066c26a341c2d5f2dfbfb32f07e711eca" +content-hash = "e87403dcd0a0b8484436b02c392326adfaf22b8d7e182d77e4a155c67a7435bc" diff --git a/pyproject.toml b/pyproject.toml index 6f83611..a11faef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ typing-extensions = "^4.5.0" black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" -mkdocstrings = {extras = ["python"], version = "^0.20.0"} +mkdocstrings = {extras = ["python"], version = "^0.21.2"} mkdocs-material = "^9.1.9" pytest = "^7.3.1" httpx = "^0.24.0" From bc853e3742fd2a4718bd66bd501bdb5ede50f6d3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 6 May 2023 21:32:50 -0400 Subject: [PATCH 40/61] Fix type for eval_logits in LlamaState object --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index a643f51..fc91ea4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -51,7 +51,7 @@ class LlamaState: def __init__( self, eval_tokens: Deque[llama_cpp.llama_token], - eval_logits: Deque[List[llama_cpp.c_float]], + eval_logits: Deque[List[float]], llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] llama_state_size: llama_cpp.c_size_t, ): From 
c76e0913bbc6a039f5456ca44f4d84966e5c14fd Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 6 May 2023 22:18:31 -0400 Subject: [PATCH 41/61] Update issue templates --- .github/ISSUE_TEMPLATE/bug_report.md | 80 +++++++++++++++++++++++ .github/ISSUE_TEMPLATE/feature_request.md | 20 ++++++ 2 files changed, 100 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..b8e33e5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,80 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +# Prerequisites + +Please answer the following questions for yourself before submitting an issue. + +- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now. +- [ ] I carefully followed the [README.md](https://github.com/abetlen/llama-cpp-python/blob/main/README.md). +- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed). +- [ ] I reviewed the [Discussions](https://github.com/abetlen/llama-cpp-python/discussions), and have a new bug or useful enhancement to share. + +# Expected Behavior + +Please provide a detailed written description of what you were trying to do, and what you expected `llama-cpp-python` to do. + +# Current Behavior + +Please provide a detailed written description of what `llama-cpp-python` did, instead. + +# Environment and Context + +Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions. + +* Physical (or virtual) hardware you are using, e.g. for Linux: + +`$ lscpu` + +* Operating System, e.g. for Linux: + +`$ uname -a` + +* SDK version, e.g. for Linux: + +``` +$ python3 --version +$ make --version +$ g++ --version +``` + +# Failure Information (for bugs) + +Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template. + +# Steps to Reproduce + +Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better. + +1. step 1 +2. step 2 +3. step 3 +4. etc. + +**Note: Many issues seem to be regarding performance issues / differences with `llama.cpp`. In these cases we need to confirm that you're comparing against the version of `llama.cpp` that was built with your python package, and which parameters you're passing to the context.** + +# Failure Logs + +Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes. + +Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability. 
+ +Example environment info: +``` +llama-cpp-python$ git log | head -1 +commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2 + +llama-cpp-python$ python3 --version +Python 3.10.10 + +llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette" +fastapi 0.95.0 +sse-starlette 1.3.3 +uvicorn 0.21.1 +``` diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..bbcbbe7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. From 7c3743fe5f2781a8aab9ba8e15f4d250963747cf Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 00:12:47 -0400 Subject: [PATCH 42/61] Update llama.cpp --- llama_cpp/llama.py | 4 ++++ llama_cpp/llama_cpp.py | 8 ++++---- vendor/llama.cpp | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index fc91ea4..0db5c10 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -297,21 +297,25 @@ class Llama: ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), k=top_k, + min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_tail_free( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), z=llama_cpp.c_float(1.0), + min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_typical( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), p=llama_cpp.c_float(1.0), + min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_top_p( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), p=top_p, + min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_temperature( ctx=self.ctx, diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index ccec12c..527ed7c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -515,7 +515,7 @@ def llama_sample_top_k( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] k: c_int, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -534,7 +534,7 @@ def llama_sample_top_p( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] p: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -553,7 +553,7 @@ def llama_sample_tail_free( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] z: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -572,7 +572,7 @@ def llama_sample_typical( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] p: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2edbdb0..1b0fd45 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject 
commit 2edbdb0f99336cb41f0995061c7602ed54beb863
+Subproject commit 1b0fd454650ef4d68a980e3225488b79e6e9af25

From 397ae97f64bb235db5a773a63caaeea5b258a20c Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sun, 7 May 2023 01:41:19 -0400
Subject: [PATCH 43/61] Update README

---
 README.md | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index c46fa11..9daca60 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,10 @@ You can force the use of `cmake` on Linux / MacOS setting the `FORCE_CMAKE=1` en
 
 ## High-level API
 
+The high-level API provides a simple managed interface through the `Llama` class.
+
+Below is a short example demonstrating how to use the high-level API to generate text:
+
 ```python
 >>> from llama_cpp import Llama
 >>> llm = Llama(model_path="./models/7B/ggml-model.bin")
@@ -90,8 +94,25 @@ docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-
 
 ## Low-level API
 
-The low-level API is a direct `ctypes` binding to the C API provided by `llama.cpp`.
-The entire API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and should mirror [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h).
+The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`.
+The entire low-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h).
+
+Below is a short example demonstrating how to use the low-level API to tokenize a prompt:
+
+```python
+>>> import llama_cpp
+>>> import ctypes
+>>> params = llama_cpp.llama_context_default_params()
+# use bytes for char * params
+>>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params)
+>>> max_tokens = params.n_ctx
+# use ctypes arrays for array params
+>>> tokens = (llama_cpp.llama_token * int(max_tokens))()
+>>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True))
+>>> llama_cpp.llama_free(ctx)
+```
+
+Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.
 
 # Documentation
 

From c382d8f86a628edec4427ac01687babb5c4aa35f Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sun, 7 May 2023 02:00:22 -0400
Subject: [PATCH 44/61] Revert "llama_cpp server: mark model as required"

This reverts commit e40fcb05754d0ec9c65359e245a436794cbfefdb.

---
 llama_cpp/server/app.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 595476f..0b7b1b2 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -149,8 +149,15 @@ class CreateCompletionRequest(BaseModel):
         description="The number of logprobs to generate. If None, no logprobs are generated."
) - # ignored, but marked as required for the sake of compatibility with openai's api - model: str = model_field + # ignored or currently unsupported + model: Optional[str] = model_field + n: Optional[int] = 1 + logprobs: Optional[int] = Field(None) + presence_penalty: Optional[float] = 0 + frequency_penalty: Optional[float] = 0 + best_of: Optional[int] = 1 + logit_bias: Optional[Dict[str, float]] = Field(None) + user: Optional[str] = Field(None) # llama.cpp specific parameters top_k: int = top_k_field @@ -190,11 +197,11 @@ def create_completion( class CreateEmbeddingRequest(BaseModel): - # ignored, but marked as required for the sake of compatibility with openai's api - model: str = model_field + model: Optional[str] = model_field input: str = Field( description="The input to embed." ) + user: Optional[str] class Config: schema_extra = { @@ -235,8 +242,13 @@ class CreateChatCompletionRequest(BaseModel): stop: Optional[List[str]] = stop_field stream: bool = stream_field - # ignored, but marked as required for the sake of compatibility with openai's api - model: str = model_field + # ignored or currently unsupported + model: Optional[str] = model_field + n: Optional[int] = 1 + presence_penalty: Optional[float] = 0 + frequency_penalty: Optional[float] = 0 + logit_bias: Optional[Dict[str, float]] = Field(None) + user: Optional[str] = Field(None) # llama.cpp specific parameters top_k: int = top_k_field From 86753976c4ce1289a784b7385f419f471f7e8a50 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 02:02:34 -0400 Subject: [PATCH 45/61] Revert "llama_cpp server: delete some ignored / unused parameters" This reverts commit b47b9549d57f146a00ee19cd7d2bb294111abb67. --- llama_cpp/llama_types.py | 2 ++ llama_cpp/server/app.py | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index b770a01..bfc7342 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -60,6 +60,8 @@ class Completion(TypedDict): class ChatCompletionMessage(TypedDict): role: Literal["assistant", "user", "system"] content: str + user: NotRequired[str] + class ChatCompletionChoice(TypedDict): index: int diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 0b7b1b2..ba2ca2f 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -185,7 +185,13 @@ def create_completion( completion_or_chunks = llama( **request.dict( exclude={ - "model" + "model", + "n", + "frequency_penalty", + "presence_penalty", + "best_of", + "logit_bias", + "user", } ) ) @@ -221,7 +227,7 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): - return llama.create_embedding(**request.dict(exclude={"model"})) + return llama.create_embedding(**request.dict(exclude={"model", "user"})) class ChatCompletionRequestMessage(BaseModel): @@ -283,7 +289,12 @@ def create_chat_completion( completion_or_chunks = llama.create_chat_completion( **request.dict( exclude={ - "model" + "model", + "n", + "presence_penalty", + "frequency_penalty", + "logit_bias", + "user", } ), ) From 1a00e452ea1e82232ffc035647b1c56116ae62ea Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 02:52:20 -0400 Subject: [PATCH 46/61] Update settings fields and defaults --- llama_cpp/server/app.py | 94 ++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/llama_cpp/server/app.py 
b/llama_cpp/server/app.py index ba2ca2f..48dfc5e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -13,18 +13,41 @@ from sse_starlette.sse import EventSourceResponse class Settings(BaseSettings): - model: str - n_ctx: int = 2048 - n_batch: int = 512 - n_threads: int = max((os.cpu_count() or 2) // 2, 1) - f16_kv: bool = True - use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... - use_mmap: bool = True - embedding: bool = True - last_n_tokens_size: int = 64 - logits_all: bool = False - cache: bool = False # WARNING: This is an experimental feature - vocab_only: bool = False + model: str = Field( + description="The path to the model to use for generating completions." + ) + n_ctx: int = Field(default=2048, ge=1, description="The context size.") + n_batch: int = Field( + default=512, ge=1, description="The batch size to use per eval." + ) + n_threads: int = Field( + default=max((os.cpu_count() or 2) // 2, 1), + ge=1, + description="The number of threads to use.", + ) + f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") + use_mlock: bool = Field( + default=bool(llama_cpp.llama_mlock_supported().value), + description="Use mlock.", + ) + use_mmap: bool = Field( + default=bool(llama_cpp.llama_mmap_supported().value), + description="Use mmap.", + ) + embedding: bool = Field(default=True, description="Whether to use embeddings.") + last_n_tokens_size: int = Field( + default=64, + ge=0, + description="Last n tokens to keep for repeat penalty calculation.", + ) + logits_all: bool = Field(default=True, description="Whether to return logits.") + cache: bool = Field( + default=False, + description="Use a cache to reduce processing times for evaluated prompts.", + ) + vocab_only: bool = Field( + default=False, description="Whether to only return the vocabulary." + ) router = APIRouter() @@ -74,79 +97,75 @@ def get_llama(): with llama_lock: yield llama -model_field = Field( - description="The model to use for generating completions." -) + +model_field = Field(description="The model to use for generating completions.") max_tokens_field = Field( - default=16, - ge=1, - le=2048, - description="The maximum number of tokens to generate." + default=16, ge=1, le=2048, description="The maximum number of tokens to generate." ) temperature_field = Field( default=0.8, ge=0.0, le=2.0, - description="Adjust the randomness of the generated text.\n\n" + - "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run." + description="Adjust the randomness of the generated text.\n\n" + + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. 
At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.", ) top_p_field = Field( default=0.95, ge=0.0, le=1.0, - description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + - "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text." + description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.", ) stop_field = Field( default=None, - description="A list of tokens at which to stop generation. If None, no stop tokens are used." + description="A list of tokens at which to stop generation. If None, no stop tokens are used.", ) stream_field = Field( default=False, - description="Whether to stream the results as they are generated. Useful for chatbots." + description="Whether to stream the results as they are generated. Useful for chatbots.", ) top_k_field = Field( default=40, ge=0, - description="Limit the next token selection to the K most probable tokens.\n\n" + - "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text." + description="Limit the next token selection to the K most probable tokens.\n\n" + + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.", ) repeat_penalty_field = Field( default=1.0, ge=0.0, - description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + - "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient." 
+ description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", ) + class CreateCompletionRequest(BaseModel): prompt: Optional[str] = Field( - default="", - description="The prompt to generate completions for." + default="", description="The prompt to generate completions for." ) suffix: Optional[str] = Field( default=None, - description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots." + description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.", ) max_tokens: int = max_tokens_field temperature: float = temperature_field top_p: float = top_p_field echo: bool = Field( default=False, - description="Whether to echo the prompt in the generated text. Useful for chatbots." + description="Whether to echo the prompt in the generated text. Useful for chatbots.", ) stop: Optional[List[str]] = stop_field stream: bool = stream_field logprobs: Optional[int] = Field( default=None, ge=0, - description="The number of logprobs to generate. If None, no logprobs are generated." + description="The number of logprobs to generate. If None, no logprobs are generated.", ) # ignored or currently unsupported @@ -204,9 +223,7 @@ def create_completion( class CreateEmbeddingRequest(BaseModel): model: Optional[str] = model_field - input: str = Field( - description="The input to embed." - ) + input: str = Field(description="The input to embed.") user: Optional[str] class Config: @@ -239,8 +256,7 @@ class ChatCompletionRequestMessage(BaseModel): class CreateChatCompletionRequest(BaseModel): messages: List[ChatCompletionRequestMessage] = Field( - default=[], - description="A list of messages to generate completions for." + default=[], description="A list of messages to generate completions for." ) max_tokens: int = max_tokens_field temperature: float = temperature_field From 5a3413eee398ff36e0fb496a44e39d960f402a48 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 03:03:57 -0400 Subject: [PATCH 47/61] Update cpu_count --- llama_cpp/server/app.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 48dfc5e..dfb819c 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,8 +1,8 @@ -import os import json +import multiprocessing from threading import Lock from typing import List, Optional, Union, Iterator, Dict -from typing_extensions import TypedDict, Literal, Annotated +from typing_extensions import TypedDict, Literal import llama_cpp @@ -21,7 +21,7 @@ class Settings(BaseSettings): default=512, ge=1, description="The batch size to use per eval." 
) n_threads: int = Field( - default=max((os.cpu_count() or 2) // 2, 1), + default=max(multiprocessing.cpu_count() // 2, 1), ge=1, description="The number of threads to use.", ) From 3fbda717904080ec0286fc13488e3318e3dec75e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 03:04:22 -0400 Subject: [PATCH 48/61] Fix mlock_supported and mmap_supported return type --- llama_cpp/llama_cpp.py | 12 ++++++++---- llama_cpp/server/app.py | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 527ed7c..bce0fd7 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -157,7 +157,7 @@ _lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params -def llama_mmap_supported() -> c_bool: +def llama_mmap_supported() -> bool: return _lib.llama_mmap_supported() @@ -165,7 +165,7 @@ _lib.llama_mmap_supported.argtypes = [] _lib.llama_mmap_supported.restype = c_bool -def llama_mlock_supported() -> c_bool: +def llama_mlock_supported() -> bool: return _lib.llama_mlock_supported() @@ -387,7 +387,9 @@ _lib.llama_n_embd.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab -def llama_get_logits(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore +def llama_get_logits( + ctx: llama_context_p, +): # type: (...) -> Array[float] # type: ignore return _lib.llama_get_logits(ctx) @@ -397,7 +399,9 @@ _lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input # shape: [n_embd] (1-dimensional) -def llama_get_embeddings(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore +def llama_get_embeddings( + ctx: llama_context_p, +): # type: (...) 
-> Array[float] # type: ignore return _lib.llama_get_embeddings(ctx) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index dfb819c..3e45684 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -27,11 +27,11 @@ class Settings(BaseSettings): ) f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") use_mlock: bool = Field( - default=bool(llama_cpp.llama_mlock_supported().value), + default=llama_cpp.llama_mlock_supported(), description="Use mlock.", ) use_mmap: bool = Field( - default=bool(llama_cpp.llama_mmap_supported().value), + default=llama_cpp.llama_mmap_supported(), description="Use mmap.", ) embedding: bool = Field(default=True, description="Whether to use embeddings.") From 5f43c553d59f5ee8ca6bea3044d50ba40bc8b426 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 03:29:33 -0400 Subject: [PATCH 49/61] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a11faef..f6d1e9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.43" +version = "0.1.44" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 405886a..020d236 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.43", + version="0.1.44", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 627811ea837f6f3b108d916a5ae802111d0f0690 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 05:09:10 -0400 Subject: [PATCH 50/61] Add verbose flag to server --- llama_cpp/server/app.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 3e45684..f46f920 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -48,6 +48,9 @@ class Settings(BaseSettings): vocab_only: bool = Field( default=False, description="Whether to only return the vocabulary." ) + verbose: bool = Field( + default=True, description="Whether to print debug information." + ) router = APIRouter() @@ -83,6 +86,7 @@ def create_app(settings: Optional[Settings] = None): n_ctx=settings.n_ctx, last_n_tokens_size=settings.last_n_tokens_size, vocab_only=settings.vocab_only, + verbose=settings.verbose, ) if settings.cache: cache = llama_cpp.LlamaCache() From 3adc8fb3ae887d385b4a884814f9055c7165f168 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 05:10:52 -0400 Subject: [PATCH 51/61] Update README to use cli options for server --- README.md | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/README.md b/README.md index 9daca60..9fa3bed 100644 --- a/README.md +++ b/README.md @@ -68,18 +68,9 @@ This allows you to use llama.cpp compatible models with any OpenAI compatible cl To install the server package and get started: -Linux/MacOS ```bash pip install llama-cpp-python[server] -export MODEL=./models/7B/ggml-model.bin -python3 -m llama_cpp.server -``` - -Windows -```cmd -pip install llama-cpp-python[server] -SET MODEL=..\models\7B\ggml-model.bin -python3 -m llama_cpp.server +python3 -m llama_cpp.server --model models/7B/ggml-model.bin ``` Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. 
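With the server started as above, the OpenAI-compatible endpoints can be exercised with any HTTP client. The sketch below is illustrative only: it assumes the `localhost:8000` address shown in the README and an OpenAI-style `/v1/chat/completions` route (the route decorators themselves are not shown in these patches), and the `model` value is a placeholder that the server currently ignores.

```python
# Minimal client sketch against a locally running llama_cpp.server instance.
# Assumptions: default address http://localhost:8000, an OpenAI-compatible
# /v1/chat/completions route, and an OpenAI-style response shape.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "ggml-model",  # currently ignored by the server
        "messages": [{"role": "user", "content": "Name the planets in the solar system."}],
        "max_tokens": 64,
        "temperature": 0.8,
    },
    timeout=120,
)
print(resp.json()["choices"][0]["message"]["content"])
```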
From 4f8cf52a38761f8cd611d3f65f07b6fe382445a9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 05:20:04 -0400 Subject: [PATCH 52/61] Update README --- README.md | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9fa3bed..b7772d9 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ This package provides: - OpenAI-like API - LangChain compatibility -## Installation +## Installation from PyPI (recommended) Install from PyPI (requires a c compiler): @@ -26,8 +26,30 @@ pip install llama-cpp-python The above command will attempt to install the package and build build `llama.cpp` from source. This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. -This method defaults to using `make` to build `llama.cpp` on Linux / MacOS and `cmake` on Windows. -You can force the use of `cmake` on Linux / MacOS setting the `FORCE_CMAKE=1` environment variable before installing. + +### Installation with OpenBLAS / cuBLAS / CLBlast + +`llama.cpp` supports multiple BLAS backends for faster processing. +Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend. + +To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing: + +```bash +LLAMA_OPENBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python +``` + +To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing: + +```bash +LLAMA_CUBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python +``` + +To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing: + +```bash +LLAMA_CLBLAST=1 FORCE_CMAKE=1 pip install llama-cpp-python +``` + ## High-level API From 2753b853212bfb81a3643b69eb666443ad03d494 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 13:19:56 -0400 Subject: [PATCH 53/61] Format --- llama_cpp/llama.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 0db5c10..6836ea5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -127,9 +127,7 @@ class Llama: self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) self.eval_tokens: Deque[llama_cpp.llama_token] = deque(maxlen=n_ctx) - self.eval_logits: Deque[List[float]] = deque( - maxlen=n_ctx if logits_all else 1 - ) + self.eval_logits: Deque[List[float]] = deque(maxlen=n_ctx if logits_all else 1) self.cache: Optional[LlamaCache] = None @@ -547,12 +545,6 @@ class Llama: finish_reason = "stop" break - if self.cache and len(completion_tokens) == 0: - if prompt_tokens not in self.cache: - if self.verbose: - print("Llama._create_completion: cache miss", file=sys.stderr) - self.cache[prompt_tokens] = self.save_state() - completion_tokens.append(token) all_text = self.detokenize(completion_tokens) @@ -611,6 +603,11 @@ class Llama: finish_reason = "length" break + if self.cache: + if self.verbose: + print("Llama._create_completion: cache save", file=sys.stderr) + self.cache[prompt_tokens + completion_tokens] = self.save_state() + if stream: yield { "id": completion_id, From 8dfde63255651f05e015df6dcfb614b2eac7c1f5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 19:30:14 -0400 Subject: [PATCH 54/61] Fix return type --- llama_cpp/llama_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py 
index bce0fd7..e60558c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -260,7 +260,7 @@ _lib.llama_get_state_size.restype = c_size_t # Returns the number of bytes copied def llama_copy_state_data( ctx: llama_context_p, dest # type: Array[c_uint8] -) -> c_size_t: +) -> int: return _lib.llama_copy_state_data(ctx, dest) @@ -272,7 +272,7 @@ _lib.llama_copy_state_data.restype = c_size_t # Returns the number of bytes read def llama_set_state_data( ctx: llama_context_p, src # type: Array[c_uint8] -) -> c_size_t: +) -> int: return _lib.llama_set_state_data(ctx, src) From 0e94a70de1727c8071d5802c34ad83a1fee987b0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 19:31:26 -0400 Subject: [PATCH 55/61] Add in-memory longest prefix cache. Closes #158 --- llama_cpp/llama.py | 91 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 27 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6836ea5..de06da0 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -5,7 +5,7 @@ import time import math import multiprocessing from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple -from collections import deque +from collections import deque, OrderedDict from . import llama_cpp from .llama_types import * @@ -14,37 +14,50 @@ from .llama_types import * class LlamaCache: """Cache for a llama.cpp model.""" - def __init__(self): - self.cache_state: Dict[Tuple[llama_cpp.llama_token, ...], "LlamaState"] = dict() + def __init__(self, capacity_bytes: int = (2 << 30)): + self.cache_state: OrderedDict[ + Tuple[llama_cpp.llama_token, ...], "LlamaState" + ] = OrderedDict() + self.capacity_bytes = capacity_bytes - def _sorted_keys(self) -> List[Tuple[llama_cpp.llama_token, ...]]: - return [ - key - for _, key in sorted( - ((len(key), key) for key in self.cache_state.keys()), reverse=True - ) - ] + @property + def cache_size(self): + return sum([state.llama_state_size for state in self.cache_state.values()]) - def _find_key( - self, key: Tuple[llama_cpp.llama_token, ...] + def _find_longest_prefix_key( + self, + key: Tuple[llama_cpp.llama_token, ...], ) -> Optional[Tuple[llama_cpp.llama_token, ...]]: - for k in self._sorted_keys(): - if key[: len(k)] == k: - return k - return None + min_len = 0 + min_key = None + keys = ( + (k, Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys() + ) + for k, prefix_len in keys: + if prefix_len > min_len: + min_len = prefix_len + min_key = k + return min_key def __getitem__(self, key: Sequence[llama_cpp.llama_token]) -> "LlamaState": - _key = self._find_key(tuple(key)) + key = tuple(key) + _key = self._find_longest_prefix_key(key) if _key is None: - raise KeyError(f"Key not found: {key}") - return self.cache_state[_key] + raise KeyError(f"Key not found") + value = self.cache_state[_key] + self.cache_state.move_to_end(_key) + return value def __contains__(self, key: Sequence[llama_cpp.llama_token]) -> bool: - return self._find_key(tuple(key)) is not None + return self._find_longest_prefix_key(tuple(key)) is not None def __setitem__(self, key: Sequence[llama_cpp.llama_token], value: "LlamaState"): - self.cache_state = dict() # NOTE: Currently limit to one cache entry. 
- self.cache_state[tuple(key)] = value + key = tuple(key) + if key in self.cache_state: + del self.cache_state[key] + self.cache_state[key] = value + while self.cache_size > self.capacity_bytes: + self.cache_state.popitem(last=False) class LlamaState: @@ -53,7 +66,7 @@ class LlamaState: eval_tokens: Deque[llama_cpp.llama_token], eval_logits: Deque[List[float]], llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] - llama_state_size: llama_cpp.c_size_t, + llama_state_size: int, ): self.eval_tokens = eval_tokens self.eval_logits = eval_logits @@ -526,10 +539,22 @@ class Llama: "logprobs is not supported for models created with logits_all=False" ) - if self.cache and prompt_tokens in self.cache: - if self.verbose: - print("Llama._create_completion: cache hit", file=sys.stderr) - self.load_state(self.cache[prompt_tokens]) + if self.cache: + try: + cache_item = self.cache[prompt_tokens] + cache_prefix_len = Llama.longest_token_prefix( + cache_item.eval_tokens, prompt_tokens + ) + eval_prefix_len = Llama.longest_token_prefix( + self.eval_tokens, prompt_tokens + ) + if cache_prefix_len > eval_prefix_len: + self.load_state(cache_item) + if self.verbose: + print("Llama._create_completion: cache hit", file=sys.stderr) + except KeyError: + if self.verbose: + print("Llama._create_completion: cache miss", file=sys.stderr) finish_reason = "length" multibyte_fix = 0 @@ -1004,3 +1029,15 @@ class Llama: exps = [math.exp(float(x)) for x in logits] sum_exps = sum(exps) return [math.log(x / sum_exps) for x in exps] + + @staticmethod + def longest_token_prefix( + a: Sequence[llama_cpp.llama_token], b: Sequence[llama_cpp.llama_token] + ): + longest_prefix = 0 + for _a, _b in zip(a, b): + if _a == _b: + longest_prefix += 1 + else: + break + return longest_prefix From 14da46f16e46dba2a6964c8d0d7ddbce388182e5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 19:33:17 -0400 Subject: [PATCH 56/61] Added cache size to settins object. --- llama_cpp/server/app.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index f46f920..e74d17d 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -45,6 +45,10 @@ class Settings(BaseSettings): default=False, description="Use a cache to reduce processing times for evaluated prompts.", ) + cache_size: int = Field( + default=2 << 30, + description="The size of the cache in bytes. Only used if cache is True.", + ) vocab_only: bool = Field( default=False, description="Whether to only return the vocabulary." 
) @@ -89,7 +93,7 @@ def create_app(settings: Optional[Settings] = None): verbose=settings.verbose, ) if settings.cache: - cache = llama_cpp.LlamaCache() + cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) llama.set_cache(cache) return app From e72f58614b35ae3f995fd46897f2272d8f23362c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 20:01:34 -0400 Subject: [PATCH 57/61] Change pointer to lower overhead byref --- llama_cpp/llama.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index de06da0..41e6fd8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -295,47 +295,47 @@ class Llama: ctx=self.ctx, last_tokens_data=last_n_tokens_data, last_tokens_size=last_n_tokens_size, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore penalty=repeat_penalty, ) if float(temp.value) == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore ) else: llama_cpp.llama_sample_top_k( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore k=top_k, min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_tail_free( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore z=llama_cpp.c_float(1.0), min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_typical( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore p=llama_cpp.c_float(1.0), min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_top_p( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore p=top_p, min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore temp=temp, ) return llama_cpp.llama_sample_token( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore ) def sample( From a0b61ea2a7c27660bc1421802c327b379a47a7d7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 20:17:52 -0400 Subject: [PATCH 58/61] Bugfix for models endpoint --- llama_cpp/server/app.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index e74d17d..b46914e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -357,7 +357,9 @@ GetModelResponse = create_model_from_typeddict(ModelList) @router.get("/v1/models", response_model=GetModelResponse) -def get_models() -> ModelList: +def get_models( + llama: llama_cpp.Llama = Depends(get_llama), +) -> ModelList: return { "object": "list", "data": [ From 75d8619b1a373a3900dbbdaf2fc7f71343ae312e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 20:19:34 -0400 Subject: [PATCH 59/61] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f6d1e9a..781d21b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.44" +version = "0.1.45" description = "Python bindings for 
the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 020d236..e2bc2da 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.44", + version="0.1.45", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 65d9cc050cb630a1d12f3874947b4729d1cbaab7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 01:30:18 -0400 Subject: [PATCH 60/61] Add openai frequency and presence penalty parameters. Closes #169 --- llama_cpp/llama.py | 38 ++++++++++++++++++++++++++++++++++++-- llama_cpp/server/app.py | 4 ---- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 41e6fd8..7b53112 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -261,7 +261,7 @@ class Llama: ] self.eval_logits.extend(logits) - def _sample_top_p_top_k( + def _sample( self, last_n_tokens_data, # type: llama_cpp.Array[llama_cpp.llama_token] last_n_tokens_size: llama_cpp.c_int, @@ -269,6 +269,8 @@ class Llama: top_p: llama_cpp.c_float, temp: llama_cpp.c_float, repeat_penalty: llama_cpp.c_float, + frequency_penalty: llama_cpp.c_float, + presence_penalty: llama_cpp.c_float, ): assert self.ctx is not None assert len(self.eval_logits) > 0 @@ -298,6 +300,14 @@ class Llama: candidates=llama_cpp.ctypes.byref(candidates), # type: ignore penalty=repeat_penalty, ) + llama_cpp.llama_sample_frequency_and_presence_penalties( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + last_tokens_data=last_n_tokens_data, + last_tokens_size=last_n_tokens_size, + alpha_frequency=frequency_penalty, + alpha_presence=presence_penalty, + ) if float(temp.value) == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, @@ -344,6 +354,8 @@ class Llama: top_p: float, temp: float, repeat_penalty: float, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, ): """Sample a token from the model. 
@@ -360,7 +372,7 @@ class Llama: last_n_tokens_data = [llama_cpp.llama_token(0)] * max( 0, self.last_n_tokens_size - len(self.eval_tokens) ) + list(self.eval_tokens)[-self.last_n_tokens_size :] - return self._sample_top_p_top_k( + return self._sample( last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)( *last_n_tokens_data ), @@ -369,6 +381,8 @@ class Llama: top_p=llama_cpp.c_float(top_p), temp=llama_cpp.c_float(temp), repeat_penalty=llama_cpp.c_float(repeat_penalty), + frequency_penalty=llama_cpp.c_float(frequency_penalty), + presence_penalty=llama_cpp.c_float(presence_penalty), ) def generate( @@ -378,6 +392,8 @@ class Llama: top_p: float, temp: float, repeat_penalty: float, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, reset: bool = True, ) -> Generator[ llama_cpp.llama_token, Optional[Sequence[llama_cpp.llama_token]], None @@ -431,6 +447,8 @@ class Llama: top_k=top_k, top_p=top_p, temp=temp, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, ) tokens_or_none = yield token @@ -505,6 +523,8 @@ class Llama: logprobs: Optional[int] = None, echo: bool = False, stop: Optional[List[str]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, @@ -563,6 +583,8 @@ class Llama: top_k=top_k, top_p=top_p, temp=temperature, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, ): if token == llama_cpp.llama_token_eos(): @@ -737,6 +759,8 @@ class Llama: logprobs: Optional[int] = None, echo: bool = False, stop: Optional[List[str]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, @@ -772,6 +796,8 @@ class Llama: logprobs=logprobs, echo=echo, stop=stop, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, @@ -792,6 +818,8 @@ class Llama: logprobs: Optional[int] = None, echo: bool = False, stop: Optional[List[str]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, @@ -827,6 +855,8 @@ class Llama: logprobs=logprobs, echo=echo, stop=stop, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, @@ -899,6 +929,8 @@ class Llama: stream: bool = False, stop: Optional[List[str]] = [], max_tokens: int = 256, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. 
@@ -932,6 +964,8 @@ class Llama: stream=stream, max_tokens=max_tokens, repeat_penalty=repeat_penalty, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index b46914e..c9f2aef 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -214,8 +214,6 @@ def create_completion( exclude={ "model", "n", - "frequency_penalty", - "presence_penalty", "best_of", "logit_bias", "user", @@ -315,8 +313,6 @@ def create_chat_completion( exclude={ "model", "n", - "presence_penalty", - "frequency_penalty", "logit_bias", "user", } From 0d751a69a78c0a2f7b83c894d6a98ceec8daa680 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 01:50:43 -0400 Subject: [PATCH 61/61] Set repeat_penalty to 0 by default --- llama_cpp/server/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index c9f2aef..b459b80 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -146,7 +146,7 @@ top_k_field = Field( ) repeat_penalty_field = Field( - default=1.0, + default=0.0, ge=0.0, description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.",
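The last two patches add OpenAI-style `frequency_penalty` and `presence_penalty` parameters to the high-level completion methods and change the server-side `repeat_penalty` default. The sketch below is a hedged illustration of how these keyword arguments reach `Llama.__call__`; the model path and prompt are placeholders, `model_path` is assumed to be the constructor argument for the model file, and the output is assumed to follow the OpenAI-style completion shape.

```python
# Illustrative only: exercising the sampling penalties added in the patches above.
# Assumptions: model_path is the constructor argument for the model file (the
# path here is a placeholder) and the result follows the OpenAI completion shape.
import llama_cpp

llm = llama_cpp.Llama(model_path="./models/7B/ggml-model.bin")
output = llm(
    "Q: Name the planets in the solar system. A:",
    max_tokens=48,
    repeat_penalty=1.1,     # multiplicative llama.cpp penalty; 1.0 is neutral
    frequency_penalty=0.5,  # OpenAI-style penalty on token frequency
    presence_penalty=0.5,   # OpenAI-style penalty on token presence
)
print(output["choices"][0]["text"])
```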