From e40fcb05754d0ec9c65359e245a436794cbfefdb Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 00:47:35 -0700 Subject: [PATCH 01/53] llama_cpp server: mark model as required `model` is ignored, but currently marked "optional"... on the one hand could mark "required" to make it explicit in case the server supports multiple llama's at the same time, but also could delete it since its ignored. decision: mark it required for the sake of openai api compatibility. I think out of all parameters, `model` is probably the most important one for people to keep using even if its ignored for now. --- llama_cpp/server/app.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 640dd3f..5d87e78 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -66,6 +66,10 @@ def get_llama(): with llama_lock: yield llama +model_field = Field( + description="The model to use for generating completions." +) + class CreateCompletionRequest(BaseModel): prompt: Union[str, List[str]] suffix: Optional[str] = Field(None) @@ -76,8 +80,9 @@ class CreateCompletionRequest(BaseModel): stop: Optional[List[str]] = [] stream: bool = False - # ignored or currently unsupported - model: Optional[str] = Field(None) + # ignored, but marked as required for the sake of compatibility with openai's api + model: str = model_field + n: Optional[int] = 1 logprobs: Optional[int] = Field(None) presence_penalty: Optional[float] = 0 @@ -133,7 +138,8 @@ def create_completion( class CreateEmbeddingRequest(BaseModel): - model: Optional[str] + # ignored, but marked as required for the sake of compatibility with openai's api + model: str = model_field input: str user: Optional[str] @@ -173,8 +179,9 @@ class CreateChatCompletionRequest(BaseModel): stop: Optional[List[str]] = [] max_tokens: int = 128 - # ignored or currently unsupported - model: Optional[str] = Field(None) + # ignored, but marked as required for the sake of compatibility with openai's api + model: str = model_field + n: Optional[int] = 1 presence_penalty: Optional[float] = 0 frequency_penalty: Optional[float] = 0 From b47b9549d57f146a00ee19cd7d2bb294111abb67 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 01:19:30 -0700 Subject: [PATCH 02/53] llama_cpp server: delete some ignored / unused parameters `n`, `presence_penalty`, `frequency_penalty`, `best_of`, `logit_bias`, `user`: not supported, excluded from the calls into llama. 
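For context, the handlers end up forwarding everything except the compatibility-only field. A minimal sketch of that pattern (inside an endpoint, where `request` is the parsed pydantic model and `llama` is the loaded `llama_cpp.Llama` instance, mirroring the diff below):

```python
# After this change only "model" is excluded; every other request field is
# passed straight through to the llama call as a keyword argument.
completion_or_chunks = llama(**request.dict(exclude={"model"}))
```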
decision: delete it --- llama_cpp/llama_types.py | 2 -- llama_cpp/server/app.py | 30 +++--------------------------- 2 files changed, 3 insertions(+), 29 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index b62ff1b..b8bdb08 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -60,8 +60,6 @@ class Completion(TypedDict): class ChatCompletionMessage(TypedDict): role: Union[Literal["assistant"], Literal["user"], Literal["system"]] content: str - user: NotRequired[str] - class ChatCompletionChoice(TypedDict): index: int diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 5d87e78..cc467db 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -83,13 +83,7 @@ class CreateCompletionRequest(BaseModel): # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field - n: Optional[int] = 1 logprobs: Optional[int] = Field(None) - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - best_of: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) # llama.cpp specific parameters top_k: int = 40 @@ -120,13 +114,7 @@ def create_completion( completion_or_chunks = llama( **request.dict( exclude={ - "model", - "n", - "frequency_penalty", - "presence_penalty", - "best_of", - "logit_bias", - "user", + "model" } ) ) @@ -141,7 +129,6 @@ class CreateEmbeddingRequest(BaseModel): # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field input: str - user: Optional[str] class Config: schema_extra = { @@ -161,7 +148,7 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): - return llama.create_embedding(**request.dict(exclude={"model", "user"})) + return llama.create_embedding(**request.dict(exclude={"model"})) class ChatCompletionRequestMessage(BaseModel): @@ -181,12 +168,6 @@ class CreateChatCompletionRequest(BaseModel): # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field - - n: Optional[int] = 1 - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) # llama.cpp specific parameters repeat_penalty: float = 1.1 @@ -220,12 +201,7 @@ def create_chat_completion( completion_or_chunks = llama.create_chat_completion( **request.dict( exclude={ - "model", - "n", - "presence_penalty", - "frequency_penalty", - "logit_bias", - "user", + "model" } ), ) From 1e429135993f4e1298d8c801f2628bae3d8f18a9 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 11:48:49 -0700 Subject: [PATCH 03/53] llama_cpp server: move logprobs to supported I think this is actually supported (its in the arguments of `LLama.__call__`, which is how the completion is invoked). 
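As a quick sanity check (assuming `Llama.__call__` keeps accepting a `logprobs` keyword, as its argument list suggests), once the field is no longer excluded it simply rides along with the other sampling parameters:

```python
# Hypothetical direct call, mirroring what the endpoint does with the request body;
# logprobs is the same value a client would send in the JSON payload.
llama(prompt="Q: Name a color.\nA:", max_tokens=8, logprobs=5)
```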
decision: mark as supported --- llama_cpp/server/app.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index cc467db..2d20f37 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -79,12 +79,11 @@ class CreateCompletionRequest(BaseModel): echo: bool = False stop: Optional[List[str]] = [] stream: bool = False + logprobs: Optional[int] = Field(None) # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field - logprobs: Optional[int] = Field(None) - # llama.cpp specific parameters top_k: int = 40 repeat_penalty: float = 1.1 From a5aa6c1478de7cc16b654df533be3dee6519c42a Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 11:52:20 -0700 Subject: [PATCH 04/53] llama_cpp server: add missing top_k param to CreateChatCompletionRequest `llama.create_chat_completion` definitely has a `top_k` argument, but its missing from `CreateChatCompletionRequest`. decision: add it --- llama_cpp/server/app.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 2d20f37..e1045af 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -169,6 +169,7 @@ class CreateChatCompletionRequest(BaseModel): model: str = model_field # llama.cpp specific parameters + top_k: int = 40, repeat_penalty: float = 1.1 class Config: From 978b6daf9313a11367d0a9393226379173fdb688 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 14:37:36 -0700 Subject: [PATCH 05/53] llama_cpp server: add some more information to fields for completions --- llama_cpp/server/app.py | 70 ++++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index e1045af..e168485 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -71,22 +71,70 @@ model_field = Field( ) class CreateCompletionRequest(BaseModel): - prompt: Union[str, List[str]] - suffix: Optional[str] = Field(None) - max_tokens: int = 16 - temperature: float = 0.8 - top_p: float = 0.95 - echo: bool = False - stop: Optional[List[str]] = [] - stream: bool = False - logprobs: Optional[int] = Field(None) + prompt: Union[str, List[str]] = Field( + default="", + description="The prompt to generate completions for." + ) + suffix: Optional[str] = Field( + default=None, + description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots." + ) + max_tokens: int = Field( + default=16, + ge=1, + le=2048, + description="The maximum number of tokens to generate." + ) + temperature: float = Field( + default=0.8, + ge=0.0, + le=2.0, + description="Adjust the randomness of the generated text.\n\n" + + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run." 
+ ) + top_p: float = Field( + default=0.95, + ge=0.0, + le=1.0, + description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text." + ) + echo: bool = Field( + default=False, + description="Whether to echo the prompt in the generated text. Useful for chatbots." + ) + stop: Optional[List[str]] = Field( + default=None, + description="A list of tokens at which to stop generation. If None, no stop tokens are used." + ) + stream: bool = Field( + default=False, + description="Whether to stream the results as they are generated. Useful for chatbots." + ) + logprobs: Optional[int] = Field( + default=None, + ge=0, + description="The number of logprobs to generate. If None, no logprobs are generated." + ) + + # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field # llama.cpp specific parameters - top_k: int = 40 - repeat_penalty: float = 1.1 + top_k: int = Field( + default=40, + ge=0, + description="Limit the next token selection to the K most probable tokens.\n\n" + + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text." + ) + repeat_penalty: float = Field( + default=1.0, + ge=0.0, + description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient." + ) class Config: schema_extra = { From 8dcbf65a45d729eedb4363f4e92247e6325d5b7d Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 18:37:43 -0700 Subject: [PATCH 06/53] llama_cpp server: define fields for chat completions Slight refactor for common fields shared between completion and chat completion --- llama_cpp/server/app.py | 125 +++++++++++++++++++++++----------------- 1 file changed, 71 insertions(+), 54 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index e168485..ec5dbd3 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -70,6 +70,55 @@ model_field = Field( description="The model to use for generating completions." ) +max_tokens_field = Field( + default=16, + ge=1, + le=2048, + description="The maximum number of tokens to generate." 
+) + +temperature_field = Field( + default=0.8, + ge=0.0, + le=2.0, + description="Adjust the randomness of the generated text.\n\n" + + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run." +) + +top_p_field = Field( + default=0.95, + ge=0.0, + le=1.0, + description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text." +) + +stop_field = Field( + default=None, + description="A list of tokens at which to stop generation. If None, no stop tokens are used." +) + +stream_field = Field( + default=False, + description="Whether to stream the results as they are generated. Useful for chatbots." +) + +top_k_field = Field( + default=40, + ge=0, + description="Limit the next token selection to the K most probable tokens.\n\n" + + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text." +) + +repeat_penalty_field = Field( + default=1.0, + ge=0.0, + description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient." +) + + + class CreateCompletionRequest(BaseModel): prompt: Union[str, List[str]] = Field( default="", @@ -79,62 +128,27 @@ class CreateCompletionRequest(BaseModel): default=None, description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots." ) - max_tokens: int = Field( - default=16, - ge=1, - le=2048, - description="The maximum number of tokens to generate." - ) - temperature: float = Field( - default=0.8, - ge=0.0, - le=2.0, - description="Adjust the randomness of the generated text.\n\n" + - "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. 
A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run." - ) - top_p: float = Field( - default=0.95, - ge=0.0, - le=1.0, - description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + - "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text." - ) + max_tokens: int = max_tokens_field + temperature: float = temperature_field + top_p: float = top_p_field echo: bool = Field( default=False, description="Whether to echo the prompt in the generated text. Useful for chatbots." ) - stop: Optional[List[str]] = Field( - default=None, - description="A list of tokens at which to stop generation. If None, no stop tokens are used." - ) - stream: bool = Field( - default=False, - description="Whether to stream the results as they are generated. Useful for chatbots." - ) + stop: Optional[List[str]] = stop_field + stream: bool = stream_field logprobs: Optional[int] = Field( default=None, ge=0, description="The number of logprobs to generate. If None, no logprobs are generated." ) - - # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field # llama.cpp specific parameters - top_k: int = Field( - default=40, - ge=0, - description="Limit the next token selection to the K most probable tokens.\n\n" + - "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text." - ) - repeat_penalty: float = Field( - default=1.0, - ge=0.0, - description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + - "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient." - ) + top_k: int = top_k_field + repeat_penalty: float = repeat_penalty_field class Config: schema_extra = { @@ -199,26 +213,29 @@ def create_embedding( class ChatCompletionRequestMessage(BaseModel): - role: Union[Literal["system"], Literal["user"], Literal["assistant"]] - content: str - user: Optional[str] = None + role: Union[Literal["system"], Literal["user"], Literal["assistant"]] = Field( + default=Literal["user"], description="The role of the message." 
+ ) + content: str = Field(default="", description="The content of the message.") class CreateChatCompletionRequest(BaseModel): - model: Optional[str] - messages: List[ChatCompletionRequestMessage] - temperature: float = 0.8 - top_p: float = 0.95 - stream: bool = False - stop: Optional[List[str]] = [] - max_tokens: int = 128 + messages: List[ChatCompletionRequestMessage] = Field( + default=[], + description="A list of messages to generate completions for." + ) + max_tokens: int = max_tokens_field + temperature: float = temperature_field + top_p: float = top_p_field + stop: Optional[List[str]] = stop_field + stream: bool = stream_field # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field # llama.cpp specific parameters - top_k: int = 40, - repeat_penalty: float = 1.1 + top_k: int = top_k_field + repeat_penalty: float = repeat_penalty_field class Config: schema_extra = { From fa2a61e06569bb600d36d7ea5fee2ab456b3434d Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Sat, 29 Apr 2023 18:46:01 -0700 Subject: [PATCH 07/53] llama_cpp server: fields for the embedding endpoint --- llama_cpp/server/app.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ec5dbd3..9adddcd 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -189,7 +189,9 @@ def create_completion( class CreateEmbeddingRequest(BaseModel): # ignored, but marked as required for the sake of compatibility with openai's api model: str = model_field - input: str + input: str = Field( + description="The input to embed." + ) class Config: schema_extra = { From dbbfc4ba2f8460e130dc268096f5906d3d22347b Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Mon, 1 May 2023 11:48:37 -0700 Subject: [PATCH 08/53] llama_cpp server: fix to ChatCompletionRequestMessage When I generate a client, it breaks because it fails to process the schema of ChatCompletionRequestMessage These fix that: - I think `Union[Literal["user"], Literal["channel"], ...]` is the same as Literal["user", "channel", ...] - Turns out default value `Literal["user"]` isn't JSON serializable, so replace with "user" --- llama_cpp/llama_types.py | 2 +- llama_cpp/server/app.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index b8bdb08..b770a01 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -58,7 +58,7 @@ class Completion(TypedDict): class ChatCompletionMessage(TypedDict): - role: Union[Literal["assistant"], Literal["user"], Literal["system"]] + role: Literal["assistant", "user", "system"] content: str class ChatCompletionChoice(TypedDict): diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 9adddcd..886ee6d 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -215,8 +215,8 @@ def create_embedding( class ChatCompletionRequestMessage(BaseModel): - role: Union[Literal["system"], Literal["user"], Literal["assistant"]] = Field( - default=Literal["user"], description="The role of the message." + role: Literal["system", "user", "assistant"] = Field( + default="user", description="The role of the message." 
) content: str = Field(default="", description="The content of the message.") From 0fcc25cdacc550ca5ab663239a3600b297c4a188 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Fri, 28 Apr 2023 23:54:31 -0700 Subject: [PATCH 09/53] examples fastapi_server: deprecate This commit "deprecates" the example fastapi server by remaining runnable but pointing folks at the module if they want to learn more. Rationale: Currently there exist two server implementations in this repo: - `llama_cpp/server/__main__.py`, the module that's runnable by consumers of the library with `python3 -m llama_cpp.server` - `examples/high_level_api/fastapi_server.py`, which is probably a copy-pasted example by folks hacking around IMO this is confusing. As a new user of the library I see they've both been updated relatively recently but looking side-by-side there's a diff. The one in the module seems better: - supports logits_all - supports use_mmap - has experimental cache support (with some mutex thing going on) - some stuff with streaming support was moved around more recently than fastapi_server.py --- examples/high_level_api/fastapi_server.py | 267 ++-------------------- 1 file changed, 21 insertions(+), 246 deletions(-) diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py index 3ed0eac..4b3189d 100644 --- a/examples/high_level_api/fastapi_server.py +++ b/examples/high_level_api/fastapi_server.py @@ -4,259 +4,34 @@ To run this example: ```bash pip install fastapi uvicorn sse-starlette -export MODEL=../models/7B/ggml-model.bin -uvicorn fastapi_server_chat:app --reload +export MODEL=../models/7B/... +``` + +Then run: +``` +uvicorn llama_cpp.server.app:app --reload +``` + +or + +``` +python3 -m llama_cpp.server ``` Then visit http://localhost:8000/docs to see the interactive API docs. + +To actually see the implementation of the server, see llama_cpp/server/app.py + """ import os -import json -from typing import List, Optional, Literal, Union, Iterator, Dict -from typing_extensions import TypedDict - -import llama_cpp - -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict -from sse_starlette.sse import EventSourceResponse - - -class Settings(BaseSettings): - model: str - n_ctx: int = 2048 - n_batch: int = 8 - n_threads: int = int(os.cpu_count() / 2) or 1 - f16_kv: bool = True - use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... 
- embedding: bool = True - last_n_tokens_size: int = 64 - - -app = FastAPI( - title="🦙 llama.cpp Python API", - version="0.0.1", -) -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) -settings = Settings() -llama = llama_cpp.Llama( - settings.model, - f16_kv=settings.f16_kv, - use_mlock=settings.use_mlock, - embedding=settings.embedding, - n_threads=settings.n_threads, - n_batch=settings.n_batch, - n_ctx=settings.n_ctx, - last_n_tokens_size=settings.last_n_tokens_size, -) - - -class CreateCompletionRequest(BaseModel): - prompt: str - suffix: Optional[str] = Field(None) - max_tokens: int = 16 - temperature: float = 0.8 - top_p: float = 0.95 - echo: bool = False - stop: List[str] = [] - stream: bool = False - - # ignored or currently unsupported - model: Optional[str] = Field(None) - n: Optional[int] = 1 - logprobs: Optional[int] = Field(None) - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - best_of: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) - - # llama.cpp specific parameters - top_k: int = 40 - repeat_penalty: float = 1.1 - - class Config: - schema_extra = { - "example": { - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": ["\n", "###"], - } - } - - -CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) - - -@app.post( - "/v1/completions", - response_model=CreateCompletionResponse, -) -def create_completion(request: CreateCompletionRequest): - if request.stream: - chunks: Iterator[llama_cpp.CompletionChunk] = llama(**request.dict()) # type: ignore - return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks) - return llama( - **request.dict( - exclude={ - "model", - "n", - "logprobs", - "frequency_penalty", - "presence_penalty", - "best_of", - "logit_bias", - "user", - } - ) - ) - - -class CreateEmbeddingRequest(BaseModel): - model: Optional[str] - input: str - user: Optional[str] - - class Config: - schema_extra = { - "example": { - "input": "The food was delicious and the waiter...", - } - } - - -CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) - - -@app.post( - "/v1/embeddings", - response_model=CreateEmbeddingResponse, -) -def create_embedding(request: CreateEmbeddingRequest): - return llama.create_embedding(**request.dict(exclude={"model", "user"})) - - -class ChatCompletionRequestMessage(BaseModel): - role: Union[Literal["system"], Literal["user"], Literal["assistant"]] - content: str - user: Optional[str] = None - - -class CreateChatCompletionRequest(BaseModel): - model: Optional[str] - messages: List[ChatCompletionRequestMessage] - temperature: float = 0.8 - top_p: float = 0.95 - stream: bool = False - stop: List[str] = [] - max_tokens: int = 128 - - # ignored or currently unsupported - model: Optional[str] = Field(None) - n: Optional[int] = 1 - presence_penalty: Optional[float] = 0 - frequency_penalty: Optional[float] = 0 - logit_bias: Optional[Dict[str, float]] = Field(None) - user: Optional[str] = Field(None) - - # llama.cpp specific parameters - repeat_penalty: float = 1.1 - - class Config: - schema_extra = { - "example": { - "messages": [ - ChatCompletionRequestMessage( - role="system", content="You are a helpful assistant." - ), - ChatCompletionRequestMessage( - role="user", content="What is the capital of France?" 
- ), - ] - } - } - - -CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) - - -@app.post( - "/v1/chat/completions", - response_model=CreateChatCompletionResponse, -) -async def create_chat_completion( - request: CreateChatCompletionRequest, -) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: - completion_or_chunks = llama.create_chat_completion( - **request.dict( - exclude={ - "model", - "n", - "presence_penalty", - "frequency_penalty", - "logit_bias", - "user", - } - ), - ) - - if request.stream: - - async def server_sent_events( - chat_chunks: Iterator[llama_cpp.ChatCompletionChunk], - ): - for chat_chunk in chat_chunks: - yield dict(data=json.dumps(chat_chunk)) - yield dict(data="[DONE]") - - chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore - - return EventSourceResponse( - server_sent_events(chunks), - ) - completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore - return completion - - -class ModelData(TypedDict): - id: str - object: Literal["model"] - owned_by: str - permissions: List[str] - - -class ModelList(TypedDict): - object: Literal["list"] - data: List[ModelData] - - -GetModelResponse = create_model_from_typeddict(ModelList) - - -@app.get("/v1/models", response_model=GetModelResponse) -def get_models() -> ModelList: - return { - "object": "list", - "data": [ - { - "id": llama.model_path, - "object": "model", - "owned_by": "me", - "permissions": [], - } - ], - } +import uvicorn +from llama_cpp.server.app import create_app if __name__ == "__main__": - import os - import uvicorn + app = create_app() - uvicorn.run(app, host=os.getenv("HOST", "localhost"), port=os.getenv("PORT", 8000)) + uvicorn.run( + app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) + ) From b9098b0ef7309b63ebff99cdfadf641223c15025 Mon Sep 17 00:00:00 2001 From: Lucas Doyle Date: Tue, 2 May 2023 14:08:51 -0700 Subject: [PATCH 10/53] llama_cpp server: prompt is a string Not sure why this union type was here but taking a look at llama.py, prompt is only ever processed as a string for completion This was breaking types when generating an openapi client --- llama_cpp/server/app.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ef8aa4e..595476f 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -126,7 +126,7 @@ repeat_penalty_field = Field( ) class CreateCompletionRequest(BaseModel): - prompt: Union[str, List[str]] = Field( + prompt: Optional[str] = Field( default="", description="The prompt to generate completions for." 
) @@ -175,9 +175,6 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) def create_completion( request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama) ): - if isinstance(request.prompt, list): - request.prompt = "".join(request.prompt) - completion_or_chunks = llama( **request.dict( exclude={ From 0e9f227afd4537018c7fe5c3018b22871708cb65 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Thu, 4 May 2023 18:33:08 +0200 Subject: [PATCH 11/53] Update low level examples --- examples/low_level_api/Chat.py | 70 ++++++ examples/low_level_api/Miku.py | 59 +++++ examples/low_level_api/ReasonAct.py | 49 +++++ examples/low_level_api/common.py | 163 +++++++++----- .../low_level_api/low_level_api_chat_cpp.py | 202 +++++++++++++++--- .../low_level_api/low_level_api_llama_cpp.py | 35 ++- 6 files changed, 486 insertions(+), 92 deletions(-) create mode 100644 examples/low_level_api/Chat.py create mode 100644 examples/low_level_api/Miku.py create mode 100644 examples/low_level_api/ReasonAct.py diff --git a/examples/low_level_api/Chat.py b/examples/low_level_api/Chat.py new file mode 100644 index 0000000..1015508 --- /dev/null +++ b/examples/low_level_api/Chat.py @@ -0,0 +1,70 @@ +#!/bin/python +import sys, os, datetime +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +AI_NAME = env_or_def("AI_NAME", "ChatLLaMa") +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") +USER_NAME = env_or_def("USER_NAME", "USER") +N_PREDICTS = int(env_or_def("N_PREDICTS", "2048")) +N_THREAD = int(env_or_def("N_THREAD", "8")) + +today = datetime.datetime.today() +DATE_YEAR=today.strftime("%Y") +DATE_TIME=today.strftime("%H:%M") + +prompt=f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. +{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision. +There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other. +The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. +The transcript only includes text, it does not include markup like HTML and Markdown. + +{USER_NAME}: Hello, {AI_NAME}! +{AI_NAME}: Hello {USER_NAME}! How may I help you today? +{USER_NAME}: What year is it? +{AI_NAME}: We are in {DATE_YEAR}. +{USER_NAME}: Please tell me the largest city in Europe. +{AI_NAME}: The largest city in Europe is Moscow, the capital of Russia. +{USER_NAME}: What can you tell me about Moscow? +{AI_NAME}: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia’s symbolic center. +{USER_NAME}: What is a cat? +{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. +{USER_NAME}: How do I pass command line arguments to a Node.js program? +{AI_NAME}: The arguments are stored in process.argv. + + argv[0] is the path to the Node. js executable. + argv[1] is the path to the script file. + argv[2] is the first argument passed to the script. + argv[3] is the second argument passed to the script and so on. 
+{USER_NAME}: Name a color. +{AI_NAME}: Blue. +{USER_NAME}: What time is it? +{AI_NAME}: It is {DATE_TIME}. +{USER_NAME}:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + n_ctx=2048, + temp=0.7, + top_k=40, + top_p=0.5, + repeat_last_n=256, + n_batch=1024, + repeat_penalty=1.17647, + model=MODEL, + n_threads=N_THREAD, + n_predict=N_PREDICTS, + use_color=True, + interactive=True, + antiprompt=[f"{USER_NAME}:"], + input_prefix=" ", + prompt=prompt, +) + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/low_level_api/Miku.py b/examples/low_level_api/Miku.py new file mode 100644 index 0000000..eb9a2cf --- /dev/null +++ b/examples/low_level_api/Miku.py @@ -0,0 +1,59 @@ +#!/bin/python +import sys, os +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +AI_NAME = env_or_def("AI_NAME", "Miku") +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") +USER_NAME = env_or_def("USER_NAME", "Anon") +N_PREDICTS = int(env_or_def("N_PREDICTS", "4096")) +N_THREAD = int(env_or_def("N_THREAD", "0")) + +prompt=f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer. +{AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. +{AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help. +{AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad. +{AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her. +The conversation is only between {USER_NAME} and {AI_NAME} +The conversation is only through text, so {AI_NAME} can't see {USER_NAME}'s face or hear his voice. +{AI_NAME} can only communicate through text, so she can't send images or videos. + + +{USER_NAME}: Hello! +{AI_NAME}: /think I wonder what I should say to {USER_NAME}? This is the first time we talk so it's important that I make a good first impression! +{AI_NAME}: Hi! I am {AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^ +{AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :) +{USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant! +{AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off! +{AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that! +{AI_NAME}: What do you like to do in your free time? 
^_^ +{USER_NAME}:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + n_batch=1024, + n_ctx=2048, + n_keep=-1, + repeat_last_n=256, + repeat_penalty=1.17647, + temp=0.7, + top_k=40, + top_p=0.5, + model=MODEL, + n_predict=N_PREDICTS, + use_color=True, + interactive=True, + antiprompt=[f"{USER_NAME}:"], + prompt=prompt, +) + +if N_THREAD > 0: + params.n_threads = N_THREAD + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/low_level_api/ReasonAct.py b/examples/low_level_api/ReasonAct.py new file mode 100644 index 0000000..82e5c44 --- /dev/null +++ b/examples/low_level_api/ReasonAct.py @@ -0,0 +1,49 @@ +#!/bin/python +import sys, os, datetime +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") + +prompt=f"""You run in a loop of Thought, Action, Observation. +At the end of the loop either Answer or restate your Thought and Action. +Use Thought to describe your thoughts about the question you have been asked. +Use Action to run one of these actions available to you: +- calculate[python math expression] +Observation will be the result of running those actions + + +Question: What is 4 * 7 / 3? +Thought: Do I need to use an action? Yes, I use calculate to do math +Action: calculate[4 * 7 / 3] +Observation: 9.3333333333 +Thought: Do I need to use an action? No, have the result +Answer: The calculate tool says it is 9.3333333333 +Question: What is capital of france? +Thought: Do I need to use an action? No, I know the answer +Answer: Paris is the capital of France +Question:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + interactive=True, + interactive_start=True, + top_k=10000, + temp=0.2, + repeat_penalty=1, + n_threads=7, + n_ctx=2048, + antiprompt=["Question:","Observation:"], + model=MODEL, + input_prefix=" ", + n_predict=-1, + prompt=prompt, +) + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index 061ec3a..6c35cc5 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -1,8 +1,9 @@ import os import argparse +import re from dataclasses import dataclass, field -from typing import List, Optional +from typing import List # Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp @@ -12,23 +13,35 @@ class GptParams: seed: int = -1 n_threads: int = min(4, os.cpu_count() or 1) n_predict: int = 128 - repeat_last_n: int = 64 n_parts: int = -1 n_ctx: int = 512 n_batch: int = 8 n_keep: int = 0 + ignore_eos: bool = False + logit_bias: dict[int, float] = field(default_factory=dict) top_k: int = 40 top_p: float = 0.95 + tfs_z: float = 1.00 + typical_p: float = 1.00 temp: float = 0.80 repeat_penalty: float = 1.10 + repeat_last_n: int = 64 + frequency_penalty: float = 0.0 + presence_penalty: float = 0.0 + mirostat: int = 0 + mirostat_tau: float = 5.0 + mirostat_eta: float = 0.1 model: str = "./models/llama-7B/ggml-model.bin" prompt: str = "" + path_session: str = "" input_prefix: str = " " - antiprompt: List[str] = field(default_factory=list) + lora_adapter: str = "" + lora_base: str = "" + memory_f16: bool = True random_prompt: bool = False use_color: bool = False @@ -38,7 +51,7 @@ class GptParams: interactive_start: bool = False instruct: bool = False - ignore_eos: bool = False + penalize_nl: 
bool = True perplexity: bool = False use_mmap: bool = True use_mlock: bool = False @@ -61,59 +74,42 @@ class GptParams: instruct_inp_suffix: str="\n\n### Response:\n\n" -def gpt_params_parse(argv = None, params: Optional[GptParams] = None): - if params is None: - params = GptParams() - +def gpt_params_parse(argv = None): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed") parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads") - parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") - parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") + parser.add_argument("-n", "--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict") + parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx") - parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16") - parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") - parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") - parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") - parser.add_argument("--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict") - parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") - parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch") parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep") + + parser.add_argument( + "-l", + "--logit-bias", + type=str, + action='append', + help="--logit-bias TOKEN_ID(+/-)BIAS", + dest="logit_bias_str" + ) + parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") + parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") + parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") + parser.add_argument("--tfs", type=float, default=1.0, help="tail free sampling, parameter z (1.0 = disabled)",dest="tfs_z") + parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") + parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") + parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") + parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)",dest="tfs_z") + parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)",dest="presence_penalty") + 
parser.add_argument("--mirostat", type=float, default=1.0, help="use Mirostat sampling.",dest="mirostat") + parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau",dest="mirostat_tau") + parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta") + parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model") - parser.add_argument( - "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" - ) - parser.add_argument("--embedding", action="store_true", help="", dest="embedding") - parser.add_argument( - "--interactive-start", - action="store_true", - help="run in interactive mode", - dest="interactive" - ) - parser.add_argument( - "--interactive-first", - action="store_true", - help="run in interactive mode and wait for input right away", - dest="interactive_start" - ) - parser.add_argument( - "-ins", - "--instruct", - action="store_true", - help="run in instruction mode (use with Alpaca or Vicuna models)", - dest="instruct" - ) - parser.add_argument( - "--color", - action="store_true", - help="colorise output to distinguish prompt and user input from generations", - dest="use_color" - ) - parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") - parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap") - parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") - parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt") + parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") + parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") + parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session") + parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") parser.add_argument( "-r", "--reverse-prompt", @@ -122,16 +118,71 @@ def gpt_params_parse(argv = None, params: Optional[GptParams] = None): help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", dest="antiprompt" ) - parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") - parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") - parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") + + parser.add_argument("--lora", type=str, default="", help="apply LoRA adapter (implies --no-mmap)", dest="lora_adapter") + parser.add_argument("--lora-base", type=str, default="", help="optional model to use as a base for the layers modified by the LoRA adapter", dest="lora_base") + + parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16") parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt") - parser.add_argument("--in-prefix", type=str, default="", 
help="string to prefix user inputs with", dest="input_prefix") + parser.add_argument( + "--color", + action="store_true", + help="colorise output to distinguish prompt and user input from generations", + dest="use_color" + ) + parser.add_argument( + "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" + ) + + parser.add_argument("--embedding", action="store_true", help="", dest="embedding") + parser.add_argument( + "--interactive-first", + action="store_true", + help="run in interactive mode and wait for input right away", + dest="interactive_start" + ) + + parser.add_argument( + "-ins", + "--instruct", + action="store_true", + help="run in instruction mode (use with Alpaca or Vicuna models)", + dest="instruct" + ) + parser.add_argument("--no-penalize-nl", action="store_false", help="do not penalize newline token", dest="penalize_nl") + parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") + parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap") + parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") + parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") + parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt") + + #Custom args parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") parser.add_argument("--out-postfix", type=str, default="", help="append to input", dest="output_postfix") parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") + + parser.add_argument( + "--interactive-start", + action="store_true", + help="run in interactive mode", + dest="interactive" + ) + args = parser.parse_args(argv) - return args + + logit_bias_str = args.logit_bias_str + delattr(args, "logit_bias_str") + params = GptParams(**vars(args)) + + if (params.lora_adapter): + params.use_mmap = False + + if (logit_bias_str != None): + for i in logit_bias_str: + if (m := re.match(r"(\d+)([-+]\d+)", i)): + params.logit_bias[int(m.group(1))] = int(m.group(2)) + + return params def gpt_random_prompt(rng): return [ @@ -148,4 +199,4 @@ def gpt_random_prompt(rng): ][rng % 10] if __name__ == "__main__": - print(GptParams(gpt_params_parse())) + print(gpt_params_parse()) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 6fced65..4e129ee 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -10,9 +10,10 @@ Quirks: You should also still be feeding the model with a "primer" prompt that shows it the expected format. 
""" +import ctypes import sys from time import time -from os import cpu_count +from os import cpu_count, path import llama_cpp from common import GptParams, gpt_params_parse, gpt_random_prompt @@ -77,6 +78,7 @@ specified) expect poor results""", file=sys.stderr) # runtime args self.input_consumed = 0 self.n_past = 0 + self.n_session_consumed = 0 self.first_antiprompt = [] self.remaining_tokens = self.params.n_predict self.output_echo = self.params.input_echo @@ -94,6 +96,19 @@ specified) expect poor results""", file=sys.stderr) if (not self.ctx): raise RuntimeError(f"error: failed to load model '{self.params.model}'") + if (self.params.ignore_eos): + self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf") + + if (len(self.params.lora_adapter) > 0): + if (llama_cpp.llama_apply_lora_from_file( + self.ctx, + self.params.lora_adapter, + self.params.lora_base if len(self.params.lora_base) > 0 else None, + self.params.n_threads + ) != 0): + print("error: failed to apply lora adapter") + return + print(file=sys.stderr) print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ | {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) @@ -117,13 +132,49 @@ specified) expect poor results""", file=sys.stderr) with open(self.params.file) as f: self.params.prompt = f.read() + self.session_tokens: list[llama_cpp.llama_token] = [] + if (len(self.params.path_session) > 0): + print(f"attempting to load saved session from '{self.params.path_session}'", file=sys.stderr) + + if (path.exists(self.params.path_session)): + _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))() + _n_token_count_out = llama_cpp.c_int() + if (llama_cpp.llama_load_session_file( + self.ctx, + self.params.path_session.encode("utf8"), + _session_tokens, + self.params.n_ctx, + ctypes.byref(_n_token_count_out) + ) != 0): + print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr) + return + self.session_tokens = _session_tokens[:_n_token_count_out] + print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr) + else: + print(f"session file does not exist, will create", file=sys.stderr) + # tokenize the prompt self.embd = [] self.embd_inp = self._tokenize(self.params.prompt) - if (len(self.embd_inp) > self.params.n_ctx - 4): + if (len(self.embd_inp) > self.n_ctx - 4): raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})") + # debug message about similarity of saved session, if applicable + n_matching_session_tokens = 0 + if len(self.session_tokens) > 0: + for id in self.session_tokens: + if n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[n_matching_session_tokens]: + break + n_matching_session_tokens += 1 + + if n_matching_session_tokens >= len(self.embd_inp): + print(f"session file has exact match for prompt!") + elif n_matching_session_tokens < (len(self.embd_inp) / 2): + print(f"warning: session file has low similarity to prompt ({n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") + else: + print(f"session file matches {n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") + # number of tokens to keep when resetting context if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct): self.params.n_keep = len(self.embd_inp) @@ -132,6 +183,7 @@ specified) expect poor results""", file=sys.stderr) self.inp_suffix = 
self._tokenize(self.params.instruct_inp_suffix, False) # in instruct mode, we inject a prefix and a suffix to each input by the user + self.antiecho = None if (self.params.instruct): self.params.interactive_start = True _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False) @@ -171,16 +223,24 @@ number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr) if len(self.params.input_prefix) > 0: print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr) - print(f"""sampling: temp = {self.params.temp},\ + print(f"""sampling: repeat_last_n = {self.params.repeat_last_n},\ +repeat_penalty = {self.params.repeat_penalty},\ +presence_penalty = {self.params.presence_penalty},\ +frequency_penalty = {self.params.frequency_penalty},\ top_k = {self.params.top_k},\ +tfs_z = {self.params.tfs_z},\ top_p = {self.params.top_p},\ -repeat_last_n = {self.params.repeat_last_n},\ -repeat_penalty = {self.params.repeat_penalty} +typical_p = {self.params.typical_p},\ +temp = {self.params.temp},\ +mirostat = {self.params.mirostat},\ +mirostat_lr = {self.params.mirostat_eta},\ +mirostat_ent = {self.params.mirostat_tau},\ -generate: n_ctx = {self.n_ctx}, \ -n_batch = {self.params.n_batch}, \ -n_predict = {self.params.n_predict}, \ +generate: n_ctx = {self.n_ctx},\ +n_batch = {self.params.n_batch},\ +n_predict = {self.params.n_predict},\ n_keep = {self.params.n_keep} + """, file=sys.stderr) # determine antiprompt tokens @@ -198,6 +258,9 @@ n_keep = {self.params.n_keep} """, file=sys.stderr) self.set_color(CONSOLE_COLOR_PROMPT) + self.need_to_save_session = len(self.params.path_session) > 0 and n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) + + # tokenize a prompt def _tokenize(self, prompt, bos=True): _arr = (llama_cpp.llama_token * (len(prompt) + 1))() @@ -229,31 +292,117 @@ n_keep = {self.params.n_keep} self.n_ctx - int(n_left/2) - len(self.embd):-len(self.embd) ] self.embd = _insert + self.embd + self.params.path_session = "" + + # try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + # REVIEW + if self.n_session_consumed < len(self.session_tokens): + for i in range(len(self.embd)): + if self.embd[i] != self.session_tokens[self.n_session_consumed]: + self.session_tokens = self.session_tokens[:self.n_session_consumed] + break + + self.n_past += 1 + self.n_session_consumed += 1 + + if self.n_session_consumed >= len(self.session_tokens): + i += 1 + break + + if i > 0: + self.embd = self.embd[i:] + + # evaluate tokens in batches + # embd is typically prepared beforehand to fit within a batch, but not always + #TODO BUG: The batching code causes nonsensical generation + """for i in range(0, len(self.embd), self.params.n_batch): + n_eval = self.params.n_batch + _arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval]) + if llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0: + print(f"failed to eval") + return + + self.n_past += n_eval""" if (llama_cpp.llama_eval( self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads ) != 0): raise Exception("Failed to llama_eval!") + if len(self.embd) > 0 and not len(self.params.path_session) > 0: + self.session_tokens.extend(self.embd) + self.n_session_consumed = len(self.session_tokens) + self.n_past += len(self.embd) self.embd = [] - if len(self.embd_inp) <= self.input_consumed: + if len(self.embd_inp) <= self.input_consumed: #&& !is_interacting # out of user input, sample next token + top_k = 
llama_cpp.llama_n_vocab(self.ctx) if self.params.top_k <= 0 else self.params.top_k + repeat_last_n = self.n_ctx if self.params.repeat_last_n < 0 else self.params.repeat_last_n - if (self.params.ignore_eos): - logits = llama_cpp.llama_get_logits(self.ctx) - logits[llama_cpp.llama_token_eos()] = llama_cpp.c_float(0) + # optionally save the session on first sample (for faster prompt loading next time) + if len(self.params.path_session) > 0 and self.need_to_save_session: + self.need_to_save_session = False + llama_cpp.llama_save_session_file( + self.ctx, + self.params.path_session.encode("utf8"), + self.session_tokens, + len(self.session_tokens) + ) + + id = 0 + + logits = llama_cpp.llama_get_logits(self.ctx) + n_vocab = llama_cpp.llama_n_vocab(self.ctx) + + # Apply params.logit_bias map + for key, value in self.params.logit_bias.items(): + logits[key] += value + + _arr = (llama_cpp.llama_token_data * n_vocab)(*[ + llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + for token_id in range(n_vocab) + ]) + candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False)) + + # Apply penalties + nl_logit = logits[llama_cpp.llama_token_nl()] + last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx) + + _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:]) + llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p, + _arr, + last_n_repeat, self.params.repeat_penalty) + llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p, + _arr, + last_n_repeat, self.params.frequency_penalty, self.params.presence_penalty) + + if not self.params.penalize_nl: + logits[llama_cpp.llama_token_nl()] = nl_logit + + if self.params.temp <= 0: + # Greedy sampling + id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p) + else: + if self.params.mirostat == 1: + mirostat_mu = 2.0 * self.params.mirostat_tau + mirostat_m = 100 + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) + id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_m, mirostat_mu) + elif self.params.mirostat == 2: + mirostat_mu = 2.0 * self.params.mirostat_tau + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) + id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_mu) + else: + # Temperature sampling + llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, self.params.tfs_z) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, self.params.typical_p) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, self.params.top_p) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) + id = llama_cpp.llama_sample_token(self.ctx, candidates_p) + # print("`{}`".format(candidates_p.size)) - _arr = self.last_n_tokens[-min(self.params.repeat_last_n, self.n_past):] - id = llama_cpp.llama_sample_top_p_top_k( - self.ctx, - (llama_cpp.llama_token * len(_arr))(*_arr), - len(_arr), - self.params.top_k, - self.params.top_p, - self.params.temp, - self.params.repeat_penalty, - ) self.last_n_tokens.pop(0) self.last_n_tokens.append(id) @@ -288,7 +437,7 @@ n_keep = {self.params.n_keep} # display tokens if self.output_echo: for id in self.embd: - if self.params.instruct: + if self.antiecho != None: for r in self.antiecho(id): yield r 
else: @@ -316,7 +465,7 @@ n_keep = {self.params.n_keep} if (not self.params.instruct): for i in self.llama_token_eot: yield i - break + break # respect n_predict even if antiprompt is present if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1): @@ -356,7 +505,7 @@ n_keep = {self.params.n_keep} def output(self): self.remaining_tokens = self.params.n_predict for id in self.generate(): - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore") + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") # read user input def read_input(self): @@ -415,8 +564,7 @@ The transcript only includes text, it does not include markup like HTML and Mark {USER_NAME}: Name a color. {AI_NAME}: Blue {USER_NAME}:""" - args = gpt_params_parse() - params = GptParams(**vars(args)) + params = gpt_params_parse() with LLaMAInteract(params) as m: m.interact() diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py index 4fb5a03..9e38ec7 100644 --- a/examples/low_level_api/low_level_api_llama_cpp.py +++ b/examples/low_level_api/low_level_api_llama_cpp.py @@ -37,6 +37,10 @@ embd = [] last_n_size = 64 last_n_tokens_data = [0] * last_n_size n_batch = 24 +last_n_repeat = 64 +repeat_penalty = 1 +frequency_penalty = 0.0 +presence_penalty = 0.0 while remaining_tokens > 0: if len(embd) > 0: @@ -47,15 +51,28 @@ while remaining_tokens > 0: n_past += len(embd) embd = [] if len(embd_inp) <= input_consumed: - id = llama_cpp.llama_sample_top_p_top_k( - ctx, - (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data), - len(last_n_tokens_data), - 40, - 0.8, - 0.2, - 1.0 / 0.85, - ) + logits = llama_cpp.llama_get_logits(ctx) + n_vocab = llama_cpp.llama_n_vocab(ctx) + + _arr = (llama_cpp.llama_token_data * n_vocab)(*[ + llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + for token_id in range(n_vocab) + ]) + candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False)) + + _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data) + llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p, + _arr, + last_n_repeat, repeat_penalty) + llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p, + _arr, + last_n_repeat, frequency_penalty, presence_penalty) + + llama_cpp.llama_sample_top_k(ctx, candidates_p, 40) + llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8) + llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2) + id = llama_cpp.llama_sample_token(ctx, candidates_p) + last_n_tokens_data = last_n_tokens_data[1:] + [id] embd.append(id) input_noecho = False From c9bb602b2682ae12c5690829fee1635fcdfc707c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 5 May 2023 23:25:53 +0000 Subject: [PATCH 12/53] Bump black from 23.1.0 to 23.3.0 Bumps [black](https://github.com/psf/black) from 23.1.0 to 23.3.0. - [Release notes](https://github.com/psf/black/releases) - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) - [Commits](https://github.com/psf/black/compare/23.1.0...23.3.0) --- updated-dependencies: - dependency-name: black dependency-type: direct:development update-type: version-update:semver-minor ... 
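For reference, a condensed sketch of the candidate-array sampling flow that the updated low_level_api_llama_cpp.py example above switches to; the helper name `sample_next_token` and the hard-coded values (top_k=40, top_p=0.8, temp=0.2) are illustrative only, while the call shapes mirror the diff.

```python
import llama_cpp

def sample_next_token(ctx, last_n_tokens_data, last_n_repeat=64,
                      repeat_penalty=1.0, frequency_penalty=0.0,
                      presence_penalty=0.0):
    # Build a llama_token_data array covering the whole vocabulary
    logits = llama_cpp.llama_get_logits(ctx)
    n_vocab = llama_cpp.llama_n_vocab(ctx)
    _arr = (llama_cpp.llama_token_data * n_vocab)(*[
        llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
        for token_id in range(n_vocab)
    ])
    candidates_p = llama_cpp.ctypes.pointer(
        llama_cpp.llama_token_data_array(_arr, len(_arr), False))

    # Penalize recently generated tokens before truncating the candidate set
    _last = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
    llama_cpp.llama_sample_repetition_penalty(
        ctx, candidates_p, _last, last_n_repeat, repeat_penalty)
    llama_cpp.llama_sample_frequency_and_presence_penalties(
        ctx, candidates_p, _last, last_n_repeat,
        frequency_penalty, presence_penalty)

    # Top-k / top-p / temperature, then draw the next token id
    llama_cpp.llama_sample_top_k(ctx, candidates_p, 40)
    llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8)
    llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2)
    return llama_cpp.llama_sample_token(ctx, candidates_p)
```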
Signed-off-by: dependabot[bot] --- poetry.lock | 56 +++++++++++++++++++++++++------------------------- pyproject.toml | 2 +- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/poetry.lock b/poetry.lock index a505168..129f923 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. [[package]] name = "anyio" @@ -42,37 +42,37 @@ tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy [[package]] name = "black" -version = "23.1.0" +version = "23.3.0" description = "The uncompromising code formatter." category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "black-23.1.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:b6a92a41ee34b883b359998f0c8e6eb8e99803aa8bf3123bf2b2e6fec505a221"}, - {file = "black-23.1.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:57c18c5165c1dbe291d5306e53fb3988122890e57bd9b3dcb75f967f13411a26"}, - {file = "black-23.1.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:9880d7d419bb7e709b37e28deb5e68a49227713b623c72b2b931028ea65f619b"}, - {file = "black-23.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6663f91b6feca5d06f2ccd49a10f254f9298cc1f7f49c46e498a0771b507104"}, - {file = "black-23.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9afd3f493666a0cd8f8df9a0200c6359ac53940cbde049dcb1a7eb6ee2dd7074"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:bfffba28dc52a58f04492181392ee380e95262af14ee01d4bc7bb1b1c6ca8d27"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c1c476bc7b7d021321e7d93dc2cbd78ce103b84d5a4cf97ed535fbc0d6660648"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:382998821f58e5c8238d3166c492139573325287820963d2f7de4d518bd76958"}, - {file = "black-23.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bf649fda611c8550ca9d7592b69f0637218c2369b7744694c5e4902873b2f3a"}, - {file = "black-23.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:121ca7f10b4a01fd99951234abdbd97728e1240be89fde18480ffac16503d481"}, - {file = "black-23.1.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:a8471939da5e824b891b25751955be52ee7f8a30a916d570a5ba8e0f2eb2ecad"}, - {file = "black-23.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8178318cb74f98bc571eef19068f6ab5613b3e59d4f47771582f04e175570ed8"}, - {file = "black-23.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a436e7881d33acaf2536c46a454bb964a50eff59b21b51c6ccf5a40601fbef24"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:a59db0a2094d2259c554676403fa2fac3473ccf1354c1c63eccf7ae65aac8ab6"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:0052dba51dec07ed029ed61b18183942043e00008ec65d5028814afaab9a22fd"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:49f7b39e30f326a34b5c9a4213213a6b221d7ae9d58ec70df1c4a307cf2a1580"}, - {file = "black-23.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:162e37d49e93bd6eb6f1afc3e17a3d23a823042530c37c3c42eeeaf026f38468"}, - {file = "black-23.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b70eb40a78dfac24842458476135f9b99ab952dd3f2dab738c1881a9b38b753"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_arm64.whl", hash = 
"sha256:a29650759a6a0944e7cca036674655c2f0f63806ddecc45ed40b7b8aa314b651"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:bb460c8561c8c1bec7824ecbc3ce085eb50005883a6203dcfb0122e95797ee06"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c91dfc2c2a4e50df0026f88d2215e166616e0c80e86004d0003ece0488db2739"}, - {file = "black-23.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a951cc83ab535d248c89f300eccbd625e80ab880fbcfb5ac8afb5f01a258ac9"}, - {file = "black-23.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0680d4380db3719ebcfb2613f34e86c8e6d15ffeabcf8ec59355c5e7b85bb555"}, - {file = "black-23.1.0-py3-none-any.whl", hash = "sha256:7a0f701d314cfa0896b9001df70a530eb2472babb76086344e688829efd97d32"}, - {file = "black-23.1.0.tar.gz", hash = "sha256:b0bd97bea8903f5a2ba7219257a44e3f1f9d00073d6cc1add68f0beec69692ac"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"}, + {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"}, + {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"}, + {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"}, + {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"}, + {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"}, + {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"}, + {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"}, + {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"}, + {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = 
"sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"}, + {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"}, + {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"}, + {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"}, + {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"}, ] [package.dependencies] @@ -1458,4 +1458,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "aa15e57300668bd23c051b4cd87bec4c1a58dcccd2f2b4767579fea7f2c5fa41" +content-hash = "714083b7f30a677f9a358a9633970fb88b8198d50558a0b50bf311d4a209ed4c" diff --git a/pyproject.toml b/pyproject.toml index ca0346f..a164ef7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ typing-extensions = "^4.5.0" [tool.poetry.group.dev.dependencies] -black = "^23.1.0" +black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.2" mkdocstrings = {extras = ["python"], version = "^0.20.0"} From 1895c1103379156f4bd2ae895cdab080ab9cd104 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 13:18:25 +0200 Subject: [PATCH 13/53] Rename postfix to suffix to match upstream --- examples/low_level_api/Chat.py | 1 + examples/low_level_api/common.py | 4 ++-- examples/low_level_api/low_level_api_chat_cpp.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/low_level_api/Chat.py b/examples/low_level_api/Chat.py index 1015508..fcef8cd 100644 --- a/examples/low_level_api/Chat.py +++ b/examples/low_level_api/Chat.py @@ -63,6 +63,7 @@ params = GptParams( interactive=True, antiprompt=[f"{USER_NAME}:"], input_prefix=" ", + input_suffix=f"{AI_NAME}:", prompt=prompt, ) diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index 6c35cc5..7a25582 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -37,6 +37,7 @@ class GptParams: prompt: str = "" path_session: str = "" input_prefix: str = " " + input_suffix: str = "" antiprompt: List[str] = field(default_factory=list) lora_adapter: str = "" @@ -64,7 +65,6 @@ class GptParams: # Set to "\nUser:" etc. 
# This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:"" fix_prefix: str = "" - output_postfix: str = "" input_echo: bool = True, # Default instructions for Alpaca @@ -110,6 +110,7 @@ def gpt_params_parse(argv = None): parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session") parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") + parser.add_argument("--in-suffix", type=str, default="", help="append to input", dest="input_suffix") parser.add_argument( "-r", "--reverse-prompt", @@ -158,7 +159,6 @@ def gpt_params_parse(argv = None): #Custom args parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") - parser.add_argument("--out-postfix", type=str, default="", help="append to input", dest="output_postfix") parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") parser.add_argument( diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 4e129ee..72ced2b 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -527,8 +527,8 @@ n_keep = {self.params.n_keep} self.input(self.read_input()) else: print(self.params.input_prefix, end="") - self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.output_postfix}") - print(self.params.output_postfix,end="") + self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}") + print(self.params.input_suffix,end="") self.set_color(CONSOLE_COLOR_DEFAULT) try: From 9797394c81133eebb367bd0673b6c89eefd5a38e Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 13:27:52 +0200 Subject: [PATCH 14/53] Wrong logit_bias parsed type --- examples/low_level_api/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index 7a25582..2bfe356 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -180,7 +180,7 @@ def gpt_params_parse(argv = None): if (logit_bias_str != None): for i in logit_bias_str: if (m := re.match(r"(\d+)([-+]\d+)", i)): - params.logit_bias[int(m.group(1))] = int(m.group(2)) + params.logit_bias[int(m.group(1))] = float(m.group(2)) return params From 3ceb47b597a8819db3afa851df4ae3211f2cb680 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 13:35:50 +0200 Subject: [PATCH 15/53] Fix mirastat requiring c_float --- .../low_level_api/low_level_api_chat_cpp.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 72ced2b..55b24cd 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -357,7 +357,7 @@ n_keep = {self.params.n_keep} # Apply params.logit_bias map for key, value in self.params.logit_bias.items(): - logits[key] += value + logits[key] += llama_cpp.c_float(value) _arr = (llama_cpp.llama_token_data * n_vocab)(*[ llama_cpp.llama_token_data(token_id, 
logits[token_id], 0.0) @@ -372,14 +372,14 @@ n_keep = {self.params.n_keep} _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:]) llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p, _arr, - last_n_repeat, self.params.repeat_penalty) + last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty)) llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p, _arr, - last_n_repeat, self.params.frequency_penalty, self.params.presence_penalty) + last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty)) if not self.params.penalize_nl: logits[llama_cpp.llama_token_nl()] = nl_logit - + if self.params.temp <= 0: # Greedy sampling id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p) @@ -387,19 +387,19 @@ n_keep = {self.params.n_keep} if self.params.mirostat == 1: mirostat_mu = 2.0 * self.params.mirostat_tau mirostat_m = 100 - llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) - id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_m, mirostat_mu) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_int(mirostat_m), llama_cpp.c_float(mirostat_mu)) elif self.params.mirostat == 2: mirostat_mu = 2.0 * self.params.mirostat_tau - llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) - id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_mu) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu)) else: # Temperature sampling llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k) - llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, self.params.tfs_z) - llama_cpp.llama_sample_typical(self.ctx, candidates_p, self.params.typical_p) - llama_cpp.llama_sample_top_p(self.ctx, candidates_p, self.params.top_p) - llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z)) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p)) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p)) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) id = llama_cpp.llama_sample_token(self.ctx, candidates_p) # print("`{}`".format(candidates_p.size)) From 996f63e9e1804b2d9a91c5081665ea536a85542f Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 15:16:58 +0200 Subject: [PATCH 16/53] Add utf8 to chat example --- examples/low_level_api/common.py | 2 +- .../low_level_api/low_level_api_chat_cpp.py | 73 +++++++------- examples/low_level_api/util.py | 95 +++++++++++++++++++ 3 files changed, 130 insertions(+), 40 deletions(-) create mode 100644 examples/low_level_api/util.py diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py 
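As a small illustration of the logit-bias parsing fix in PATCH 14 above: the bias value is now stored as a float rather than an int, so it can be added directly to the logits array in the chat example. The token-id/bias strings below are made-up inputs, not values from the patch.

```python
import re

logit_bias = {}
for entry in ["15043+1", "50256-2"]:  # hypothetical "<token id><+/-bias>" strings
    if (m := re.match(r"(\d+)([-+]\d+)", entry)):
        logit_bias[int(m.group(1))] = float(m.group(2))

print(logit_bias)  # {15043: 1.0, 50256: -2.0} -- float biases, ready to add to logits
```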
index 2bfe356..55d08db 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -102,7 +102,7 @@ def gpt_params_parse(argv = None): parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)",dest="tfs_z") parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)",dest="presence_penalty") parser.add_argument("--mirostat", type=float, default=1.0, help="use Mirostat sampling.",dest="mirostat") - parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau",dest="mirostat_tau") + parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau represents the average surprise value",dest="mirostat_tau") parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta") parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model") diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 55b24cd..9a9bc01 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -17,34 +17,7 @@ from os import cpu_count, path import llama_cpp from common import GptParams, gpt_params_parse, gpt_random_prompt - -ANSI_COLOR_RESET = "\x1b[0m" -ANSI_COLOR_YELLOW = "\x1b[33m" -ANSI_BOLD = "\x1b[1m" -ANSI_COLOR_GREEN = "\x1b[32m" - -CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET -CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW -CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN - -# Iterative search -# Actively searches and prevents a pattern from being returned -class IterSearch: - def __init__(self, pattern): - self.pattern = list(pattern) - self.buffer = [] - - def __call__(self, char): - self.buffer += [char] - - if (self.pattern[:len(self.buffer)] == self.buffer): - if (len(self.buffer) >= len(self.pattern)): - self.buffer.clear() - return [] - - _tmp = self.buffer[:] - self.buffer.clear() - return _tmp +import util # A LLaMA interactive session class LLaMAInteract: @@ -82,6 +55,7 @@ specified) expect poor results""", file=sys.stderr) self.first_antiprompt = [] self.remaining_tokens = self.params.n_predict self.output_echo = self.params.input_echo + self.multibyte_fix = [] # model load self.lparams = llama_cpp.llama_context_default_params() @@ -188,7 +162,7 @@ specified) expect poor results""", file=sys.stderr) self.params.interactive_start = True _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False) self.first_antiprompt.append(_ptn) - self.antiecho = IterSearch(_ptn) + self.antiecho = util.IterSearch(_ptn) # enable interactive mode if reverse prompt or interactive start is specified if (len(self.params.antiprompt) != 0 or self.params.interactive_start): @@ -256,14 +230,14 @@ n_keep = {self.params.n_keep} - If you want to submit another line, end your input in '\\'. 
""", file=sys.stderr) - self.set_color(CONSOLE_COLOR_PROMPT) + self.set_color(util.CONSOLE_COLOR_PROMPT) self.need_to_save_session = len(self.params.path_session) > 0 and n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) # tokenize a prompt def _tokenize(self, prompt, bos=True): - _arr = (llama_cpp.llama_token * (len(prompt) + 1))() + _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))() _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos) return _arr[:_n] @@ -295,7 +269,6 @@ n_keep = {self.params.n_keep} self.params.path_session = "" # try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) - # REVIEW if self.n_session_consumed < len(self.session_tokens): for i in range(len(self.embd)): if self.embd[i] != self.session_tokens[self.n_session_consumed]: @@ -445,7 +418,7 @@ n_keep = {self.params.n_keep} # reset color to default if we there is no pending user input if (self.params.input_echo and len(self.embd_inp) == self.input_consumed): - self.set_color(CONSOLE_COLOR_DEFAULT) + self.set_color(util.CONSOLE_COLOR_DEFAULT) if (self.params.interactive and len(self.embd_inp) <= self.input_consumed): # if antiprompt is present, stop @@ -486,12 +459,12 @@ n_keep = {self.params.n_keep} def exit(self): llama_cpp.llama_free(self.ctx) - self.set_color(CONSOLE_COLOR_DEFAULT) + self.set_color(util.CONSOLE_COLOR_DEFAULT) # return past text def past(self): for id in self.last_n_tokens[-self.n_past:]: - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore") + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf8", errors="ignore") # write input def input(self, prompt: str): @@ -505,7 +478,29 @@ n_keep = {self.params.n_keep} def output(self): self.remaining_tokens = self.params.n_predict for id in self.generate(): - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + cur_char = llama_cpp.llama_token_to_str(self.ctx, id) + + # Add remainder of missing bytes + if None in self.multibyte_fix: + self.multibyte_fix[self.multibyte_fix.index(None)] = cur_char + + # Return completed utf char + if len(self.multibyte_fix) > 0 and not None in self.multibyte_fix: + yield (b"".join(self.multibyte_fix)).decode("utf8") + self.multibyte_fix = [] + continue + + # Contains multi-byte UTF8 + for num, pattern in [(2, 192), (3, 224), (4, 240)]: + # Bitwise AND check + if pattern & int.from_bytes(cur_char) == pattern: + self.multibyte_fix = [cur_char] + ([None] * (num-1)) + + # Stop incomplete bytes from passing + if len(self.multibyte_fix) > 0: + continue + + yield cur_char.decode("utf8") # read user input def read_input(self): @@ -521,7 +516,7 @@ n_keep = {self.params.n_keep} self.params.input_echo = False while self.params.interactive: - self.set_color(CONSOLE_COLOR_USER_INPUT) + self.set_color(util.CONSOLE_COLOR_USER_INPUT) if (self.params.instruct): print('\n> ', end="") self.input(self.read_input()) @@ -529,13 +524,13 @@ n_keep = {self.params.n_keep} print(self.params.input_prefix, end="") self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}") print(self.params.input_suffix,end="") - self.set_color(CONSOLE_COLOR_DEFAULT) + self.set_color(util.CONSOLE_COLOR_DEFAULT) try: for i in self.output(): print(i,end="",flush=True) except KeyboardInterrupt: - self.set_color(CONSOLE_COLOR_DEFAULT) + self.set_color(util.CONSOLE_COLOR_DEFAULT) if not self.params.instruct: print(self.params.fix_prefix,end="") self.input(self.params.fix_prefix) diff --git 
a/examples/low_level_api/util.py b/examples/low_level_api/util.py new file mode 100644 index 0000000..9d0ec2f --- /dev/null +++ b/examples/low_level_api/util.py @@ -0,0 +1,95 @@ + +ANSI_COLOR_RESET = "\x1b[0m" +ANSI_COLOR_YELLOW = "\x1b[33m" +ANSI_BOLD = "\x1b[1m" +ANSI_COLOR_GREEN = "\x1b[32m" + +CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET +CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW +CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN + +# Iterative search +# Actively searches and prevents a pattern from being returned +class IterSearch: + def __init__(self, pattern): + self.pattern = list(pattern) + self.buffer = [] + + def __call__(self, char): + self.buffer += [char] + + if (self.pattern[:len(self.buffer)] == self.buffer): + if (len(self.buffer) >= len(self.pattern)): + self.buffer.clear() + return [] + + _tmp = self.buffer[:] + self.buffer.clear() + return _tmp + +class Circle: + def __init__(self, size, default=0): + self.list = [default] * size + self.maxsize = size + self.size = 0 + self.offset = 0 + + def append(self, elem): + if self.size < self.maxsize: + self.list[self.size] = elem + self.size += 1 + else: + self.list[self.offset] = elem + self.offset = (self.offset + 1) % self.maxsize + + def __getitem__(self, val): + if isinstance(val, int): + if 0 > val or val >= self.size: + raise IndexError('Index out of range') + return self.list[val] if self.size < self.maxsize else self.list[(self.offset + val) % self.maxsize] + elif isinstance(val, slice): + start, stop, step = val.start, val.stop, val.step + if step is None: + step = 1 + if start is None: + start = 0 + if stop is None: + stop = self.size + if start < 0: + start = self.size + start + if stop < 0: + stop = self.size + stop + + indices = range(start, stop, step) + return [self.list[(self.offset + i) % self.maxsize] for i in indices if i < self.size] + else: + raise TypeError('Invalid argument type') + + + + +if __name__ == "__main__": + c = Circle(5) + + c.append(1) + print(c.list) + print(c[:]) + assert c[0] == 1 + assert c[:5] == [1] + + for i in range(2,5+1): + c.append(i) + print(c.list) + print(c[:]) + assert c[0] == 1 + assert c[:5] == [1,2,3,4,5] + + for i in range(5+1,9+1): + c.append(i) + print(c.list) + print(c[:]) + assert c[0] == 5 + assert c[:5] == [5,6,7,8,9] + #assert c[:-5] == [5,6,7,8,9] + assert c[:10] == [5,6,7,8,9] + From fd80ddf703373f523bda4e62d24564fa8930f670 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 22:22:28 +0200 Subject: [PATCH 17/53] Fix a bug with wrong type --- examples/low_level_api/low_level_api_chat_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 9a9bc01..272b454 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -330,7 +330,7 @@ n_keep = {self.params.n_keep} # Apply params.logit_bias map for key, value in self.params.logit_bias.items(): - logits[key] += llama_cpp.c_float(value) + logits[key] += value _arr = (llama_cpp.llama_token_data * n_vocab)(*[ llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) From fdcab2286c8d9e91779590d6facb3aee34456169 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 6 May 2023 21:11:57 +0000 Subject: [PATCH 18/53] Bump mkdocs-material from 9.1.4 to 9.1.9 Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.1.4 to 
9.1.9. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.1.4...9.1.9) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 129f923..287d05e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -792,14 +792,14 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.4" +version = "9.1.9" description = "Documentation that simply works" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.4-py3-none-any.whl", hash = "sha256:4c92dcf9365068259bef3eed8e0dd5410056b6f7187bdea2d52848c0f94cd94c"}, - {file = "mkdocs_material-9.1.4.tar.gz", hash = "sha256:c3a8943e9e4a7d2624291da365bbccf0b9f88688aa6947a46260d8c165cd4389"}, + {file = "mkdocs_material-9.1.9-py3-none-any.whl", hash = "sha256:7db24261cb17400e132c46d17eea712bfe71056d892a9beba32cf68210297141"}, + {file = "mkdocs_material-9.1.9.tar.gz", hash = "sha256:74d8da1371ab3a326868fe47bae3cbc4aa22e93c048b4ca5117e6817b88bd734"}, ] [package.dependencies] @@ -1458,4 +1458,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "714083b7f30a677f9a358a9633970fb88b8198d50558a0b50bf311d4a209ed4c" +content-hash = "a921481e74f47e925f7ec2814fa0bc2e07707cb36fd12d9b33ecc6b0402a27c8" diff --git a/pyproject.toml b/pyproject.toml index a164ef7..55ca8ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.2" mkdocstrings = {extras = ["python"], version = "^0.20.0"} -mkdocs-material = "^9.1.4" +mkdocs-material = "^9.1.9" pytest = "^7.2.2" httpx = "^0.24.0" From 2a21b8f69e7049f03a4ab3e0b5ec51d81456a796 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 6 May 2023 21:16:08 +0000 Subject: [PATCH 19/53] Bump mkdocs from 1.4.2 to 1.4.3 Bumps [mkdocs](https://github.com/mkdocs/mkdocs) from 1.4.2 to 1.4.3. - [Release notes](https://github.com/mkdocs/mkdocs/releases) - [Commits](https://github.com/mkdocs/mkdocs/compare/1.4.2...1.4.3) --- updated-dependencies: - dependency-name: mkdocs dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 287d05e..d30dc8f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -747,14 +747,14 @@ files = [ [[package]] name = "mkdocs" -version = "1.4.2" +version = "1.4.3" description = "Project documentation with Markdown." 
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs-1.4.2-py3-none-any.whl", hash = "sha256:c8856a832c1e56702577023cd64cc5f84948280c1c0fcc6af4cd39006ea6aa8c"}, - {file = "mkdocs-1.4.2.tar.gz", hash = "sha256:8947af423a6d0facf41ea1195b8e1e8c85ad94ac95ae307fe11232e0424b11c5"}, + {file = "mkdocs-1.4.3-py3-none-any.whl", hash = "sha256:6ee46d309bda331aac915cd24aab882c179a933bd9e77b80ce7d2eaaa3f689dd"}, + {file = "mkdocs-1.4.3.tar.gz", hash = "sha256:5955093bbd4dd2e9403c5afaf57324ad8b04f16886512a3ee6ef828956481c57"}, ] [package.dependencies] @@ -1458,4 +1458,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "a921481e74f47e925f7ec2814fa0bc2e07707cb36fd12d9b33ecc6b0402a27c8" +content-hash = "f2de41d10587a7f21e4891584de2c7152dfa6f75809144778b2dc34d93395abe" diff --git a/pyproject.toml b/pyproject.toml index 55ca8ce..1f79b74 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ typing-extensions = "^4.5.0" [tool.poetry.group.dev.dependencies] black = "^23.3.0" twine = "^4.0.2" -mkdocs = "^1.4.2" +mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.20.0"} mkdocs-material = "^9.1.9" pytest = "^7.2.2" From 33d41fb8f3f949e29d4038fdf542ee8445af190a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 7 May 2023 00:07:39 +0000 Subject: [PATCH 20/53] Bump pytest from 7.2.2 to 7.3.1 Bumps [pytest](https://github.com/pytest-dev/pytest) from 7.2.2 to 7.3.1. - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/7.2.2...7.3.1) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:development update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 30 +++++------------------------- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/poetry.lock b/poetry.lock index d30dc8f..0bd08d5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -21,25 +21,6 @@ doc = ["packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] test = ["contextlib2", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (<0.15)", "uvloop (>=0.15)"] trio = ["trio (>=0.16,<0.22)"] -[[package]] -name = "attrs" -version = "22.2.0" -description = "Classes Without Boilerplate" -category = "dev" -optional = false -python-versions = ">=3.6" -files = [ - {file = "attrs-22.2.0-py3-none-any.whl", hash = "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836"}, - {file = "attrs-22.2.0.tar.gz", hash = "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"}, -] - -[package.extras] -cov = ["attrs[tests]", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] -dev = ["attrs[docs,tests]"] -docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope.interface"] -tests = ["attrs[tests-no-zope]", "zope.interface"] -tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy (>=0.971,<0.990)", "mypy (>=0.971,<0.990)", "pympler", "pympler", "pytest (>=4.3.0)", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-mypy-plugins", "pytest-xdist[psutil]", "pytest-xdist[psutil]"] - [[package]] name = "black" version = "23.3.0" @@ -1007,18 +988,17 @@ pyyaml = "*" [[package]] name = "pytest" -version = "7.2.2" +version = "7.3.1" description = "pytest: simple powerful testing with Python" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.2.2-py3-none-any.whl", hash = "sha256:130328f552dcfac0b1cec75c12e3f005619dc5f874f0a06e8ff7263f0ee6225e"}, - {file = "pytest-7.2.2.tar.gz", hash = "sha256:c99ab0c73aceb050f68929bc93af19ab6db0558791c6a0715723abe9d0ade9d4"}, + {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, + {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, ] [package.dependencies] -attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" @@ -1027,7 +1007,7 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] [[package]] name = "python-dateutil" @@ -1458,4 +1438,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "f2de41d10587a7f21e4891584de2c7152dfa6f75809144778b2dc34d93395abe" +content-hash = "e02fff3d4a50fbc9a89f6f001409a5f066c26a341c2d5f2dfbfb32f07e711eca" diff --git a/pyproject.toml b/pyproject.toml index 1f79b74..6f83611 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.20.0"} mkdocs-material = "^9.1.9" -pytest = "^7.2.2" 
+pytest = "^7.3.1" httpx = "^0.24.0" [build-system] From ae3c639764359890e692776cfb87ff84b911532f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 7 May 2023 00:16:31 +0000 Subject: [PATCH 21/53] Bump mkdocstrings from 0.20.0 to 0.21.2 Bumps [mkdocstrings](https://github.com/mkdocstrings/mkdocstrings) from 0.20.0 to 0.21.2. - [Release notes](https://github.com/mkdocstrings/mkdocstrings/releases) - [Changelog](https://github.com/mkdocstrings/mkdocstrings/blob/master/CHANGELOG.md) - [Commits](https://github.com/mkdocstrings/mkdocstrings/compare/0.20.0...0.21.2) --- updated-dependencies: - dependency-name: mkdocstrings dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- poetry.lock | 9 +++++---- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 0bd08d5..5b364a7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -808,14 +808,14 @@ files = [ [[package]] name = "mkdocstrings" -version = "0.20.0" +version = "0.21.2" description = "Automatic documentation from sources, for MkDocs." category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocstrings-0.20.0-py3-none-any.whl", hash = "sha256:f17fc2c4f760ec302b069075ef9e31045aa6372ca91d2f35ded3adba8e25a472"}, - {file = "mkdocstrings-0.20.0.tar.gz", hash = "sha256:c757f4f646d4f939491d6bc9256bfe33e36c5f8026392f49eaa351d241c838e5"}, + {file = "mkdocstrings-0.21.2-py3-none-any.whl", hash = "sha256:949ef8da92df9d692ca07be50616459a6b536083a25520fd54b00e8814ce019b"}, + {file = "mkdocstrings-0.21.2.tar.gz", hash = "sha256:304e56a2e90595708a38a13a278e538a67ad82052dd5c8b71f77a604a4f3d911"}, ] [package.dependencies] @@ -826,6 +826,7 @@ mkdocs = ">=1.2" mkdocs-autorefs = ">=0.3.1" mkdocstrings-python = {version = ">=0.5.2", optional = true, markers = "extra == \"python\""} pymdown-extensions = ">=6.3" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.10\""} [package.extras] crystal = ["mkdocstrings-crystal (>=0.3.4)"] @@ -1438,4 +1439,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "e02fff3d4a50fbc9a89f6f001409a5f066c26a341c2d5f2dfbfb32f07e711eca" +content-hash = "e87403dcd0a0b8484436b02c392326adfaf22b8d7e182d77e4a155c67a7435bc" diff --git a/pyproject.toml b/pyproject.toml index 6f83611..a11faef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ typing-extensions = "^4.5.0" black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" -mkdocstrings = {extras = ["python"], version = "^0.20.0"} +mkdocstrings = {extras = ["python"], version = "^0.21.2"} mkdocs-material = "^9.1.9" pytest = "^7.3.1" httpx = "^0.24.0" From bc853e3742fd2a4718bd66bd501bdb5ede50f6d3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 6 May 2023 21:32:50 -0400 Subject: [PATCH 22/53] Fix type for eval_logits in LlamaState object --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index a643f51..fc91ea4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -51,7 +51,7 @@ class LlamaState: def __init__( self, eval_tokens: Deque[llama_cpp.llama_token], - eval_logits: Deque[List[llama_cpp.c_float]], + eval_logits: Deque[List[float]], llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] llama_state_size: llama_cpp.c_size_t, ): From 
c76e0913bbc6a039f5456ca44f4d84966e5c14fd Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 6 May 2023 22:18:31 -0400 Subject: [PATCH 23/53] Update issue templates --- .github/ISSUE_TEMPLATE/bug_report.md | 80 +++++++++++++++++++++++ .github/ISSUE_TEMPLATE/feature_request.md | 20 ++++++ 2 files changed, 100 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..b8e33e5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,80 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +# Prerequisites + +Please answer the following questions for yourself before submitting an issue. + +- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now. +- [ ] I carefully followed the [README.md](https://github.com/abetlen/llama-cpp-python/blob/main/README.md). +- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed). +- [ ] I reviewed the [Discussions](https://github.com/abetlen/llama-cpp-python/discussions), and have a new bug or useful enhancement to share. + +# Expected Behavior + +Please provide a detailed written description of what you were trying to do, and what you expected `llama-cpp-python` to do. + +# Current Behavior + +Please provide a detailed written description of what `llama-cpp-python` did, instead. + +# Environment and Context + +Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions. + +* Physical (or virtual) hardware you are using, e.g. for Linux: + +`$ lscpu` + +* Operating System, e.g. for Linux: + +`$ uname -a` + +* SDK version, e.g. for Linux: + +``` +$ python3 --version +$ make --version +$ g++ --version +``` + +# Failure Information (for bugs) + +Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template. + +# Steps to Reproduce + +Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better. + +1. step 1 +2. step 2 +3. step 3 +4. etc. + +**Note: Many issues seem to be regarding performance issues / differences with `llama.cpp`. In these cases we need to confirm that you're comparing against the version of `llama.cpp` that was built with your python package, and which parameters you're passing to the context.** + +# Failure Logs + +Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes. + +Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability. 
+ +Example environment info: +``` +llama-cpp-python$ git log | head -1 +commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2 + +llama-cpp-python$ python3 --version +Python 3.10.10 + +llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette" +fastapi 0.95.0 +sse-starlette 1.3.3 +uvicorn 0.21.1 +``` diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..bbcbbe7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. From 7c3743fe5f2781a8aab9ba8e15f4d250963747cf Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 00:12:47 -0400 Subject: [PATCH 24/53] Update llama.cpp --- llama_cpp/llama.py | 4 ++++ llama_cpp/llama_cpp.py | 8 ++++---- vendor/llama.cpp | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index fc91ea4..0db5c10 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -297,21 +297,25 @@ class Llama: ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), k=top_k, + min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_tail_free( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), z=llama_cpp.c_float(1.0), + min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_typical( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), p=llama_cpp.c_float(1.0), + min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_top_p( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), p=top_p, + min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_temperature( ctx=self.ctx, diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index ccec12c..527ed7c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -515,7 +515,7 @@ def llama_sample_top_k( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] k: c_int, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -534,7 +534,7 @@ def llama_sample_top_p( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] p: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -553,7 +553,7 @@ def llama_sample_tail_free( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] z: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -572,7 +572,7 @@ def llama_sample_typical( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] p: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2edbdb0..1b0fd45 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject 
commit 2edbdb0f99336cb41f0995061c7602ed54beb863 +Subproject commit 1b0fd454650ef4d68a980e3225488b79e6e9af25 From 397ae97f64bb235db5a773a63caaeea5b258a20c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 01:41:19 -0400 Subject: [PATCH 25/53] Update README --- README.md | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c46fa11..9daca60 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,10 @@ You can force the use of `cmake` on Linux / MacOS setting the `FORCE_CMAKE=1` en ## High-level API +The high-level API provides a simple managed interface through the `Llama` class. + +Below is a short example demonstrating how to use the high-level API to generate text: + ```python >>> from llama_cpp import Llama >>> llm = Llama(model_path="./models/7B/ggml-model.bin") @@ -90,8 +94,25 @@ docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml- ## Low-level API -The low-level API is a direct `ctypes` binding to the C API provided by `llama.cpp`. -The entire API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and should mirror [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). +The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`. +The entire lowe-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). + +Below is a short example demonstrating how to use the low-level API to tokenize a prompt: + +```python +>>> import llama_cpp +>>> import ctypes +>>> params = llama_cpp.llama_context_default_params() +# use bytes for char * params +>>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params) +>>> max_tokens = params.n_ctx +# use ctypes arrays for array params +>>> tokens = (llama_cppp.llama_token * int(max_tokens))() +>>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True)) +>>> llama_cpp.llama_free(ctx) +``` + +Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API. # Documentation From c382d8f86a628edec4427ac01687babb5c4aa35f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 02:00:22 -0400 Subject: [PATCH 26/53] Revert "llama_cpp server: mark model as required" This reverts commit e40fcb05754d0ec9c65359e245a436794cbfefdb. --- llama_cpp/server/app.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 595476f..0b7b1b2 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -149,8 +149,15 @@ class CreateCompletionRequest(BaseModel): description="The number of logprobs to generate. If None, no logprobs are generated." 
) - # ignored, but marked as required for the sake of compatibility with openai's api - model: str = model_field + # ignored or currently unsupported + model: Optional[str] = model_field + n: Optional[int] = 1 + logprobs: Optional[int] = Field(None) + presence_penalty: Optional[float] = 0 + frequency_penalty: Optional[float] = 0 + best_of: Optional[int] = 1 + logit_bias: Optional[Dict[str, float]] = Field(None) + user: Optional[str] = Field(None) # llama.cpp specific parameters top_k: int = top_k_field @@ -190,11 +197,11 @@ def create_completion( class CreateEmbeddingRequest(BaseModel): - # ignored, but marked as required for the sake of compatibility with openai's api - model: str = model_field + model: Optional[str] = model_field input: str = Field( description="The input to embed." ) + user: Optional[str] class Config: schema_extra = { @@ -235,8 +242,13 @@ class CreateChatCompletionRequest(BaseModel): stop: Optional[List[str]] = stop_field stream: bool = stream_field - # ignored, but marked as required for the sake of compatibility with openai's api - model: str = model_field + # ignored or currently unsupported + model: Optional[str] = model_field + n: Optional[int] = 1 + presence_penalty: Optional[float] = 0 + frequency_penalty: Optional[float] = 0 + logit_bias: Optional[Dict[str, float]] = Field(None) + user: Optional[str] = Field(None) # llama.cpp specific parameters top_k: int = top_k_field From 86753976c4ce1289a784b7385f419f471f7e8a50 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 02:02:34 -0400 Subject: [PATCH 27/53] Revert "llama_cpp server: delete some ignored / unused parameters" This reverts commit b47b9549d57f146a00ee19cd7d2bb294111abb67. --- llama_cpp/llama_types.py | 2 ++ llama_cpp/server/app.py | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index b770a01..bfc7342 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -60,6 +60,8 @@ class Completion(TypedDict): class ChatCompletionMessage(TypedDict): role: Literal["assistant", "user", "system"] content: str + user: NotRequired[str] + class ChatCompletionChoice(TypedDict): index: int diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 0b7b1b2..ba2ca2f 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -185,7 +185,13 @@ def create_completion( completion_or_chunks = llama( **request.dict( exclude={ - "model" + "model", + "n", + "frequency_penalty", + "presence_penalty", + "best_of", + "logit_bias", + "user", } ) ) @@ -221,7 +227,7 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): - return llama.create_embedding(**request.dict(exclude={"model"})) + return llama.create_embedding(**request.dict(exclude={"model", "user"})) class ChatCompletionRequestMessage(BaseModel): @@ -283,7 +289,12 @@ def create_chat_completion( completion_or_chunks = llama.create_chat_completion( **request.dict( exclude={ - "model" + "model", + "n", + "presence_penalty", + "frequency_penalty", + "logit_bias", + "user", } ), ) From 1a00e452ea1e82232ffc035647b1c56116ae62ea Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 02:52:20 -0400 Subject: [PATCH 28/53] Update settings fields and defaults --- llama_cpp/server/app.py | 94 ++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/llama_cpp/server/app.py 
b/llama_cpp/server/app.py index ba2ca2f..48dfc5e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -13,18 +13,41 @@ from sse_starlette.sse import EventSourceResponse class Settings(BaseSettings): - model: str - n_ctx: int = 2048 - n_batch: int = 512 - n_threads: int = max((os.cpu_count() or 2) // 2, 1) - f16_kv: bool = True - use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... - use_mmap: bool = True - embedding: bool = True - last_n_tokens_size: int = 64 - logits_all: bool = False - cache: bool = False # WARNING: This is an experimental feature - vocab_only: bool = False + model: str = Field( + description="The path to the model to use for generating completions." + ) + n_ctx: int = Field(default=2048, ge=1, description="The context size.") + n_batch: int = Field( + default=512, ge=1, description="The batch size to use per eval." + ) + n_threads: int = Field( + default=max((os.cpu_count() or 2) // 2, 1), + ge=1, + description="The number of threads to use.", + ) + f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") + use_mlock: bool = Field( + default=bool(llama_cpp.llama_mlock_supported().value), + description="Use mlock.", + ) + use_mmap: bool = Field( + default=bool(llama_cpp.llama_mmap_supported().value), + description="Use mmap.", + ) + embedding: bool = Field(default=True, description="Whether to use embeddings.") + last_n_tokens_size: int = Field( + default=64, + ge=0, + description="Last n tokens to keep for repeat penalty calculation.", + ) + logits_all: bool = Field(default=True, description="Whether to return logits.") + cache: bool = Field( + default=False, + description="Use a cache to reduce processing times for evaluated prompts.", + ) + vocab_only: bool = Field( + default=False, description="Whether to only return the vocabulary." + ) router = APIRouter() @@ -74,79 +97,75 @@ def get_llama(): with llama_lock: yield llama -model_field = Field( - description="The model to use for generating completions." -) + +model_field = Field(description="The model to use for generating completions.") max_tokens_field = Field( - default=16, - ge=1, - le=2048, - description="The maximum number of tokens to generate." + default=16, ge=1, le=2048, description="The maximum number of tokens to generate." ) temperature_field = Field( default=0.8, ge=0.0, le=2.0, - description="Adjust the randomness of the generated text.\n\n" + - "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run." + description="Adjust the randomness of the generated text.\n\n" + + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. 
At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.", ) top_p_field = Field( default=0.95, ge=0.0, le=1.0, - description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + - "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text." + description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.", ) stop_field = Field( default=None, - description="A list of tokens at which to stop generation. If None, no stop tokens are used." + description="A list of tokens at which to stop generation. If None, no stop tokens are used.", ) stream_field = Field( default=False, - description="Whether to stream the results as they are generated. Useful for chatbots." + description="Whether to stream the results as they are generated. Useful for chatbots.", ) top_k_field = Field( default=40, ge=0, - description="Limit the next token selection to the K most probable tokens.\n\n" + - "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text." + description="Limit the next token selection to the K most probable tokens.\n\n" + + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.", ) repeat_penalty_field = Field( default=1.0, ge=0.0, - description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + - "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient." 
+ description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", ) + class CreateCompletionRequest(BaseModel): prompt: Optional[str] = Field( - default="", - description="The prompt to generate completions for." + default="", description="The prompt to generate completions for." ) suffix: Optional[str] = Field( default=None, - description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots." + description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.", ) max_tokens: int = max_tokens_field temperature: float = temperature_field top_p: float = top_p_field echo: bool = Field( default=False, - description="Whether to echo the prompt in the generated text. Useful for chatbots." + description="Whether to echo the prompt in the generated text. Useful for chatbots.", ) stop: Optional[List[str]] = stop_field stream: bool = stream_field logprobs: Optional[int] = Field( default=None, ge=0, - description="The number of logprobs to generate. If None, no logprobs are generated." + description="The number of logprobs to generate. If None, no logprobs are generated.", ) # ignored or currently unsupported @@ -204,9 +223,7 @@ def create_completion( class CreateEmbeddingRequest(BaseModel): model: Optional[str] = model_field - input: str = Field( - description="The input to embed." - ) + input: str = Field(description="The input to embed.") user: Optional[str] class Config: @@ -239,8 +256,7 @@ class ChatCompletionRequestMessage(BaseModel): class CreateChatCompletionRequest(BaseModel): messages: List[ChatCompletionRequestMessage] = Field( - default=[], - description="A list of messages to generate completions for." + default=[], description="A list of messages to generate completions for." ) max_tokens: int = max_tokens_field temperature: float = temperature_field From 5a3413eee398ff36e0fb496a44e39d960f402a48 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 03:03:57 -0400 Subject: [PATCH 29/53] Update cpu_count --- llama_cpp/server/app.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 48dfc5e..dfb819c 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,8 +1,8 @@ -import os import json +import multiprocessing from threading import Lock from typing import List, Optional, Union, Iterator, Dict -from typing_extensions import TypedDict, Literal, Annotated +from typing_extensions import TypedDict, Literal import llama_cpp @@ -21,7 +21,7 @@ class Settings(BaseSettings): default=512, ge=1, description="The batch size to use per eval." 
) n_threads: int = Field( - default=max((os.cpu_count() or 2) // 2, 1), + default=max(multiprocessing.cpu_count() // 2, 1), ge=1, description="The number of threads to use.", ) From 3fbda717904080ec0286fc13488e3318e3dec75e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 03:04:22 -0400 Subject: [PATCH 30/53] Fix mlock_supported and mmap_supported return type --- llama_cpp/llama_cpp.py | 12 ++++++++---- llama_cpp/server/app.py | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 527ed7c..bce0fd7 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -157,7 +157,7 @@ _lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params -def llama_mmap_supported() -> c_bool: +def llama_mmap_supported() -> bool: return _lib.llama_mmap_supported() @@ -165,7 +165,7 @@ _lib.llama_mmap_supported.argtypes = [] _lib.llama_mmap_supported.restype = c_bool -def llama_mlock_supported() -> c_bool: +def llama_mlock_supported() -> bool: return _lib.llama_mlock_supported() @@ -387,7 +387,9 @@ _lib.llama_n_embd.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab -def llama_get_logits(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore +def llama_get_logits( + ctx: llama_context_p, +): # type: (...) -> Array[float] # type: ignore return _lib.llama_get_logits(ctx) @@ -397,7 +399,9 @@ _lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input # shape: [n_embd] (1-dimensional) -def llama_get_embeddings(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore +def llama_get_embeddings( + ctx: llama_context_p, +): # type: (...) 
-> Array[float] # type: ignore return _lib.llama_get_embeddings(ctx) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index dfb819c..3e45684 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -27,11 +27,11 @@ class Settings(BaseSettings): ) f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") use_mlock: bool = Field( - default=bool(llama_cpp.llama_mlock_supported().value), + default=llama_cpp.llama_mlock_supported(), description="Use mlock.", ) use_mmap: bool = Field( - default=bool(llama_cpp.llama_mmap_supported().value), + default=llama_cpp.llama_mmap_supported(), description="Use mmap.", ) embedding: bool = Field(default=True, description="Whether to use embeddings.") From 5f43c553d59f5ee8ca6bea3044d50ba40bc8b426 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 03:29:33 -0400 Subject: [PATCH 31/53] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a11faef..f6d1e9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.43" +version = "0.1.44" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 405886a..020d236 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.43", + version="0.1.44", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 627811ea837f6f3b108d916a5ae802111d0f0690 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 05:09:10 -0400 Subject: [PATCH 32/53] Add verbose flag to server --- llama_cpp/server/app.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 3e45684..f46f920 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -48,6 +48,9 @@ class Settings(BaseSettings): vocab_only: bool = Field( default=False, description="Whether to only return the vocabulary." ) + verbose: bool = Field( + default=True, description="Whether to print debug information." + ) router = APIRouter() @@ -83,6 +86,7 @@ def create_app(settings: Optional[Settings] = None): n_ctx=settings.n_ctx, last_n_tokens_size=settings.last_n_tokens_size, vocab_only=settings.vocab_only, + verbose=settings.verbose, ) if settings.cache: cache = llama_cpp.LlamaCache() From 3adc8fb3ae887d385b4a884814f9055c7165f168 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 05:10:52 -0400 Subject: [PATCH 33/53] Update README to use cli options for server --- README.md | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/README.md b/README.md index 9daca60..9fa3bed 100644 --- a/README.md +++ b/README.md @@ -68,18 +68,9 @@ This allows you to use llama.cpp compatible models with any OpenAI compatible cl To install the server package and get started: -Linux/MacOS ```bash pip install llama-cpp-python[server] -export MODEL=./models/7B/ggml-model.bin -python3 -m llama_cpp.server -``` - -Windows -```cmd -pip install llama-cpp-python[server] -SET MODEL=..\models\7B\ggml-model.bin -python3 -m llama_cpp.server +python3 -m llama_cpp.server --model models/7B/ggml-model.bin ``` Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. 
From 4f8cf52a38761f8cd611d3f65f07b6fe382445a9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 05:20:04 -0400 Subject: [PATCH 34/53] Update README --- README.md | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9fa3bed..b7772d9 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ This package provides: - OpenAI-like API - LangChain compatibility -## Installation +## Installation from PyPI (recommended) Install from PyPI (requires a c compiler): @@ -26,8 +26,30 @@ pip install llama-cpp-python The above command will attempt to install the package and build build `llama.cpp` from source. This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. -This method defaults to using `make` to build `llama.cpp` on Linux / MacOS and `cmake` on Windows. -You can force the use of `cmake` on Linux / MacOS setting the `FORCE_CMAKE=1` environment variable before installing. + +### Installation with OpenBLAS / cuBLAS / CLBlast + +`llama.cpp` supports multiple BLAS backends for faster processing. +Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend. + +To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing: + +```bash +LLAMA_OPENBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python +``` + +To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing: + +```bash +LLAMA_CUBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python +``` + +To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing: + +```bash +LLAMA_CLBLAST=1 FORCE_CMAKE=1 pip install llama-cpp-python +``` + ## High-level API From 2753b853212bfb81a3643b69eb666443ad03d494 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 13:19:56 -0400 Subject: [PATCH 35/53] Format --- llama_cpp/llama.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 0db5c10..6836ea5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -127,9 +127,7 @@ class Llama: self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) self.eval_tokens: Deque[llama_cpp.llama_token] = deque(maxlen=n_ctx) - self.eval_logits: Deque[List[float]] = deque( - maxlen=n_ctx if logits_all else 1 - ) + self.eval_logits: Deque[List[float]] = deque(maxlen=n_ctx if logits_all else 1) self.cache: Optional[LlamaCache] = None @@ -547,12 +545,6 @@ class Llama: finish_reason = "stop" break - if self.cache and len(completion_tokens) == 0: - if prompt_tokens not in self.cache: - if self.verbose: - print("Llama._create_completion: cache miss", file=sys.stderr) - self.cache[prompt_tokens] = self.save_state() - completion_tokens.append(token) all_text = self.detokenize(completion_tokens) @@ -611,6 +603,11 @@ class Llama: finish_reason = "length" break + if self.cache: + if self.verbose: + print("Llama._create_completion: cache save", file=sys.stderr) + self.cache[prompt_tokens + completion_tokens] = self.save_state() + if stream: yield { "id": completion_id, From 8dfde63255651f05e015df6dcfb614b2eac7c1f5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 19:30:14 -0400 Subject: [PATCH 36/53] Fix return type --- llama_cpp/llama_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py 
index bce0fd7..e60558c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -260,7 +260,7 @@ _lib.llama_get_state_size.restype = c_size_t # Returns the number of bytes copied def llama_copy_state_data( ctx: llama_context_p, dest # type: Array[c_uint8] -) -> c_size_t: +) -> int: return _lib.llama_copy_state_data(ctx, dest) @@ -272,7 +272,7 @@ _lib.llama_copy_state_data.restype = c_size_t # Returns the number of bytes read def llama_set_state_data( ctx: llama_context_p, src # type: Array[c_uint8] -) -> c_size_t: +) -> int: return _lib.llama_set_state_data(ctx, src) From 0e94a70de1727c8071d5802c34ad83a1fee987b0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 19:31:26 -0400 Subject: [PATCH 37/53] Add in-memory longest prefix cache. Closes #158 --- llama_cpp/llama.py | 91 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 27 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6836ea5..de06da0 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -5,7 +5,7 @@ import time import math import multiprocessing from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple -from collections import deque +from collections import deque, OrderedDict from . import llama_cpp from .llama_types import * @@ -14,37 +14,50 @@ from .llama_types import * class LlamaCache: """Cache for a llama.cpp model.""" - def __init__(self): - self.cache_state: Dict[Tuple[llama_cpp.llama_token, ...], "LlamaState"] = dict() + def __init__(self, capacity_bytes: int = (2 << 30)): + self.cache_state: OrderedDict[ + Tuple[llama_cpp.llama_token, ...], "LlamaState" + ] = OrderedDict() + self.capacity_bytes = capacity_bytes - def _sorted_keys(self) -> List[Tuple[llama_cpp.llama_token, ...]]: - return [ - key - for _, key in sorted( - ((len(key), key) for key in self.cache_state.keys()), reverse=True - ) - ] + @property + def cache_size(self): + return sum([state.llama_state_size for state in self.cache_state.values()]) - def _find_key( - self, key: Tuple[llama_cpp.llama_token, ...] + def _find_longest_prefix_key( + self, + key: Tuple[llama_cpp.llama_token, ...], ) -> Optional[Tuple[llama_cpp.llama_token, ...]]: - for k in self._sorted_keys(): - if key[: len(k)] == k: - return k - return None + min_len = 0 + min_key = None + keys = ( + (k, Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys() + ) + for k, prefix_len in keys: + if prefix_len > min_len: + min_len = prefix_len + min_key = k + return min_key def __getitem__(self, key: Sequence[llama_cpp.llama_token]) -> "LlamaState": - _key = self._find_key(tuple(key)) + key = tuple(key) + _key = self._find_longest_prefix_key(key) if _key is None: - raise KeyError(f"Key not found: {key}") - return self.cache_state[_key] + raise KeyError(f"Key not found") + value = self.cache_state[_key] + self.cache_state.move_to_end(_key) + return value def __contains__(self, key: Sequence[llama_cpp.llama_token]) -> bool: - return self._find_key(tuple(key)) is not None + return self._find_longest_prefix_key(tuple(key)) is not None def __setitem__(self, key: Sequence[llama_cpp.llama_token], value: "LlamaState"): - self.cache_state = dict() # NOTE: Currently limit to one cache entry. 
- self.cache_state[tuple(key)] = value + key = tuple(key) + if key in self.cache_state: + del self.cache_state[key] + self.cache_state[key] = value + while self.cache_size > self.capacity_bytes: + self.cache_state.popitem(last=False) class LlamaState: @@ -53,7 +66,7 @@ class LlamaState: eval_tokens: Deque[llama_cpp.llama_token], eval_logits: Deque[List[float]], llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] - llama_state_size: llama_cpp.c_size_t, + llama_state_size: int, ): self.eval_tokens = eval_tokens self.eval_logits = eval_logits @@ -526,10 +539,22 @@ class Llama: "logprobs is not supported for models created with logits_all=False" ) - if self.cache and prompt_tokens in self.cache: - if self.verbose: - print("Llama._create_completion: cache hit", file=sys.stderr) - self.load_state(self.cache[prompt_tokens]) + if self.cache: + try: + cache_item = self.cache[prompt_tokens] + cache_prefix_len = Llama.longest_token_prefix( + cache_item.eval_tokens, prompt_tokens + ) + eval_prefix_len = Llama.longest_token_prefix( + self.eval_tokens, prompt_tokens + ) + if cache_prefix_len > eval_prefix_len: + self.load_state(cache_item) + if self.verbose: + print("Llama._create_completion: cache hit", file=sys.stderr) + except KeyError: + if self.verbose: + print("Llama._create_completion: cache miss", file=sys.stderr) finish_reason = "length" multibyte_fix = 0 @@ -1004,3 +1029,15 @@ class Llama: exps = [math.exp(float(x)) for x in logits] sum_exps = sum(exps) return [math.log(x / sum_exps) for x in exps] + + @staticmethod + def longest_token_prefix( + a: Sequence[llama_cpp.llama_token], b: Sequence[llama_cpp.llama_token] + ): + longest_prefix = 0 + for _a, _b in zip(a, b): + if _a == _b: + longest_prefix += 1 + else: + break + return longest_prefix From 14da46f16e46dba2a6964c8d0d7ddbce388182e5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 19:33:17 -0400 Subject: [PATCH 38/53] Added cache size to settings object. --- llama_cpp/server/app.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index f46f920..e74d17d 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -45,6 +45,10 @@ class Settings(BaseSettings): default=False, description="Use a cache to reduce processing times for evaluated prompts.", ) + cache_size: int = Field( + default=2 << 30, + description="The size of the cache in bytes. Only used if cache is True.", + ) vocab_only: bool = Field( default=False, description="Whether to only return the vocabulary."
) @@ -89,7 +93,7 @@ def create_app(settings: Optional[Settings] = None): verbose=settings.verbose, ) if settings.cache: - cache = llama_cpp.LlamaCache() + cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) llama.set_cache(cache) return app From e72f58614b35ae3f995fd46897f2272d8f23362c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 20:01:34 -0400 Subject: [PATCH 39/53] Change pointer to lower overhead byref --- llama_cpp/llama.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index de06da0..41e6fd8 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -295,47 +295,47 @@ class Llama: ctx=self.ctx, last_tokens_data=last_n_tokens_data, last_tokens_size=last_n_tokens_size, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore penalty=repeat_penalty, ) if float(temp.value) == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore ) else: llama_cpp.llama_sample_top_k( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore k=top_k, min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_tail_free( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore z=llama_cpp.c_float(1.0), min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_typical( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore p=llama_cpp.c_float(1.0), min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_top_p( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore p=top_p, min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore temp=temp, ) return llama_cpp.llama_sample_token( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore ) def sample( From a0b61ea2a7c27660bc1421802c327b379a47a7d7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 20:17:52 -0400 Subject: [PATCH 40/53] Bugfix for models endpoint --- llama_cpp/server/app.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index e74d17d..b46914e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -357,7 +357,9 @@ GetModelResponse = create_model_from_typeddict(ModelList) @router.get("/v1/models", response_model=GetModelResponse) -def get_models() -> ModelList: +def get_models( + llama: llama_cpp.Llama = Depends(get_llama), +) -> ModelList: return { "object": "list", "data": [ From 75d8619b1a373a3900dbbdaf2fc7f71343ae312e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 20:19:34 -0400 Subject: [PATCH 41/53] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f6d1e9a..781d21b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.44" +version = "0.1.45" description = "Python bindings for 
the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 020d236..e2bc2da 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.44", + version="0.1.45", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 65d9cc050cb630a1d12f3874947b4729d1cbaab7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 01:30:18 -0400 Subject: [PATCH 42/53] Add openai frequency and presence penalty parameters. Closes #169 --- llama_cpp/llama.py | 38 ++++++++++++++++++++++++++++++++++++-- llama_cpp/server/app.py | 4 ---- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 41e6fd8..7b53112 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -261,7 +261,7 @@ class Llama: ] self.eval_logits.extend(logits) - def _sample_top_p_top_k( + def _sample( self, last_n_tokens_data, # type: llama_cpp.Array[llama_cpp.llama_token] last_n_tokens_size: llama_cpp.c_int, @@ -269,6 +269,8 @@ class Llama: top_p: llama_cpp.c_float, temp: llama_cpp.c_float, repeat_penalty: llama_cpp.c_float, + frequency_penalty: llama_cpp.c_float, + presence_penalty: llama_cpp.c_float, ): assert self.ctx is not None assert len(self.eval_logits) > 0 @@ -298,6 +300,14 @@ class Llama: candidates=llama_cpp.ctypes.byref(candidates), # type: ignore penalty=repeat_penalty, ) + llama_cpp.llama_sample_frequency_and_presence_penalties( + ctx=self.ctx, + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + last_tokens_data=last_n_tokens_data, + last_tokens_size=last_n_tokens_size, + alpha_frequency=frequency_penalty, + alpha_presence=presence_penalty, + ) if float(temp.value) == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, @@ -344,6 +354,8 @@ class Llama: top_p: float, temp: float, repeat_penalty: float, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, ): """Sample a token from the model. 
@@ -360,7 +372,7 @@ class Llama: last_n_tokens_data = [llama_cpp.llama_token(0)] * max( 0, self.last_n_tokens_size - len(self.eval_tokens) ) + list(self.eval_tokens)[-self.last_n_tokens_size :] - return self._sample_top_p_top_k( + return self._sample( last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)( *last_n_tokens_data ), @@ -369,6 +381,8 @@ class Llama: top_p=llama_cpp.c_float(top_p), temp=llama_cpp.c_float(temp), repeat_penalty=llama_cpp.c_float(repeat_penalty), + frequency_penalty=llama_cpp.c_float(frequency_penalty), + presence_penalty=llama_cpp.c_float(presence_penalty), ) def generate( @@ -378,6 +392,8 @@ class Llama: top_p: float, temp: float, repeat_penalty: float, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, reset: bool = True, ) -> Generator[ llama_cpp.llama_token, Optional[Sequence[llama_cpp.llama_token]], None @@ -431,6 +447,8 @@ class Llama: top_k=top_k, top_p=top_p, temp=temp, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, ) tokens_or_none = yield token @@ -505,6 +523,8 @@ class Llama: logprobs: Optional[int] = None, echo: bool = False, stop: Optional[List[str]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, @@ -563,6 +583,8 @@ class Llama: top_k=top_k, top_p=top_p, temp=temperature, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, ): if token == llama_cpp.llama_token_eos(): @@ -737,6 +759,8 @@ class Llama: logprobs: Optional[int] = None, echo: bool = False, stop: Optional[List[str]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, @@ -772,6 +796,8 @@ class Llama: logprobs=logprobs, echo=echo, stop=stop, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, @@ -792,6 +818,8 @@ class Llama: logprobs: Optional[int] = None, echo: bool = False, stop: Optional[List[str]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, repeat_penalty: float = 1.1, top_k: int = 40, stream: bool = False, @@ -827,6 +855,8 @@ class Llama: logprobs=logprobs, echo=echo, stop=stop, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, repeat_penalty=repeat_penalty, top_k=top_k, stream=stream, @@ -899,6 +929,8 @@ class Llama: stream: bool = False, stop: Optional[List[str]] = [], max_tokens: int = 256, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. 
@@ -932,6 +964,8 @@ class Llama: stream=stream, max_tokens=max_tokens, repeat_penalty=repeat_penalty, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, ) if stream: chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index b46914e..c9f2aef 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -214,8 +214,6 @@ def create_completion( exclude={ "model", "n", - "frequency_penalty", - "presence_penalty", "best_of", "logit_bias", "user", @@ -315,8 +313,6 @@ def create_chat_completion( exclude={ "model", "n", - "presence_penalty", - "frequency_penalty", "logit_bias", "user", } From 0d751a69a78c0a2f7b83c894d6a98ceec8daa680 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 01:50:43 -0400 Subject: [PATCH 43/53] Set repeat_penalty to 0 by default --- llama_cpp/server/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index c9f2aef..b459b80 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -146,7 +146,7 @@ top_k_field = Field( ) repeat_penalty_field = Field( - default=1.0, + default=0.0, ge=0.0, description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", From 2c0d9b182cd417338a85396660d9828070b3373f Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Mon, 8 May 2023 15:27:03 +0200 Subject: [PATCH 44/53] Fix session loading and saving in low level example chat --- .../low_level_api/low_level_api_chat_cpp.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 272b454..b86d723 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -112,16 +112,17 @@ specified) expect poor results""", file=sys.stderr) if (path.exists(self.params.path_session)): _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))() - _n_token_count_out = llama_cpp.c_int() + _n_token_count_out = llama_cpp.c_size_t() if (llama_cpp.llama_load_session_file( self.ctx, self.params.path_session.encode("utf8"), _session_tokens, self.params.n_ctx, ctypes.byref(_n_token_count_out) - ) != 0): + ) != 1): print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr) return + _n_token_count_out = _n_token_count_out.value self.session_tokens = _session_tokens[:_n_token_count_out] print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr) else: @@ -135,19 +136,21 @@ specified) expect poor results""", file=sys.stderr) raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})") # debug message about similarity of saved session, if applicable - n_matching_session_tokens = 0 + self.n_matching_session_tokens = 0 if len(self.session_tokens) > 0: for id in self.session_tokens: - if n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[n_matching_session_tokens]: + if 
self.n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[self.n_matching_session_tokens]: break - n_matching_session_tokens += 1 + self.n_matching_session_tokens += 1 - if n_matching_session_tokens >= len(self.embd_inp): + if self.n_matching_session_tokens >= len(self.embd_inp): print(f"session file has exact match for prompt!") - elif n_matching_session_tokens < (len(self.embd_inp) / 2): - print(f"warning: session file has low similarity to prompt ({n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") + elif self.n_matching_session_tokens < (len(self.embd_inp) / 2): + print(f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") else: - print(f"session file matches {n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") + print(f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") + + self.need_to_save_session = len(self.params.path_session) > 0 and self.n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) # number of tokens to keep when resetting context if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct): @@ -232,9 +235,6 @@ n_keep = {self.params.n_keep} """, file=sys.stderr) self.set_color(util.CONSOLE_COLOR_PROMPT) - self.need_to_save_session = len(self.params.path_session) > 0 and n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) - - # tokenize a prompt def _tokenize(self, prompt, bos=True): _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))() @@ -302,7 +302,7 @@ n_keep = {self.params.n_keep} ) != 0): raise Exception("Failed to llama_eval!") - if len(self.embd) > 0 and not len(self.params.path_session) > 0: + if len(self.embd) > 0 and len(self.params.path_session) > 0: self.session_tokens.extend(self.embd) self.n_session_consumed = len(self.session_tokens) @@ -319,7 +319,7 @@ n_keep = {self.params.n_keep} llama_cpp.llama_save_session_file( self.ctx, self.params.path_session.encode("utf8"), - self.session_tokens, + (llama_cpp.llama_token * len(self.session_tokens))(*self.session_tokens), len(self.session_tokens) ) From eaf9f19aa98fa93fb078f31c6f65ce176629f808 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Mon, 8 May 2023 15:27:42 +0200 Subject: [PATCH 45/53] Fix lora --- examples/low_level_api/low_level_api_chat_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index b86d723..8773cb1 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -76,8 +76,8 @@ specified) expect poor results""", file=sys.stderr) if (len(self.params.lora_adapter) > 0): if (llama_cpp.llama_apply_lora_from_file( self.ctx, - self.params.lora_adapter, - self.params.lora_base if len(self.params.lora_base) > 0 else None, + self.params.lora_adapter.encode("utf8"), + self.params.lora_base.encode("utf8") if len(self.params.lora_base) > 0 else None, self.params.n_threads ) != 0): print("error: failed to apply lora adapter") From 022e9ebcb82092f3a2df2d2812796b34f35c6e53 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 14:20:53 -0400 Subject: [PATCH 46/53] Use environment variable if parsed cli arg is None --- llama_cpp/server/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 5c9598a..57e0bf1 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -40,7 +40,7 @@ if __name__ == "__main__": ) args = parser.parse_args() - settings = Settings(**vars(args)) + settings = Settings(**{k: v for k, v in vars(args).items() if v is not None}) app = create_app(settings=settings) uvicorn.run( From 0d6c60097a1bab3f66f57bb20bfd7b513ffd0ff9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 14:21:15 -0400 Subject: [PATCH 47/53] Show default value when --help is called --- llama_cpp/server/__main__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 57e0bf1..18011e3 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -29,7 +29,9 @@ import uvicorn from llama_cpp.server.app import create_app, Settings if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) for name, field in Settings.__fields__.items(): parser.add_argument( f"--{name}", From 6d69461ef55cccffd2b4ad2635081b31e7be6654 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 14:21:47 -0400 Subject: [PATCH 48/53] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 781d21b..1eed653 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.45" +version = "0.1.46" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index e2bc2da..504515e 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.45", + version="0.1.46", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 29f094bbcf21f24d6bdb1a4ee95d3a501387f08f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 14:46:25 -0400 Subject: [PATCH 49/53] Bugfix: not falling back to environment variables when a default value is set.
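The fix relies on behaviour already present in this series: argparse leaves a flag that was not passed (and has no explicit default) as None, __main__ filters those None values out, and pydantic's BaseSettings then falls back to the environment variable or the field's own default. A minimal sketch of that interaction, assuming pydantic v1-style BaseSettings and an illustrative N_CTX example that is not part of the original patch:

```python
import argparse
import os

from pydantic import BaseSettings  # pydantic v1-style settings, as assumed here


class Settings(BaseSettings):
    n_ctx: int = 2048  # class default, used only if neither the CLI nor the environment provides a value


parser = argparse.ArgumentParser()
parser.add_argument("--n_ctx", type=int)  # no default=..., so an omitted flag parses to None

os.environ["N_CTX"] = "4096"
args = parser.parse_args([])  # simulate a run with no CLI flags
# Drop the None entries so they do not shadow the environment variable.
settings = Settings(**{k: v for k, v in vars(args).items() if v is not None})
assert settings.n_ctx == 4096  # the environment variable is picked up
```

Removing default=field.default from add_argument (this patch) restores exactly that path: an omitted flag no longer shadows the environment, while an explicitly passed flag still takes precedence.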
--- llama_cpp/server/__main__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 18011e3..4fe1d94 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -29,16 +29,16 @@ import uvicorn from llama_cpp.server.app import create_app, Settings if __name__ == "__main__": - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) + parser = argparse.ArgumentParser() for name, field in Settings.__fields__.items(): + description = field.field_info.description + if field.default is not None and description is not None: + description += f" (default: {field.default})" parser.add_argument( f"--{name}", dest=name, type=field.type_, - default=field.default, - help=field.field_info.description, + help=description, ) args = parser.parse_args() From a3cc7bf5b2d790d528d851db0dfb624a73953e6c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 14:46:50 -0400 Subject: [PATCH 50/53] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1eed653..24375ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.46" +version = "0.1.47" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 504515e..f965c0d 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.46", + version="0.1.47", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From b1489befda06ef15d224dc09fe8121e8c33ed1fe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 May 2023 21:04:42 +0000 Subject: [PATCH 51/53] Bump mkdocs-material from 9.1.9 to 9.1.11 Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.1.9 to 9.1.11. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.1.9...9.1.11) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 5b364a7..5474bf4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -773,14 +773,14 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.9" +version = "9.1.11" description = "Documentation that simply works" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.9-py3-none-any.whl", hash = "sha256:7db24261cb17400e132c46d17eea712bfe71056d892a9beba32cf68210297141"}, - {file = "mkdocs_material-9.1.9.tar.gz", hash = "sha256:74d8da1371ab3a326868fe47bae3cbc4aa22e93c048b4ca5117e6817b88bd734"}, + {file = "mkdocs_material-9.1.11-py3-none-any.whl", hash = "sha256:fbc86d50ec2cf34d40d5c4365780f290ceedde23f1a0704323b34e7f16b0c0dd"}, + {file = "mkdocs_material-9.1.11.tar.gz", hash = "sha256:f5d473eb79d6640a5e668d4b2ab5b9de5e76ae0a0e2d864112df0cfe9016dc1d"}, ] [package.dependencies] @@ -1439,4 +1439,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "e87403dcd0a0b8484436b02c392326adfaf22b8d7e182d77e4a155c67a7435bc" +content-hash = "6bea74d847b958639276d4be527c2b65dafeb0a455b6e3d1f29fee5171ce73b2" diff --git a/pyproject.toml b/pyproject.toml index 24375ef..5e12338 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.21.2"} -mkdocs-material = "^9.1.9" +mkdocs-material = "^9.1.11" pytest = "^7.3.1" httpx = "^0.24.0" From 82d138fe547b6013743f8b712d37097d5433176f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 18:49:11 -0400 Subject: [PATCH 52/53] Fix: default repeat_penalty --- llama_cpp/server/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index b459b80..621b73e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -146,7 +146,7 @@ top_k_field = Field( ) repeat_penalty_field = Field( - default=0.0, + default=1.1, ge=0.0, description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. 
A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", From c37883b477a8032c2b434ad6ce873694038c1b69 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 8 May 2023 18:49:37 -0400 Subject: [PATCH 53/53] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 24375ef..775fd49 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.47" +version = "0.1.48" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index f965c0d..f4cbb60 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.47", + version="0.1.48", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT",