From 389e09c2f55a9ac659fd64fada56cf750a7a378b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Tue, 14 May 2024 15:44:09 +0200
Subject: [PATCH 1/2] misc: Remove unnecessary metadata lookups (#1448)

Special tokens are already mapped from metadata by llama.cpp
---
 llama_cpp/llama.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 4212669..281ee79 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -410,8 +410,8 @@ class Llama:
         if self.verbose:
             print(f"Model metadata: {self.metadata}", file=sys.stderr)
 
-        eos_token_id = int(self.metadata.get("tokenizer.ggml.eos_token_id", self.token_eos()))
-        bos_token_id = int(self.metadata.get("tokenizer.ggml.bos_token_id", self.token_bos()))
+        eos_token_id = self.token_eos()
+        bos_token_id = self.token_bos()
 
         eos_token = self._model.token_get_text(eos_token_id)
         bos_token = self._model.token_get_text(bos_token_id)
@@ -961,9 +961,9 @@ class Llama:
 
         completion_id: str = f"cmpl-{str(uuid.uuid4())}"
         created: int = int(time.time())
-        prefix_token_id: int = int(self.metadata.get("tokenizer.ggml.prefix_token_id", self._model.token_prefix()))
-        middle_token_id: int = int(self.metadata.get("tokenizer.ggml.middle_token_id", self._model.token_middle()))
-        suffix_token_id: int = int(self.metadata.get("tokenizer.ggml.suffix_token_id", self._model.token_suffix()))
+        prefix_token_id: int = self._model.token_prefix()
+        middle_token_id: int = self._model.token_middle()
+        suffix_token_id: int = self._model.token_suffix()
         # If prompt is empty, initialize completion with BOS token to avoid
         # detokenization including a space at the beginning of the completion
         completion_tokens: List[int] = [] if len(prompt) > 0 else [self.token_bos()]

From 5212fb08ae69a721b2ced4e5e8b96ce642219e16 Mon Sep 17 00:00:00 2001
From: twaka <twaka@users.noreply.github.com>
Date: Tue, 14 May 2024 22:50:53 +0900
Subject: [PATCH 2/2] feat: add MinTokensLogitsProcessor and min_tokens argument
 to server (#1333)

* implement min_tokens

* set default to 0

* pass min_tokens

* fix

* remove copy

* implement MinTokensLogitsProcessor

* format

* fix condition
---
 llama_cpp/llama.py        | 16 ++++++++++++++++
 llama_cpp/server/app.py   | 20 ++++++++++++++++++++
 llama_cpp/server/types.py |  8 ++++++++
 3 files changed, 44 insertions(+)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 281ee79..7145329 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -2084,3 +2084,19 @@ class StoppingCriteriaList(List[StoppingCriteria]):
         self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single]
     ) -> bool:
         return any([stopping_criteria(input_ids, logits) for stopping_criteria in self])
+
+
+class MinTokensLogitsProcessor(LogitsProcessor):
+    def __init__(self, min_tokens: int, token_eos: int):
+        self.min_tokens = min_tokens
+        self.token_eos = token_eos
+        self.prompt_tokens = None
+
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
+    ) -> npt.NDArray[np.single]:
+        if self.prompt_tokens is None:
+            self.prompt_tokens = len(input_ids)
+        if len(input_ids) - self.prompt_tokens < self.min_tokens:
+            scores[self.token_eos] = -np.inf
+        return scores
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 4cf10d1..4cda4af 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -275,6 +275,7 @@ async def create_completion(
         "best_of",
         "logit_bias_type",
         "user",
+        "min_tokens",
     }
     kwargs = body.model_dump(exclude=exclude)
 
@@ -288,6 +289,15 @@ async def create_completion(
     if body.grammar is not None:
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
 
+    if body.min_tokens > 0:
+        _min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
+            [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
+        )
+        if "logits_processor" not in kwargs:
+            kwargs["logits_processor"] = _min_tokens_logits_processor
+        else:
+            kwargs["logits_processor"].extend(_min_tokens_logits_processor)
+
     iterator_or_completion: Union[
         llama_cpp.CreateCompletionResponse,
         Iterator[llama_cpp.CreateCompletionStreamResponse],
@@ -445,6 +455,7 @@ async def create_chat_completion(
         "n",
         "logit_bias_type",
         "user",
+        "min_tokens",
     }
     kwargs = body.model_dump(exclude=exclude)
     llama = llama_proxy(body.model)
@@ -458,6 +469,15 @@ async def create_chat_completion(
     if body.grammar is not None:
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
 
+    if body.min_tokens > 0:
+        _min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
+            [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
+        )
+        if "logits_processor" not in kwargs:
+            kwargs["logits_processor"] = _min_tokens_logits_processor
+        else:
+            kwargs["logits_processor"].extend(_min_tokens_logits_processor)
+
     iterator_or_completion: Union[
         llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk]
     ] = await run_in_threadpool(llama.create_chat_completion, **kwargs)
diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py
index a20b394..a75f9e5 100644
--- a/llama_cpp/server/types.py
+++ b/llama_cpp/server/types.py
@@ -16,6 +16,12 @@ max_tokens_field = Field(
     default=16, ge=1, description="The maximum number of tokens to generate."
 )
 
+min_tokens_field = Field(
+    default=0,
+    ge=0,
+    description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).",
+)
+
 temperature_field = Field(
     default=0.8,
     description="Adjust the randomness of the generated text.\n\n"
@@ -111,6 +117,7 @@ class CreateCompletionRequest(BaseModel):
     max_tokens: Optional[int] = Field(
         default=16, ge=0, description="The maximum number of tokens to generate."
     )
+    min_tokens: int = min_tokens_field
     temperature: float = temperature_field
     top_p: float = top_p_field
     min_p: float = min_p_field
@@ -206,6 +213,7 @@ class CreateChatCompletionRequest(BaseModel):
         default=None,
         description="The maximum number of tokens to generate. Defaults to inf",
     )
+    min_tokens: int = min_tokens_field
     logprobs: Optional[bool] = Field(
         default=False,
         description="Whether to output the logprobs or not. Default is True"