From 5212fb08ae69a721b2ced4e5e8b96ce642219e16 Mon Sep 17 00:00:00 2001
From: twaka <twaka@users.noreply.github.com>
Date: Tue, 14 May 2024 22:50:53 +0900
Subject: [PATCH] feat: add MinTokensLogitProcessor and min_tokens argument to
 server (#1333)

* implement min_tokens

* set default to 0

* pass min_tokens

* fix

* remove copy

* implement MinTokensLogitsProcessor

* format

* fix condition
---
 llama_cpp/llama.py        | 16 ++++++++++++++++
 llama_cpp/server/app.py   | 20 ++++++++++++++++++++
 llama_cpp/server/types.py |  8 ++++++++
 3 files changed, 44 insertions(+)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 281ee79..7145329 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -2084,3 +2084,19 @@ class StoppingCriteriaList(List[StoppingCriteria]):
         self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single]
     ) -> bool:
         return any([stopping_criteria(input_ids, logits) for stopping_criteria in self])
+
+
+class MinTokensLogitsProcessor(LogitsProcessor):
+    def __init__(self, min_tokens: int, token_eos: int):
+        self.min_tokens = min_tokens
+        self.token_eos = token_eos
+        self.prompt_tokens = None
+
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
+    ) -> npt.NDArray[np.single]:
+        if self.prompt_tokens is None:
+            self.prompt_tokens = len(input_ids)
+        if len(input_ids) - self.prompt_tokens < self.min_tokens:
+            scores[self.token_eos] = -np.inf
+        return scores
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 4cf10d1..4cda4af 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -275,6 +275,7 @@ async def create_completion(
         "best_of",
         "logit_bias_type",
         "user",
+        "min_tokens",
     }
     kwargs = body.model_dump(exclude=exclude)
 
@@ -288,6 +289,15 @@ async def create_completion(
     if body.grammar is not None:
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
 
+    if body.min_tokens > 0:
+        _min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
+            [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
+        )
+        if "logits_processor" not in kwargs:
+            kwargs["logits_processor"] = _min_tokens_logits_processor
+        else:
+            kwargs["logits_processor"].extend(_min_tokens_logits_processor)
+
     iterator_or_completion: Union[
         llama_cpp.CreateCompletionResponse,
         Iterator[llama_cpp.CreateCompletionStreamResponse],
@@ -445,6 +455,7 @@ async def create_chat_completion(
         "n",
         "logit_bias_type",
         "user",
+        "min_tokens",
     }
     kwargs = body.model_dump(exclude=exclude)
     llama = llama_proxy(body.model)
@@ -458,6 +469,15 @@ async def create_chat_completion(
     if body.grammar is not None:
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
 
+    if body.min_tokens > 0:
+        _min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
+            [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
+        )
+        if "logits_processor" not in kwargs:
+            kwargs["logits_processor"] = _min_tokens_logits_processor
+        else:
+            kwargs["logits_processor"].extend(_min_tokens_logits_processor)
+
     iterator_or_completion: Union[
         llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk]
     ] = await run_in_threadpool(llama.create_chat_completion, **kwargs)
diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py
index a20b394..a75f9e5 100644
--- a/llama_cpp/server/types.py
+++ b/llama_cpp/server/types.py
@@ -16,6 +16,12 @@ max_tokens_field = Field(
     default=16, ge=1, description="The maximum number of tokens to generate."
 )
 
+min_tokens_field = Field(
+    default=0,
+    ge=0,
+    description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).",
+)
+
 temperature_field = Field(
     default=0.8,
     description="Adjust the randomness of the generated text.\n\n"
@@ -111,6 +117,7 @@ class CreateCompletionRequest(BaseModel):
     max_tokens: Optional[int] = Field(
         default=16, ge=0, description="The maximum number of tokens to generate."
     )
+    min_tokens: int = min_tokens_field
     temperature: float = temperature_field
     top_p: float = top_p_field
     min_p: float = min_p_field
@@ -206,6 +213,7 @@ class CreateChatCompletionRequest(BaseModel):
         default=None,
         description="The maximum number of tokens to generate. Defaults to inf",
     )
+    min_tokens: int = min_tokens_field
     logprobs: Optional[bool] = Field(
         default=False,
         description="Whether to output the logprobs or not. Default is True"