From 389e09c2f55a9ac659fd64fada56cf750a7a378b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 14 May 2024 15:44:09 +0200 Subject: [PATCH 1/2] misc: Remove unnecessary metadata lookups (#1448) Special tokens are already mapped from metadata by llama.cpp --- llama_cpp/llama.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4212669..281ee79 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -410,8 +410,8 @@ class Llama: if self.verbose: print(f"Model metadata: {self.metadata}", file=sys.stderr) - eos_token_id = int(self.metadata.get("tokenizer.ggml.eos_token_id", self.token_eos())) - bos_token_id = int(self.metadata.get("tokenizer.ggml.bos_token_id", self.token_bos())) + eos_token_id = self.token_eos() + bos_token_id = self.token_bos() eos_token = self._model.token_get_text(eos_token_id) bos_token = self._model.token_get_text(bos_token_id) @@ -961,9 +961,9 @@ class Llama: completion_id: str = f"cmpl-{str(uuid.uuid4())}" created: int = int(time.time()) - prefix_token_id: int = int(self.metadata.get("tokenizer.ggml.prefix_token_id", self._model.token_prefix())) - middle_token_id: int = int(self.metadata.get("tokenizer.ggml.middle_token_id", self._model.token_middle())) - suffix_token_id: int = int(self.metadata.get("tokenizer.ggml.suffix_token_id", self._model.token_suffix())) + prefix_token_id: int = self._model.token_prefix() + middle_token_id: int = self._model.token_middle() + suffix_token_id: int = self._model.token_suffix() # If prompt is empty, initialize completion with BOS token to avoid # detokenization including a space at the beginning of the completion completion_tokens: List[int] = [] if len(prompt) > 0 else [self.token_bos()] From 5212fb08ae69a721b2ced4e5e8b96ce642219e16 Mon Sep 17 00:00:00 2001 From: twaka Date: Tue, 14 May 2024 22:50:53 +0900 Subject: [PATCH 2/2] feat: add MinTokensLogitsProcessor and min_tokens argument to server 
(#1333) * implement min_tokens * set default to 0 * pass min_tokens * fix * remove copy * implement MinTokensLogitsProcessor * format * fix condition --- llama_cpp/llama.py | 16 ++++++++++++++++ llama_cpp/server/app.py | 20 ++++++++++++++++++++ llama_cpp/server/types.py | 8 ++++++++ 3 files changed, 44 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 281ee79..7145329 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2084,3 +2084,19 @@ class StoppingCriteriaList(List[StoppingCriteria]): self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single] ) -> bool: return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) + + +class MinTokensLogitsProcessor(LogitsProcessor): + def __init__(self, min_tokens: int, token_eos: int): + self.min_tokens = min_tokens + self.token_eos = token_eos + self.prompt_tokens = None + + def __call__( + self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single] + ) -> npt.NDArray[np.single]: + if self.prompt_tokens is None: + self.prompt_tokens = len(input_ids) + if len(input_ids) - self.prompt_tokens < self.min_tokens: + scores[self.token_eos] = -np.inf + return scores diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 4cf10d1..4cda4af 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -275,6 +275,7 @@ async def create_completion( "best_of", "logit_bias_type", "user", + "min_tokens", } kwargs = body.model_dump(exclude=exclude) @@ -288,6 +289,15 @@ async def create_completion( if body.grammar is not None: kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) + if body.min_tokens > 0: + _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( + [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] + ) + if "logits_processor" not in kwargs: + kwargs["logits_processor"] = _min_tokens_logits_processor + else: + kwargs["logits_processor"].extend(_min_tokens_logits_processor) + 
iterator_or_completion: Union[ llama_cpp.CreateCompletionResponse, Iterator[llama_cpp.CreateCompletionStreamResponse], @@ -445,6 +455,7 @@ async def create_chat_completion( "n", "logit_bias_type", "user", + "min_tokens", } kwargs = body.model_dump(exclude=exclude) llama = llama_proxy(body.model) @@ -458,6 +469,15 @@ async def create_chat_completion( if body.grammar is not None: kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) + if body.min_tokens > 0: + _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( + [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] + ) + if "logits_processor" not in kwargs: + kwargs["logits_processor"] = _min_tokens_logits_processor + else: + kwargs["logits_processor"].extend(_min_tokens_logits_processor) + iterator_or_completion: Union[ llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk] ] = await run_in_threadpool(llama.create_chat_completion, **kwargs) diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index a20b394..a75f9e5 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -16,6 +16,12 @@ max_tokens_field = Field( default=16, ge=1, description="The maximum number of tokens to generate." ) +min_tokens_field = Field( + default=0, + ge=0, + description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).", +) + temperature_field = Field( default=0.8, description="Adjust the randomness of the generated text.\n\n" @@ -111,6 +117,7 @@ class CreateCompletionRequest(BaseModel): max_tokens: Optional[int] = Field( default=16, ge=0, description="The maximum number of tokens to generate." ) + min_tokens: int = min_tokens_field temperature: float = temperature_field top_p: float = top_p_field min_p: float = min_p_field @@ -206,6 +213,7 @@ class CreateChatCompletionRequest(BaseModel): default=None, description="The maximum number of tokens to generate. 
Defaults to inf", ) + min_tokens: int = min_tokens_field logprobs: Optional[bool] = Field( default=False, description="Whether to output the logprobs or not. Default is True"