server: Support None defaulting to infinity for completions (#111)

* Support max_tokens=None defaulting to infinity (-1) for chat completions

* Check if completion_tokens is None in error handler.

* fix: max_tokens in create_completion should match OpenAI spec

* Fix __call__

---------

Co-authored-by: Andrei Betlen <abetlen@gmail.com>
Author: swg
Date: 2023-12-22 14:05:13 -05:00 (committed by GitHub)
Parent: 12b7f2f4e9
Commit: 4b01a873ef
3 changed files with 6 additions and 4 deletions
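
For context, a minimal usage sketch of the new behavior (the model path and prompt are illustrative, not from this commit): after this change, passing max_tokens=None to Llama generates until a stop condition instead of requiring an explicit cap.

    from llama_cpp import Llama

    llm = Llama(model_path="./models/7B/model.gguf")

    # max_tokens=None is now accepted and mapped to -1 internally, i.e.
    # generate until a stop sequence or the context window is exhausted.
    output = llm(
        "Q: Name the planets in the solar system. A:",
        max_tokens=None,
        stop=["Q:"],
    )
    print(output["choices"][0]["text"])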

llama_cpp/llama.py

@@ -1917,7 +1917,7 @@ class Llama:
         completion_or_chunks = self._create_completion(
             prompt=prompt,
             suffix=suffix,
-            max_tokens=max_tokens,
+            max_tokens=-1 if max_tokens is None else max_tokens,
             temperature=temperature,
             top_p=top_p,
             min_p=min_p,
@@ -1951,7 +1951,7 @@ class Llama:
         self,
         prompt: str,
         suffix: Optional[str] = None,
-        max_tokens: int = 128,
+        max_tokens: Optional[int] = 16,
         temperature: float = 0.8,
         top_p: float = 0.95,
         min_p: float = 0.05,
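
Both hunks implement the same sentinel convention: None at the Python API maps to -1 ("no limit") before reaching the sampler. A standalone sketch of the mapping (the helper name is hypothetical, not part of the codebase):

    from typing import Optional

    def resolve_max_tokens(max_tokens: Optional[int]) -> int:
        # None means "no limit", expressed as -1; explicit values pass through.
        return -1 if max_tokens is None else max_tokens

    assert resolve_max_tokens(None) == -1
    assert resolve_max_tokens(16) == 16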

llama_cpp/server/errors.py

@@ -72,7 +72,7 @@ class ErrorResponseFormatters:
         return 400, ErrorResponse(
             message=message.format(
                 context_window,
-                completion_tokens + prompt_tokens,
+                (completion_tokens or 0) + prompt_tokens,
                 prompt_tokens,
                 completion_tokens,
             ), # type: ignore
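
Since completion_tokens can now be None, the (completion_tokens or 0) guard keeps the token-count arithmetic from raising while the raw value is still reported as-is. A quick illustration with made-up numbers:

    prompt_tokens = 4000
    completion_tokens = None  # possible now that max_tokens may be omitted

    total = (completion_tokens or 0) + prompt_tokens  # 4000, no TypeError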

llama_cpp/server/types.py

@@ -110,7 +110,9 @@ class CreateCompletionRequest(BaseModel):
         default=None,
         description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.",
     )
-    max_tokens: int = max_tokens_field
+    max_tokens: Optional[int] = Field(
+        default=16, ge=0, description="The maximum number of tokens to generate."
+    )
     temperature: float = temperature_field
     top_p: float = top_p_field
     min_p: float = min_p_field
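
The replacement field matches the OpenAI completion spec: optional, defaulting to 16, and non-negative when given. A minimal self-contained sketch of the same validation (the model name here is an illustrative stand-in for CreateCompletionRequest):

    from typing import Optional
    from pydantic import BaseModel, Field

    class CompletionParams(BaseModel):
        max_tokens: Optional[int] = Field(
            default=16, ge=0, description="The maximum number of tokens to generate."
        )

    print(CompletionParams().max_tokens)                 # 16 (spec default)
    print(CompletionParams(max_tokens=None).max_tokens)  # None -> unlimited downstream
    # CompletionParams(max_tokens=-1) would raise a ValidationError (ge=0)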