Add unlimited max_tokens
This commit is contained in:
parent
a5554a2f02
commit
90e1021154
1 changed file with 12 additions and 4 deletions
@@ -317,7 +317,15 @@ class Llama:
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)
 
-        if len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
+        if max_tokens <= 0:
+            # Unlimited, depending on n_ctx.
+            if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)):
+                raise ValueError(
+                    f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
+                )
+            else:
+                max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens)
+        elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
             raise ValueError(
                 f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
             )
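The new branch turns a non-positive max_tokens into "fill whatever context remains". A minimal sketch of that arithmetic, pulled out into a standalone helper (the function name and the plain-integer n_ctx are illustrative assumptions, not part of the commit):

# Sketch of the commit's max_tokens resolution, with hypothetical values
# instead of a real llama_cpp context.
def resolve_max_tokens(prompt_len: int, max_tokens: int, n_ctx: int) -> int:
    if max_tokens <= 0:
        # Unlimited, depending on n_ctx: use all remaining context.
        if prompt_len >= n_ctx:
            raise ValueError(f"Requested tokens exceed context window of {n_ctx}")
        return n_ctx - prompt_len
    if prompt_len + max_tokens > n_ctx:
        raise ValueError(f"Requested tokens exceed context window of {n_ctx}")
    return max_tokens

assert resolve_max_tokens(prompt_len=100, max_tokens=0, n_ctx=512) == 412
assert resolve_max_tokens(prompt_len=100, max_tokens=50, n_ctx=512) == 50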
@@ -455,7 +463,7 @@ class Llama:
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
@@ -510,7 +518,7 @@ class Llama:
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
@@ -619,7 +627,7 @@ class Llama:
             top_k: The top-k value to use for sampling.
             stream: Whether to stream the results.
             stop: A list of strings to stop generation when encountered.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             repeat_penalty: The penalty to apply to repeated tokens.
 
         Returns:
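For callers, the practical effect is a usage sketch like the following (assuming the version's Llama.__call__ completion API; the model path is a placeholder):

from llama_cpp import Llama

# Placeholder model path; any compatible GGML model file works here.
llm = Llama(model_path="./models/ggml-model.bin", n_ctx=512)

# With this commit, max_tokens=0 means "generate until the context window
# is exhausted or a stop string is hit" rather than capping at a fixed count.
output = llm("Q: Name the planets in the solar system. A:", max_tokens=0, stop=["Q:"])
print(output["choices"][0]["text"])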