From 4026166e682e6c080aeb50e00c4bb2312145f061 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Fri, 24 Nov 2023 03:24:19 -0500
Subject: [PATCH] docs: Update completion and chat_completion parameter docstrings

---
 llama_cpp/llama.py | 60 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 53 insertions(+), 7 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index a15bc7b..df08c16 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1863,13 +1863,27 @@ class Llama:
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
             max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
-            top_p: The top-p value to use for sampling.
+            top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+            typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
             echo: Whether to echo the prompt.
             stop: A list of strings to stop generation when encountered.
+            frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
+            presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
             repeat_penalty: The penalty to apply to repeated tokens.
-            top_k: The top-k value to use for sampling.
+            top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
             stream: Whether to stream the results.
+            seed: The seed to use for sampling.
+            tfs_z: The tail-free sampling parameter. Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+            mirostat_mode: The mirostat sampling mode.
+            mirostat_tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+            mirostat_eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+            model: The name to use for the model in the completion object.
+            stopping_criteria: A list of stopping criteria to use.
+            logits_processor: A list of logits processors to use.
+            grammar: A grammar to use for constrained sampling.
+            logit_bias: A logit bias to use.
 
         Raises:
             ValueError: If the requested tokens exceed the context window.
@@ -1944,15 +1958,29 @@ class Llama:
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
-            top_p: The top-p value to use for sampling.
+            top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+            typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
             echo: Whether to echo the prompt.
             stop: A list of strings to stop generation when encountered.
+            frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
+            presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
             repeat_penalty: The penalty to apply to repeated tokens.
-            top_k: The top-k value to use for sampling.
+            top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
             stream: Whether to stream the results.
+            seed: The seed to use for sampling.
+            tfs_z: The tail-free sampling parameter. Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+            mirostat_mode: The mirostat sampling mode.
+            mirostat_tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+            mirostat_eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+            model: The name to use for the model in the completion object.
+            stopping_criteria: A list of stopping criteria to use.
+            logits_processor: A list of logits processors to use.
+            grammar: A grammar to use for constrained sampling.
+            logit_bias: A logit bias to use.
 
         Raises:
             ValueError: If the requested tokens exceed the context window.
@@ -2024,13 +2052,31 @@ class Llama:
         Args:
             messages: A list of messages to generate a response for.
+            functions: A list of functions to use for the chat completion.
+            function_call: A function call to use for the chat completion.
+            tools: A list of tools to use for the chat completion.
+            tool_choice: A tool choice to use for the chat completion.
             temperature: The temperature to use for sampling.
-            top_p: The top-p value to use for sampling.
-            top_k: The top-k value to use for sampling.
+            top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+            top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+            typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
             stream: Whether to stream the results.
             stop: A list of strings to stop generation when encountered.
+            seed: The seed to use for sampling.
+            response_format: The response format to use for the chat completion. Use { "type": "json_object" } to constrain output to only valid JSON.
             max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
+            presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
+            frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
             repeat_penalty: The penalty to apply to repeated tokens.
+            tfs_z: The tail-free sampling parameter.
+            mirostat_mode: The mirostat sampling mode.
+            mirostat_tau: The mirostat sampling tau parameter.
+            mirostat_eta: The mirostat sampling eta parameter.
+            model: The name to use for the model in the completion object.
+            logits_processor: A list of logits processors to use.
+            grammar: A grammar to use.
+            logit_bias: A logit bias to use.
 
         Returns:
             Generated chat completion or a stream of chat completion chunks.
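
For context, a minimal usage sketch of the two methods whose docstrings this patch updates. The model path, prompt, and concrete sampling values below are placeholders chosen for illustration, not part of the change, and defaults may differ between llama-cpp-python releases.

# Illustrative sketch only: the GGUF path and the parameter values are placeholders.
from llama_cpp import Llama

llm = Llama(model_path="./models/example.gguf", n_ctx=2048)  # hypothetical local model file

# create_completion with the sampling parameters documented above.
completion = llm.create_completion(
    prompt="Q: Name the planets in the solar system. A:",
    max_tokens=64,          # <= 0 or None generates until n_ctx is exhausted
    temperature=0.7,
    top_p=0.95,             # nucleus sampling
    min_p=0.05,             # minimum-p sampling
    typical_p=1.0,          # locally typical sampling (1.0 disables it)
    top_k=40,
    repeat_penalty=1.1,
    presence_penalty=0.0,
    frequency_penalty=0.0,
    tfs_z=1.0,              # tail-free sampling (1.0 disables it)
    mirostat_mode=0,        # 0 disables mirostat; mirostat_tau/mirostat_eta apply when enabled
    seed=1337,
    stop=["Q:"],
    stream=False,
)
print(completion["choices"][0]["text"])

# create_chat_completion; response_format constrains output to valid JSON.
chat = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant that answers in JSON."},
        {"role": "user", "content": "List three planets."},
    ],
    response_format={"type": "json_object"},
    temperature=0.2,
    max_tokens=128,
)
print(chat["choices"][0]["message"]["content"])

Passing stream=True to either call would instead yield an iterator of chunks, matching the `stream` entries in the docstrings above.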