From 6308f21d5e76f483f73da1b5da81123cef9c160f Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sun, 26 Nov 2023 15:56:40 -0500
Subject: [PATCH] docs: Update Llama docs

---
 llama_cpp/llama.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index df08c16..3cb07e5 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -798,17 +798,21 @@ class Llama:
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
-            seed: Random seed. -1 for random.
-            n_ctx: Context size.
-            n_batch: Batch size for prompt processing (must be >= 32 to use BLAS)
-            n_threads: Number of threads to use. If None, the number of threads is automatically determined.
-            n_threads_batch: Number of threads to use for batch processing. If None, use n_threads.
-            rope_scaling_type: Type of rope scaling to use.
-            rope_freq_base: Base frequency for rope sampling.
-            rope_freq_scale: Scale factor for rope sampling.
-            mul_mat_q: if true, use experimental mul_mat_q kernels
-            f16_kv: Use half-precision for key/value cache.
-            logits_all: Return logits for all tokens, not just the last token.
+            seed: RNG seed, -1 for random
+            n_ctx: Text context, 0 = from model
+            n_batch: Prompt processing maximum batch size
+            n_threads: Number of threads to use for generation
+            n_threads_batch: Number of threads to use for batch processing
+            rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
+            rope_freq_base: RoPE base frequency, 0 = from model
+            rope_freq_scale: RoPE frequency scaling factor, 0 = from model
+            yarn_ext_factor: YaRN extrapolation mix factor, negative = from model
+            yarn_attn_factor: YaRN magnitude scaling factor
+            yarn_beta_fast: YaRN low correction dim
+            yarn_beta_slow: YaRN high correction dim
+            yarn_orig_ctx: YaRN original context size
+            f16_kv: Use fp16 for KV cache, fp32 otherwise
+            logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
             embedding: Embedding mode only.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
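
For reference, a minimal usage sketch of the parameters documented above, assuming llama-cpp-python is installed and a local GGUF model is available. The model path and prompt are placeholders, and any constructor arguments not shown keep their defaults.

    # Sketch only: model_path below is a placeholder, not part of the patch.
    from llama_cpp import Llama

    llm = Llama(
        model_path="./models/7B/model.gguf",  # placeholder path to a local GGUF model
        n_ctx=0,          # 0 = take the text context size from the model
        n_batch=512,      # prompt processing maximum batch size
        n_threads=None,   # None = let the library choose the generation threads
        seed=-1,          # -1 = random RNG seed
        logits_all=True,  # must be True for completions to return logprobs
    )

    # Request logprobs in the completion; this only works because logits_all=True.
    out = llm("Q: What is the capital of France? A:", max_tokens=16, logprobs=1)
    print(out["choices"][0]["text"])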