docs: Update Llama docs
This commit is contained in:
parent f03a38e62a
commit 6308f21d5e
1 changed file with 15 additions and 11 deletions
@@ -798,17 +798,21 @@ class Llama:
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
-            seed: Random seed. -1 for random.
-            n_ctx: Context size.
-            n_batch: Batch size for prompt processing (must be >= 32 to use BLAS)
-            n_threads: Number of threads to use. If None, the number of threads is automatically determined.
-            n_threads_batch: Number of threads to use for batch processing. If None, use n_threads.
-            rope_scaling_type: Type of rope scaling to use.
-            rope_freq_base: Base frequency for rope sampling.
-            rope_freq_scale: Scale factor for rope sampling.
-            mul_mat_q: if true, use experimental mul_mat_q kernels
-            f16_kv: Use half-precision for key/value cache.
-            logits_all: Return logits for all tokens, not just the last token.
+            seed: RNG seed, -1 for random
+            n_ctx: Text context, 0 = from model
+            n_batch: Prompt processing maximum batch size
+            n_threads: Number of threads to use for generation
+            n_threads_batch: Number of threads to use for batch processing
+            rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
+            rope_freq_base: RoPE base frequency, 0 = from model
+            rope_freq_scale: RoPE frequency scaling factor, 0 = from model
+            yarn_ext_factor: YaRN extrapolation mix factor, negative = from model
+            yarn_attn_factor: YaRN magnitude scaling factor
+            yarn_beta_fast: YaRN low correction dim
+            yarn_beta_slow: YaRN high correction dim
+            yarn_orig_ctx: YaRN original context size
+            f16_kv: Use fp16 for KV cache, fp32 otherwise
+            logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
             embedding: Embedding mode only.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
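To make the updated parameter semantics concrete, here is a minimal usage sketch (not part of this commit) that constructs a Llama instance with several of the arguments documented above. It assumes the llama_cpp.Llama constructor of this revision; the model path is hypothetical, and only the documented keyword arguments are shown.

# Minimal usage sketch (not from the commit), assuming the Llama constructor
# of llama-cpp-python at this revision. The model path is hypothetical.
from llama_cpp import Llama

llm = Llama(
    model_path="./models/model.gguf",  # hypothetical local GGUF file
    n_ctx=0,               # 0 = take the text context size from the model
    n_batch=512,           # prompt processing maximum batch size
    n_threads=None,        # None = let the library pick a thread count
    n_threads_batch=None,  # None = reuse n_threads for batch processing
    seed=-1,               # -1 = random RNG seed
    rope_freq_base=0.0,    # 0 = take the RoPE base frequency from the model
    rope_freq_scale=0.0,   # 0 = take the RoPE frequency scale from the model
    logits_all=True,       # must be True for completions to return logprobs
)

# With logits_all=True, a completion call can report per-token logprobs.
output = llm("Q: Name the planets in the solar system. A:", max_tokens=32, logprobs=5)
print(output["choices"][0]["text"])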