Compare commits


9 commits

Author             SHA1        Message                                             Date
                   966f8cb64f  Merge https://github.com/abetlen/llama-cpp-python   2024-01-14 14:56:35 +05:30
Andrei Betlen      7c898d5684  Update llama.cpp                                    2024-01-13 22:37:49 -05:00
Andrei Betlen      bb610b9428  Update llama.cpp                                    2024-01-11 22:51:12 -05:00
Andrei Betlen      f0159663d9  Bump version                                        2024-01-10 02:51:17 -05:00
Stephen Hankinson  df3be58d6c  Add ability to pass in penalize_nl param (#1068)    2024-01-10 02:46:27 -05:00
Joseph Turian      2ddce7294e  print_grammar to stderr (#1052)                     2024-01-10 02:46:03 -05:00
Andrei Betlen      431cb3ec81  Update llama.cpp                                    2024-01-09 15:32:39 -05:00
Andrei Betlen      1ae05c102b  Update llama.cpp                                    2024-01-08 14:51:29 -05:00
Andrei Betlen      142a9e1bc3  Update llama.cpp                                    2024-01-05 16:20:50 -05:00
6 changed files with 115 additions and 28 deletions

CHANGELOG.md

@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]

+## [0.2.28]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@6efb8eb30e7025b168f3fda3ff83b9b386428ad6
+- feat: Add ability to pass in penalize_nl param by @shankinson in #1068
+- fix: print_grammar to stderr by @turian in #1052
+
 ## [0.2.27]

 - feat: Update llama.cpp to ggerganov/llama.cpp@b3a7c20b5c035250257d2b62851c379b159c899a

llama_cpp/__init__.py

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.27"
+__version__ = "0.2.28"
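The only functional change above is the version bump. A quick way to confirm which build of the bindings is installed (a minimal sketch):

    import llama_cpp

    # Should print "0.2.28" once this compare is merged and installed.
    print(llama_cpp.__version__)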

llama_cpp/llama.py

@@ -1201,6 +1201,7 @@ class Llama:
         mirostat_mode: int = 0,
         mirostat_tau: float = 5.0,
         mirostat_eta: float = 0.1,
+        penalize_nl: bool = True,
         logits_processor: Optional[LogitsProcessorList] = None,
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         grammar: Optional[LlamaGrammar] = None,
@@ -1261,6 +1262,7 @@ class Llama:
                 mirostat_eta=mirostat_eta,
                 logits_processor=logits_processor,
                 grammar=grammar,
+                penalize_nl=penalize_nl,
             )
             if stopping_criteria is not None and stopping_criteria(
                 self._input_ids, self._scores[-1, :]
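This wires the new penalize_nl flag from #1068 through Llama.sample(). A minimal usage sketch of the low-level eval/sample loop, assuming a local GGUF model at a hypothetical path; passing penalize_nl=False exempts the newline token from the repeat penalty:

    from llama_cpp import Llama

    # Hypothetical model path; any local GGUF model works.
    llm = Llama(model_path="./models/model.gguf", n_ctx=2048)

    # Feed a prompt, then draw one token with the newline penalty disabled.
    llm.eval(llm.tokenize(b"Write a haiku about llamas:"))
    token = llm.sample(temp=0.8, repeat_penalty=1.1, penalize_nl=False)
    print(llm.detokenize([token]))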

llama_cpp/llama_cpp.py

@@ -104,7 +104,7 @@ LLAMA_DEFAULT_SEED = 0xFFFFFFFF
 # define LLAMA_MAX_RNG_STATE (64*1024)
 LLAMA_MAX_RNG_STATE = 64 * 1024

-#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
+# define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 LLAMA_FILE_MAGIC_GGLA = 0x67676C61

 # define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
@@ -112,8 +112,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E
 # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN

-# define LLAMA_SESSION_VERSION 3
-LLAMA_SESSION_VERSION = 3
+# define LLAMA_SESSION_VERSION 4
+LLAMA_SESSION_VERSION = 4

 # struct llama_model;
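LLAMA_SESSION_VERSION moving from 3 to 4 means state/session files saved with earlier builds will generally be rejected when restored with this one. The constant is exposed by the bindings, so it can be checked from Python:

    import llama_cpp

    # Session files written under format version 3 are not expected to load after this bump.
    print("session format version:", llama_cpp.LLAMA_SESSION_VERSION)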
@@ -179,6 +179,9 @@ LLAMA_TOKEN_TYPE_BYTE = 6
 # LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -199,6 +202,9 @@ LLAMA_FTYPE_MOSTLY_Q4_K_M = 15
 LLAMA_FTYPE_MOSTLY_Q5_K_S = 16
 LLAMA_FTYPE_MOSTLY_Q5_K_M = 17
 LLAMA_FTYPE_MOSTLY_Q6_K = 18
+LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19
+LLAMA_FTYPE_MOSTLY_IQ2_XS = 20
+LLAMA_FTYPE_MOSTLY_Q2_K_S = 21
 LLAMA_FTYPE_GUESSED = 1024

 # enum llama_rope_scaling_type {
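The new 2-bit quantization ftypes (IQ2_XXS, IQ2_XS, Q2_K_S) can be requested when quantizing through the low-level API. A hedged sketch with hypothetical file paths; how well a given source model quantizes to these types depends on the underlying llama.cpp build:

    import ctypes
    import llama_cpp

    params = llama_cpp.llama_model_quantize_default_params()
    params.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_XS
    params.nthread = 8

    # Hypothetical input/output paths; a return value of 0 indicates success.
    ret = llama_cpp.llama_model_quantize(
        b"./models/model-f16.gguf",
        b"./models/model-iq2_xs.gguf",
        ctypes.byref(params),
    )
    assert ret == 0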
@@ -214,6 +220,14 @@ LLAMA_ROPE_SCALING_LINEAR = 1
 LLAMA_ROPE_SCALING_YARN = 2
 LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN

+# enum llama_split_mode {
+# LLAMA_SPLIT_NONE = 0, // single GPU
+# LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
+# LLAMA_SPLIT_ROW = 2, // split rows across GPUs
+# };
+LLAMA_SPLIT_NONE = 0
+LLAMA_SPLIT_LAYER = 1
+LLAMA_SPLIT_ROW = 2

 # typedef struct llama_token_data {
 # llama_token id; // token id
@@ -222,11 +236,12 @@ LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN
 # } llama_token_data;
 class llama_token_data(Structure):
     """Used to store token data

     Attributes:
         id (llama_token): token id
         logit (float): log-odds of the token
         p (float): probability of the token"""
+
     _fields_ = [
         ("id", llama_token),
         ("logit", c_float),
@@ -244,11 +259,12 @@ llama_token_data_p = POINTER(llama_token_data)
 # } llama_token_data_array;
 class llama_token_data_array(Structure):
     """Used to sample tokens given logits

     Attributes:
         data (ctypes.Array[llama_token_data]): token data
         size (int): size of the array
         sorted (bool): whether the array is sorted"""
+
     _fields_ = [
         ("data", llama_token_data_p),
         ("size", c_size_t),
@@ -303,7 +319,8 @@ class llama_batch(Structure):
         token (ctypes.Array[llama_token]): the token ids of the input (used when embd is NULL)
         embd (ctypes.Array[ctypes.c_float]): token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
         pos (ctypes.Array[ctypes.Array[llama_pos]]): the positions of the respective token in the sequence
-        seq_id (ctypes.Array[ctypes.Array[llama_seq_id]]): the sequence to which the respective token belongs"""
+        seq_id (ctypes.Array[ctypes.Array[llama_seq_id]]): the sequence to which the respective token belongs
+    """

     _fields_ = [
         ("n_tokens", c_int32),
@@ -318,6 +335,7 @@ class llama_batch(Structure):
         ("all_seq_id", llama_seq_id),
     ]

+
 # enum llama_model_kv_override_type {
 # LLAMA_KV_OVERRIDE_INT,
 # LLAMA_KV_OVERRIDE_FLOAT,
@@ -327,6 +345,7 @@ LLAMA_KV_OVERRIDE_INT = 0
 LLAMA_KV_OVERRIDE_FLOAT = 1
 LLAMA_KV_OVERRIDE_BOOL = 2

+
 # struct llama_model_kv_override {
 # char key[128];
 # enum llama_model_kv_override_type tag;
@@ -343,6 +362,7 @@ class llama_model_kv_override_value(CtypesUnion):
         ("bool_value", c_bool),
     ]

+
 class llama_model_kv_override(Structure):
     _fields_ = [
         ("key", ctypes.c_char * 128),
@@ -350,15 +370,25 @@ class llama_model_kv_override(Structure):
         ("value", llama_model_kv_override_value),
     ]

+
 # struct llama_model_params {
 # int32_t n_gpu_layers; // number of layers to store in VRAM
-# int32_t main_gpu; // the GPU that is used for scratch and small tensors
-# const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+# enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
+# // main_gpu interpretation depends on split_mode:
+# // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+# // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+# // LLAMA_SPLIT_LAYER: ignored
+# int32_t main_gpu;
+
+# // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+# const float * tensor_split;

 # // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
 # // If the provided progress_callback returns true, model loading continues.
 # // If it returns false, model loading is immediately aborted.
 # llama_progress_callback progress_callback;

 # // context pointer passed to the progress callback
 # void * progress_callback_user_data;
@@ -372,19 +402,22 @@ class llama_model_kv_override(Structure):
 # };
 class llama_model_params(Structure):
     """Parameters for llama_model

     Attributes:
         n_gpu_layers (int): number of layers to store in VRAM
-        main_gpu (int): the GPU that is used for scratch and small tensors
-        tensor_split (ctypes.Array[ctypes.c_float]): how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+        split_mode (int): how to split the model across multiple GPUs
+        main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
+        tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
         progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
         progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
         kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
         vocab_only (bool): only load the vocabulary, no weights
         use_mmap (bool): use mmap if possible
         use_mlock (bool): force system to keep model in RAM"""

     _fields_ = [
         ("n_gpu_layers", c_int32),
+        ("split_mode", c_int),
         ("main_gpu", c_int32),
         ("tensor_split", c_float_p),
         ("progress_callback", llama_progress_callback),
@@ -416,6 +449,7 @@ class llama_model_params(Structure):
 # enum ggml_type type_k; // data type for K cache
 # enum ggml_type type_v; // data type for V cache

+# // Keep the booleans together to avoid misalignment during copy-by-value.
 # bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
 # bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
@@ -424,7 +458,7 @@ class llama_model_params(Structure):
 # };
 class llama_context_params(Structure):
     """Parameters for llama_context

     Attributes:
         seed (int): RNG seed, -1 for random
         n_ctx (int): text context, 0 = from model
@@ -444,7 +478,9 @@ class llama_context_params(Structure):
         mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embedding (bool): embedding mode only
-        offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU"""
+        offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
+    """
+
     _fields_ = [
         ("seed", c_uint32),
         ("n_ctx", c_uint32),
@@ -493,14 +529,16 @@ It might not exist for progress report where '.' is output repeatedly."""
 # } llama_model_quantize_params;
 class llama_model_quantize_params(Structure):
     """Parameters for llama_model_quantize

     Attributes:
         nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         ftype (int): quantize to this llama_ftype
         allow_requantize (bool): allow quantizing non-f32/f16 tensors
         quantize_output_tensor (bool): quantize output.weight
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        pure (bool): disable k-quant mixtures and quantize all tensors to the same type"""
+        pure (bool): disable k-quant mixtures and quantize all tensors to the same type
+    """
+
     _fields_ = [
         ("nthread", c_int32),
         ("ftype", c_int),
@ -745,13 +783,16 @@ def llama_n_ctx(ctx: llama_context_p) -> int:
_lib.llama_n_ctx.argtypes = [llama_context_p] _lib.llama_n_ctx.argtypes = [llama_context_p]
_lib.llama_n_ctx.restype = c_uint32 _lib.llama_n_ctx.restype = c_uint32
# LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); # LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
def llama_n_batch(ctx: llama_context_p) -> int: def llama_n_batch(ctx: llama_context_p) -> int:
return _lib.llama_n_batch(ctx) return _lib.llama_n_batch(ctx)
_lib.llama_n_batch.argtypes = [llama_context_p] _lib.llama_n_batch.argtypes = [llama_context_p]
_lib.llama_n_batch.restype = c_uint32 _lib.llama_n_batch.restype = c_uint32
# LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); # LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
def llama_vocab_type(model: llama_model_p) -> int: def llama_vocab_type(model: llama_model_p) -> int:
return _lib.llama_vocab_type(model) return _lib.llama_vocab_type(model)
@@ -1080,7 +1121,7 @@ _lib.llama_kv_cache_view_init.restype = llama_kv_cache_view
 # // Free a KV cache view. (use only for debugging purposes)
 # LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
 def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]"):  # type: ignore
     """Free a KV cache view. (use only for debugging purposes)"""
     return _lib.llama_kv_cache_view_free(view)
@@ -1091,7 +1132,7 @@ _lib.llama_kv_cache_view_free.restype = None
 # // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
 # LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
 def llama_kv_cache_view_update(ctx: llama_context_p, view: "ctypes.pointer[llama_kv_cache_view]"):  # type: ignore
     """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)"""
     return _lib.llama_kv_cache_view_update(ctx, view)
@@ -1251,6 +1292,40 @@ _lib.llama_kv_cache_seq_shift.argtypes = [
 ]
 _lib.llama_kv_cache_seq_shift.restype = None

+
+# // Integer division of the positions by factor of `d > 1`
+# // If the KV cache is RoPEd, the KV data is updated accordingly
+# // p0 < 0 : [0, p1]
+# // p1 < 0 : [p0, inf)
+# LLAMA_API void llama_kv_cache_seq_div(
+# struct llama_context * ctx,
+# llama_seq_id seq_id,
+# llama_pos p0,
+# llama_pos p1,
+# int d);
+def llama_kv_cache_seq_div(
+    ctx: llama_context_p,
+    seq_id: Union[llama_seq_id, int],
+    p0: Union[llama_pos, int],
+    p1: Union[llama_pos, int],
+    d: Union[c_int, int],
+):
+    """Integer division of the positions by factor of `d > 1`
+    If the KV cache is RoPEd, the KV data is updated accordingly
+    p0 < 0 : [0, p1]
+    p1 < 0 : [p0, inf)"""
+    return _lib.llama_kv_cache_seq_div(ctx, seq_id, p0, p1, d)
+
+
+_lib.llama_kv_cache_seq_div.argtypes = [
+    llama_context_p,
+    llama_seq_id,
+    llama_pos,
+    llama_pos,
+    c_int,
+]
+_lib.llama_kv_cache_seq_div.restype = None
+
+
 # //
 # // State / sessions
 # //
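llama_kv_cache_seq_div is the counterpart to llama_kv_cache_seq_shift: it compresses a range of cached positions by an integer factor, the primitive behind llama.cpp's grouped-attention / "self-extend" context extension. A sketch, assuming ctx is an existing llama_context_p whose KV cache already holds tokens for sequence 0:

    import llama_cpp

    # `ctx` is assumed to come from llama_new_context_with_model(...) with data
    # already decoded. Divide the cached positions of sequence 0 in [0, 1024) by 4;
    # if the cache is RoPEd, the KV data is updated to match the new positions.
    llama_cpp.llama_kv_cache_seq_div(ctx, 0, 0, 1024, 4)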
@@ -2063,10 +2138,11 @@ def llama_sample_temp(
     temp: Union[c_float, float],
 ):
     """Temperature sampling described in academic paper "Generating Long Sequences with Sparse Transformers" https://arxiv.org/abs/1904.10509

     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-        temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text."""
+        temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    """
     return _lib.llama_sample_temp(ctx, candidates, temp)
@@ -2111,10 +2187,11 @@ def llama_sample_grammar(
     grammar,  # type: llama_grammar_p
 ):
     """Apply constraints from grammar

     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-        grammar: A grammar object containing the rules and constraints to apply to the generated text."""
+        grammar: A grammar object containing the rules and constraints to apply to the generated text.
+    """
     return _lib.llama_sample_grammar(ctx, candidates, grammar)
@@ -2148,13 +2225,14 @@ def llama_sample_token_mirostat(
     mu,  # type: _Pointer[c_float]
 ) -> int:
     """Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.

     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
         tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
         eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
         m: The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
-        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal."""
+        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    """
     return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu)
@@ -2188,12 +2266,13 @@ def llama_sample_token_mirostat_v2(
     mu,  # type: _Pointer[c_float]
 ) -> int:
     """Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.

     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
         tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
         eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal."""
+        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    """
     return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu)
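As the docstring notes, mu is running state that the sampler updates in place and should start at twice tau. A low-level sketch of one Mirostat 2.0 sampling step, assuming ctx and candidates_p (a pointer to a filled llama_token_data_array) are left over from a previous decode:

    import ctypes
    import llama_cpp

    tau, eta = 5.0, 0.1
    mu = ctypes.c_float(2.0 * tau)  # initialize to twice the target cross-entropy

    # `ctx` and `candidates_p` are assumed to exist from an earlier eval/decode step.
    token = llama_cpp.llama_sample_token_mirostat_v2(
        ctx, candidates_p, tau, eta, ctypes.byref(mu)
    )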

llama_cpp/llama_grammar.py

@@ -72,7 +72,7 @@ class LlamaGrammar:
         )
         if verbose:
             print(f"{cls.from_string.__name__} grammar:", file=sys.stderr)
-            print_grammar(sys.stdout, parsed_grammar)
+            print_grammar(sys.stderr, parsed_grammar)
             print(file=sys.stderr)
         return cls(parsed_grammar)
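With the #1052 fix, the verbose grammar dump goes to stderr like the surrounding messages, so it no longer mixes into stdout when completions are piped. For example:

    from llama_cpp import LlamaGrammar

    grammar = LlamaGrammar.from_string(
        'root ::= "yes" | "no"',
        verbose=True,  # the parsed grammar is now printed to stderr, not stdout
    )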

vendor/llama.cpp (vendored)

@@ -1 +1 @@
-Subproject commit b3a7c20b5c035250257d2b62851c379b159c899a
+Subproject commit 76484fbfd355df388f71d6edaa98e1692a74de7e