diff --git a/CHANGELOG.md b/CHANGELOG.md index bcf1665..5061247 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.28] + +- feat: Update llama.cpp to ggerganov/llama.cpp@6efb8eb30e7025b168f3fda3ff83b9b386428ad6 +- feat: Add ability to pass in penalize_nl param by @shankinson in #1068 +- fix: print_grammar to stderr by @turian in #1052 + ## [0.2.27] - feat: Update llama.cpp to ggerganov/llama.cpp@b3a7c20b5c035250257d2b62851c379b159c899a diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index d3fe66b..33234fb 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.27" \ No newline at end of file +__version__ = "0.2.28" \ No newline at end of file diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 9178a22..7c819b0 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1201,6 +1201,7 @@ class Llama: mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, + penalize_nl: bool = True, logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, grammar: Optional[LlamaGrammar] = None, @@ -1261,6 +1262,7 @@ class Llama: mirostat_eta=mirostat_eta, logits_processor=logits_processor, grammar=grammar, + penalize_nl=penalize_nl, ) if stopping_criteria is not None and stopping_criteria( self._input_ids, self._scores[-1, :] diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 4aada53..989b67a 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -104,7 +104,7 @@ LLAMA_DEFAULT_SEED = 0xFFFFFFFF # define LLAMA_MAX_RNG_STATE (64*1024) LLAMA_MAX_RNG_STATE = 64 * 1024 -#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' +# define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' LLAMA_FILE_MAGIC_GGLA = 0x67676C61 # define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' @@ -112,8 +112,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN -# define LLAMA_SESSION_VERSION 3 -LLAMA_SESSION_VERSION = 3 +# define LLAMA_SESSION_VERSION 4 +LLAMA_SESSION_VERSION = 4 # struct llama_model; @@ -179,6 +179,9 @@ LLAMA_TOKEN_TYPE_BYTE = 6 # LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors # LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors # LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file # }; @@ -199,6 +202,9 @@ LLAMA_FTYPE_MOSTLY_Q4_K_M = 15 LLAMA_FTYPE_MOSTLY_Q5_K_S = 16 LLAMA_FTYPE_MOSTLY_Q5_K_M = 17 LLAMA_FTYPE_MOSTLY_Q6_K = 18 +LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19 +LLAMA_FTYPE_MOSTLY_IQ2_XS = 20 +LLAMA_FTYPE_MOSTLY_Q2_K_S = 21 LLAMA_FTYPE_GUESSED = 1024 # enum llama_rope_scaling_type { @@ -214,6 +220,14 @@ LLAMA_ROPE_SCALING_LINEAR = 1 LLAMA_ROPE_SCALING_YARN = 2 LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN +# enum llama_split_mode { +# LLAMA_SPLIT_NONE = 0, // single GPU +# LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs +# LLAMA_SPLIT_ROW = 2, // split rows across GPUs +# }; +LLAMA_SPLIT_NONE = 0 +LLAMA_SPLIT_LAYER = 1 +LLAMA_SPLIT_ROW = 2 # typedef struct llama_token_data { # llama_token id; // token id @@ -222,11 +236,12 @@ LLAMA_ROPE_SCALING_MAX_VALUE = 
LLAMA_ROPE_SCALING_YARN # } llama_token_data; class llama_token_data(Structure): """Used to store token data - + Attributes: id (llama_token): token id logit (float): log-odds of the token p (float): probability of the token""" + _fields_ = [ ("id", llama_token), ("logit", c_float), @@ -244,11 +259,12 @@ llama_token_data_p = POINTER(llama_token_data) # } llama_token_data_array; class llama_token_data_array(Structure): """Used to sample tokens given logits - + Attributes: data (ctypes.Array[llama_token_data]): token data size (int): size of the array sorted (bool): whether the array is sorted""" + _fields_ = [ ("data", llama_token_data_p), ("size", c_size_t), @@ -303,7 +319,8 @@ class llama_batch(Structure): token (ctypes.Array[llama_token]): the token ids of the input (used when embd is NULL) embd (ctypes.Array[ctypes.c_float]): token embeddings (i.e. float vector of size n_embd) (used when token is NULL) pos (ctypes.Array[ctypes.Array[llama_pos]]): the positions of the respective token in the sequence - seq_id (ctypes.Array[ctypes.Array[llama_seq_id]]): the sequence to which the respective token belongs""" + seq_id (ctypes.Array[ctypes.Array[llama_seq_id]]): the sequence to which the respective token belongs + """ _fields_ = [ ("n_tokens", c_int32), @@ -318,6 +335,7 @@ class llama_batch(Structure): ("all_seq_id", llama_seq_id), ] + # enum llama_model_kv_override_type { # LLAMA_KV_OVERRIDE_INT, # LLAMA_KV_OVERRIDE_FLOAT, @@ -327,6 +345,7 @@ LLAMA_KV_OVERRIDE_INT = 0 LLAMA_KV_OVERRIDE_FLOAT = 1 LLAMA_KV_OVERRIDE_BOOL = 2 + # struct llama_model_kv_override { # char key[128]; # enum llama_model_kv_override_type tag; @@ -343,6 +362,7 @@ class llama_model_kv_override_value(CtypesUnion): ("bool_value", c_bool), ] + class llama_model_kv_override(Structure): _fields_ = [ ("key", ctypes.c_char * 128), @@ -350,15 +370,25 @@ class llama_model_kv_override(Structure): ("value", llama_model_kv_override_value), ] + # struct llama_model_params { # int32_t n_gpu_layers; // number of layers to store in VRAM -# int32_t main_gpu; // the GPU that is used for scratch and small tensors -# const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) +# enum llama_split_mode split_mode; // how to split the model across multiple GPUs + +# // main_gpu interpretation depends on split_mode: +# // LLAMA_SPLIT_NONE: the GPU that is used for the entire model +# // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results +# // LLAMA_SPLIT_LAYER: ignored +# int32_t main_gpu; + +# // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES +# const float * tensor_split; # // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. # // If the provided progress_callback returns true, model loading continues. # // If it returns false, model loading is immediately aborted. # llama_progress_callback progress_callback; + # // context pointer passed to the progress callback # void * progress_callback_user_data; @@ -372,19 +402,22 @@ class llama_model_kv_override(Structure): # }; class llama_model_params(Structure): """Parameters for llama_model - + Attributes: n_gpu_layers (int): number of layers to store in VRAM - main_gpu (int): the GPU that is used for scratch and small tensors - tensor_split (ctypes.Array[ctypes.c_float]): how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) + split_mode (int): how to split the model across multiple GPUs + main_gpu (int): the GPU that is used for the entire model. 
main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored + tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted. progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data vocab_only (bool): only load the vocabulary, no weights use_mmap (bool): use mmap if possible use_mlock (bool): force system to keep model in RAM""" + _fields_ = [ ("n_gpu_layers", c_int32), + ("split_mode", c_int), ("main_gpu", c_int32), ("tensor_split", c_float_p), ("progress_callback", llama_progress_callback), @@ -416,6 +449,7 @@ class llama_model_params(Structure): # enum ggml_type type_k; // data type for K cache # enum ggml_type type_v; // data type for V cache + # // Keep the booleans together to avoid misalignment during copy-by-value. # bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) # bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) @@ -424,7 +458,7 @@ class llama_model_params(Structure): # }; class llama_context_params(Structure): """Parameters for llama_context - + Attributes: seed (int): RNG seed, -1 for random n_ctx (int): text context, 0 = from model @@ -444,7 +478,9 @@ class llama_context_params(Structure): mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true) logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) embedding (bool): embedding mode only - offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU""" + offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU + """ + _fields_ = [ ("seed", c_uint32), ("n_ctx", c_uint32), @@ -493,14 +529,16 @@ It might not exist for progress report where '.' 
is output repeatedly.""" # } llama_model_quantize_params; class llama_model_quantize_params(Structure): """Parameters for llama_model_quantize - + Attributes: nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() ftype (int): quantize to this llama_ftype allow_requantize (bool): allow quantizing non-f32/f16 tensors quantize_output_tensor (bool): quantize output.weight only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored - pure (bool): disable k-quant mixtures and quantize all tensors to the same type""" + pure (bool): disable k-quant mixtures and quantize all tensors to the same type + """ + _fields_ = [ ("nthread", c_int32), ("ftype", c_int), @@ -745,13 +783,16 @@ def llama_n_ctx(ctx: llama_context_p) -> int: _lib.llama_n_ctx.argtypes = [llama_context_p] _lib.llama_n_ctx.restype = c_uint32 + # LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); def llama_n_batch(ctx: llama_context_p) -> int: return _lib.llama_n_batch(ctx) + _lib.llama_n_batch.argtypes = [llama_context_p] _lib.llama_n_batch.restype = c_uint32 + # LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); def llama_vocab_type(model: llama_model_p) -> int: return _lib.llama_vocab_type(model) @@ -1080,7 +1121,7 @@ _lib.llama_kv_cache_view_init.restype = llama_kv_cache_view # // Free a KV cache view. (use only for debugging purposes) # LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view); -def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]"): # type: ignore +def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]"): # type: ignore """Free a KV cache view. (use only for debugging purposes)""" return _lib.llama_kv_cache_view_free(view) @@ -1091,7 +1132,7 @@ _lib.llama_kv_cache_view_free.restype = None # // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) # LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view); -def llama_kv_cache_view_update(ctx: llama_context_p, view: "ctypes.pointer[llama_kv_cache_view]"): # type: ignore +def llama_kv_cache_view_update(ctx: llama_context_p, view: "ctypes.pointer[llama_kv_cache_view]"): # type: ignore """Update the KV cache view structure with the current state of the KV cache. 
(use only for debugging purposes)""" return _lib.llama_kv_cache_view_update(ctx, view) @@ -1251,6 +1292,40 @@ _lib.llama_kv_cache_seq_shift.argtypes = [ ] _lib.llama_kv_cache_seq_shift.restype = None + +# // Integer division of the positions by factor of `d > 1` +# // If the KV cache is RoPEd, the KV data is updated accordingly +# // p0 < 0 : [0, p1] +# // p1 < 0 : [p0, inf) +# LLAMA_API void llama_kv_cache_seq_div( +# struct llama_context * ctx, +# llama_seq_id seq_id, +# llama_pos p0, +# llama_pos p1, +# int d); +def llama_kv_cache_seq_div( + ctx: llama_context_p, + seq_id: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], + d: Union[c_int, int], +): + """Integer division of the positions by factor of `d > 1` + If the KV cache is RoPEd, the KV data is updated accordingly + p0 < 0 : [0, p1] + p1 < 0 : [p0, inf)""" + return _lib.llama_kv_cache_seq_div(ctx, seq_id, p0, p1, d) + + +_lib.llama_kv_cache_seq_div.argtypes = [ + llama_context_p, + llama_seq_id, + llama_pos, + llama_pos, + c_int, +] +_lib.llama_kv_cache_seq_div.restype = None + # // # // State / sessions # // @@ -2063,10 +2138,11 @@ def llama_sample_temp( temp: Union[c_float, float], ): """Temperature sampling described in academic paper "Generating Long Sequences with Sparse Transformers" https://arxiv.org/abs/1904.10509 - + Parameters: candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. - temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.""" + temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. + """ return _lib.llama_sample_temp(ctx, candidates, temp) @@ -2111,10 +2187,11 @@ def llama_sample_grammar( grammar, # type: llama_grammar_p ): """Apply constraints from grammar - + Parameters: candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. - grammar: A grammar object containing the rules and constraints to apply to the generated text.""" + grammar: A grammar object containing the rules and constraints to apply to the generated text. + """ return _lib.llama_sample_grammar(ctx, candidates, grammar) @@ -2148,13 +2225,14 @@ def llama_sample_token_mirostat( mu, # type: _Pointer[c_float] ) -> int: """Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. - + Parameters: candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. m: The number of tokens considered in the estimation of `s_hat`. 
This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. - mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.""" + mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. + """ return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -2188,12 +2266,13 @@ def llama_sample_token_mirostat_v2( mu, # type: _Pointer[c_float] ) -> int: """Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. - + Parameters: candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.""" + mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. + """ return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py index aec023c..0c3b2e0 100644 --- a/llama_cpp/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -72,7 +72,7 @@ class LlamaGrammar: ) if verbose: print(f"{cls.from_string.__name__} grammar:", file=sys.stderr) - print_grammar(sys.stdout, parsed_grammar) + print_grammar(sys.stderr, parsed_grammar) print(file=sys.stderr) return cls(parsed_grammar) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b3a7c20..76484fb 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b3a7c20b5c035250257d2b62851c379b159c899a +Subproject commit 76484fbfd355df388f71d6edaa98e1692a74de7e
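
Usage sketch for the `penalize_nl` parameter added above (#1068): the snippet below assumes a local GGUF model at a placeholder path and otherwise uses only the existing high-level `Llama` API; the prompt and sampling settings are illustrative, not part of this change.

from llama_cpp import Llama

# Placeholder model path -- substitute any local GGUF model (assumption, not shipped with the package).
llm = Llama(model_path="./models/model.gguf", n_ctx=2048, verbose=False)

prompt_tokens = llm.tokenize(b"Q: Name three planets.\nA:")

# New in 0.2.28: penalize_nl is forwarded to the sampler, controlling whether
# newline tokens are subject to the repeat penalty.
for token in llm.generate(
    prompt_tokens,
    temp=0.7,
    repeat_penalty=1.1,
    penalize_nl=False,  # leave newline tokens unpenalized in this sketch
):
    if token == llm.token_eos():
        break
    print(llm.detokenize([token]).decode("utf-8", errors="ignore"), end="")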