commit 966f8cb64f
Author: baalajimaestro
Date: 2024-01-14 14:56:35 +05:30
Signed by: baalajimaestro (GPG key ID: F93C394FE9BBAFD5)

6 changed files with 115 additions and 28 deletions

CHANGELOG.md

@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+
+## [0.2.28]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@6efb8eb30e7025b168f3fda3ff83b9b386428ad6
+- feat: Add ability to pass in penalize_nl param by @shankinson in #1068
+- fix: print_grammar to stderr by @turian in #1052
 ## [0.2.27]
 - feat: Update llama.cpp to ggerganov/llama.cpp@b3a7c20b5c035250257d2b62851c379b159c899a

llama_cpp/__init__.py

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.27"
+__version__ = "0.2.28"

llama_cpp/llama.py

@@ -1201,6 +1201,7 @@ class Llama:
         mirostat_mode: int = 0,
         mirostat_tau: float = 5.0,
         mirostat_eta: float = 0.1,
+        penalize_nl: bool = True,
         logits_processor: Optional[LogitsProcessorList] = None,
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         grammar: Optional[LlamaGrammar] = None,
@@ -1261,6 +1262,7 @@ class Llama:
                 mirostat_eta=mirostat_eta,
                 logits_processor=logits_processor,
                 grammar=grammar,
+                penalize_nl=penalize_nl,
             )
             if stopping_criteria is not None and stopping_criteria(
                 self._input_ids, self._scores[-1, :]
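The new `penalize_nl` flag is threaded from `Llama.sample()` down into the low-level sampler. A minimal usage sketch, assuming a hypothetical local GGUF path and the existing tokenize/eval/sample/detokenize methods of the high-level `Llama` class:

```python
from llama_cpp import Llama

# Hypothetical local model path; any GGUF model works for this sketch.
llm = Llama(model_path="./models/model.gguf", n_ctx=512, verbose=False)

# Feed the prompt through the model once...
llm.eval(llm.tokenize(b"The three primary colors are"))

# ...then sample token by token, exempting newlines from the repetition
# penalty via the new penalize_nl flag added in this commit.
generated = []
for _ in range(16):
    token = llm.sample(repeat_penalty=1.1, penalize_nl=False)
    if token == llm.token_eos():
        break
    generated.append(token)
    llm.eval([token])

print(llm.detokenize(generated).decode("utf-8", errors="ignore"))
```

Setting `penalize_nl=False` keeps newline tokens out of the repetition penalty, which tends to preserve formatting in list- or code-heavy completions.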

llama_cpp/llama_cpp.py

@@ -112,8 +112,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E
 # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
 
-# define LLAMA_SESSION_VERSION 3
-LLAMA_SESSION_VERSION = 3
+# define LLAMA_SESSION_VERSION 4
+LLAMA_SESSION_VERSION = 4
 
 # struct llama_model;
@@ -179,6 +179,9 @@ LLAMA_TOKEN_TYPE_BYTE = 6
 # LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -199,6 +202,9 @@ LLAMA_FTYPE_MOSTLY_Q4_K_M = 15
 LLAMA_FTYPE_MOSTLY_Q5_K_S = 16
 LLAMA_FTYPE_MOSTLY_Q5_K_M = 17
 LLAMA_FTYPE_MOSTLY_Q6_K = 18
+LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19
+LLAMA_FTYPE_MOSTLY_IQ2_XS = 20
+LLAMA_FTYPE_MOSTLY_Q2_K_S = 21
 LLAMA_FTYPE_GUESSED = 1024
 
 # enum llama_rope_scaling_type {
@@ -214,6 +220,14 @@ LLAMA_ROPE_SCALING_LINEAR = 1
 LLAMA_ROPE_SCALING_YARN = 2
 LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN
 
+# enum llama_split_mode {
+# LLAMA_SPLIT_NONE = 0, // single GPU
+# LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
+# LLAMA_SPLIT_ROW = 2, // split rows across GPUs
+# };
+LLAMA_SPLIT_NONE = 0
+LLAMA_SPLIT_LAYER = 1
+LLAMA_SPLIT_ROW = 2
 
 # typedef struct llama_token_data {
 # llama_token id; // token id
@@ -227,6 +241,7 @@ class llama_token_data(Structure):
         id (llama_token): token id
         logit (float): log-odds of the token
         p (float): probability of the token"""
+
     _fields_ = [
         ("id", llama_token),
         ("logit", c_float),
@@ -249,6 +264,7 @@ class llama_token_data_array(Structure):
         data (ctypes.Array[llama_token_data]): token data
         size (int): size of the array
         sorted (bool): whether the array is sorted"""
+
     _fields_ = [
         ("data", llama_token_data_p),
         ("size", c_size_t),
@@ -303,7 +319,8 @@ class llama_batch(Structure):
         token (ctypes.Array[llama_token]): the token ids of the input (used when embd is NULL)
         embd (ctypes.Array[ctypes.c_float]): token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
         pos (ctypes.Array[ctypes.Array[llama_pos]]): the positions of the respective token in the sequence
-        seq_id (ctypes.Array[ctypes.Array[llama_seq_id]]): the sequence to which the respective token belongs"""
+        seq_id (ctypes.Array[ctypes.Array[llama_seq_id]]): the sequence to which the respective token belongs
+    """
 
     _fields_ = [
         ("n_tokens", c_int32),
@@ -318,6 +335,7 @@ class llama_batch(Structure):
         ("all_seq_id", llama_seq_id),
     ]
 
+
 # enum llama_model_kv_override_type {
 # LLAMA_KV_OVERRIDE_INT,
 # LLAMA_KV_OVERRIDE_FLOAT,
@@ -327,6 +345,7 @@ LLAMA_KV_OVERRIDE_INT = 0
 LLAMA_KV_OVERRIDE_FLOAT = 1
 LLAMA_KV_OVERRIDE_BOOL = 2
 
+
 # struct llama_model_kv_override {
 # char key[128];
 # enum llama_model_kv_override_type tag;
@@ -343,6 +362,7 @@ class llama_model_kv_override_value(CtypesUnion):
         ("bool_value", c_bool),
     ]
 
+
 class llama_model_kv_override(Structure):
     _fields_ = [
         ("key", ctypes.c_char * 128),
@@ -350,15 +370,25 @@ class llama_model_kv_override(Structure):
         ("value", llama_model_kv_override_value),
     ]
 
 # struct llama_model_params {
 # int32_t n_gpu_layers; // number of layers to store in VRAM
-# int32_t main_gpu; // the GPU that is used for scratch and small tensors
-# const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+# enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+# // main_gpu interpretation depends on split_mode:
+# // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+# // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+# // LLAMA_SPLIT_LAYER: ignored
+# int32_t main_gpu;
+# // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+# const float * tensor_split;
 
 # // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
 # // If the provided progress_callback returns true, model loading continues.
 # // If it returns false, model loading is immediately aborted.
 # llama_progress_callback progress_callback;
 
 # // context pointer passed to the progress callback
 # void * progress_callback_user_data;
@@ -375,16 +405,19 @@ class llama_model_params(Structure):
     Attributes:
         n_gpu_layers (int): number of layers to store in VRAM
-        main_gpu (int): the GPU that is used for scratch and small tensors
-        tensor_split (ctypes.Array[ctypes.c_float]): how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+        split_mode (int): how to split the model across multiple GPUs
+        main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
+        tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
         progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
         progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
         kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
         vocab_only (bool): only load the vocabulary, no weights
         use_mmap (bool): use mmap if possible
         use_mlock (bool): force system to keep model in RAM"""
 
     _fields_ = [
         ("n_gpu_layers", c_int32),
+        ("split_mode", c_int),
         ("main_gpu", c_int32),
         ("tensor_split", c_float_p),
         ("progress_callback", llama_progress_callback),
@@ -416,6 +449,7 @@ class llama_model_params(Structure):
 # enum ggml_type type_k; // data type for K cache
 # enum ggml_type type_v; // data type for V cache
+# // Keep the booleans together to avoid misalignment during copy-by-value.
 # bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
 # bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
@@ -444,7 +478,9 @@ class llama_context_params(Structure):
         mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embedding (bool): embedding mode only
-        offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU"""
+        offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
+    """
 
     _fields_ = [
         ("seed", c_uint32),
         ("n_ctx", c_uint32),
@@ -500,7 +536,9 @@ class llama_model_quantize_params(Structure):
         allow_requantize (bool): allow quantizing non-f32/f16 tensors
         quantize_output_tensor (bool): quantize output.weight
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        pure (bool): disable k-quant mixtures and quantize all tensors to the same type"""
+        pure (bool): disable k-quant mixtures and quantize all tensors to the same type
+    """
 
     _fields_ = [
         ("nthread", c_int32),
         ("ftype", c_int),
@@ -745,13 +783,16 @@ def llama_n_ctx(ctx: llama_context_p) -> int:
 _lib.llama_n_ctx.argtypes = [llama_context_p]
 _lib.llama_n_ctx.restype = c_uint32
 
+
 # LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
 def llama_n_batch(ctx: llama_context_p) -> int:
     return _lib.llama_n_batch(ctx)
 
+
 _lib.llama_n_batch.argtypes = [llama_context_p]
 _lib.llama_n_batch.restype = c_uint32
 
+
 # LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
 def llama_vocab_type(model: llama_model_p) -> int:
     return _lib.llama_vocab_type(model)
@@ -1251,6 +1292,40 @@ _lib.llama_kv_cache_seq_shift.argtypes = [
 ]
 _lib.llama_kv_cache_seq_shift.restype = None
 
+# // Integer division of the positions by factor of `d > 1`
+# // If the KV cache is RoPEd, the KV data is updated accordingly
+# // p0 < 0 : [0, p1]
+# // p1 < 0 : [p0, inf)
+# LLAMA_API void llama_kv_cache_seq_div(
+# struct llama_context * ctx,
+# llama_seq_id seq_id,
+# llama_pos p0,
+# llama_pos p1,
+# int d);
+def llama_kv_cache_seq_div(
+    ctx: llama_context_p,
+    seq_id: Union[llama_seq_id, int],
+    p0: Union[llama_pos, int],
+    p1: Union[llama_pos, int],
+    d: Union[c_int, int],
+):
+    """Integer division of the positions by factor of `d > 1`
+    If the KV cache is RoPEd, the KV data is updated accordingly
+    p0 < 0 : [0, p1]
+    p1 < 0 : [p0, inf)"""
+    return _lib.llama_kv_cache_seq_div(ctx, seq_id, p0, p1, d)
+
+
+_lib.llama_kv_cache_seq_div.argtypes = [
+    llama_context_p,
+    llama_seq_id,
+    llama_pos,
+    llama_pos,
+    c_int,
+]
+_lib.llama_kv_cache_seq_div.restype = None
+
 
 # //
 # // State / sessions
 # //
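`llama_kv_cache_seq_div` complements `llama_kv_cache_seq_shift`: instead of adding an offset, it divides cached positions, which is the building block llama.cpp uses for grouped/self-extend context tricks. A low-level sketch, assuming a hypothetical model path and that some tokens have already been decoded into sequence 0:

```python
import llama_cpp

llama_cpp.llama_backend_init(False)

mparams = llama_cpp.llama_model_default_params()
# Hypothetical model path.
model = llama_cpp.llama_load_model_from_file(b"./models/model.gguf", mparams)

cparams = llama_cpp.llama_context_default_params()
cparams.n_ctx = 4096
ctx = llama_cpp.llama_new_context_with_model(model, cparams)

# ... decode some tokens into sequence 0 here ...

# Divide the cached positions 0..2048 of sequence 0 by 2; if the cache is
# RoPEd, the stored K data is updated to match the new positions.
llama_cpp.llama_kv_cache_seq_div(ctx, 0, 0, 2048, 2)

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
llama_cpp.llama_backend_free()
```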
@@ -2066,7 +2141,8 @@ def llama_sample_temp(
     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-        temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text."""
+        temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    """
     return _lib.llama_sample_temp(ctx, candidates, temp)
@@ -2114,7 +2190,8 @@ def llama_sample_grammar(
     Parameters:
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-        grammar: A grammar object containing the rules and constraints to apply to the generated text."""
+        grammar: A grammar object containing the rules and constraints to apply to the generated text.
+    """
     return _lib.llama_sample_grammar(ctx, candidates, grammar)
@@ -2154,7 +2231,8 @@ def llama_sample_token_mirostat(
         tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
         eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
         m: The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
-        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal."""
+        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    """
     return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu)
@@ -2193,7 +2271,8 @@ def llama_sample_token_mirostat_v2(
         candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
         tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
         eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal."""
+        mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    """
     return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu)
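The mirostat docstrings above map directly onto the high-level sampling parameters. A hedged sketch using the `Llama` convenience API (hypothetical model path); `mu` itself is managed internally and starts at `2 * tau`:

```python
from llama_cpp import Llama

# Hypothetical model path.
llm = Llama(model_path="./models/model.gguf", n_ctx=2048, verbose=False)

# Mirostat v2 replaces fixed top-k/top-p truncation with a feedback loop that
# steers observed surprise toward tau, using eta as the update rate for mu.
out = llm.create_completion(
    "Explain the KV cache in one sentence:",
    max_tokens=64,
    mirostat_mode=2,   # 2 selects Mirostat 2.0
    mirostat_tau=5.0,  # target cross-entropy (surprise)
    mirostat_eta=0.1,  # learning rate for mu
)
print(out["choices"][0]["text"])
```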

llama_cpp/llama_grammar.py

@@ -72,7 +72,7 @@ class LlamaGrammar:
         )
         if verbose:
             print(f"{cls.from_string.__name__} grammar:", file=sys.stderr)
-            print_grammar(sys.stdout, parsed_grammar)
+            print_grammar(sys.stderr, parsed_grammar)
             print(file=sys.stderr)
         return cls(parsed_grammar)
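With this fix, verbose grammar-parsing diagnostics go to stderr, so stdout carries only generated text. A small sketch, assuming a hypothetical model path and a trivial GBNF grammar:

```python
from llama_cpp import Llama, LlamaGrammar

# A tiny GBNF grammar that only admits "yes" or "no". With verbose=True the
# parsed rules are printed via print_grammar, which now writes to stderr.
grammar = LlamaGrammar.from_string(r'root ::= "yes" | "no"', verbose=True)

# Hypothetical model path.
llm = Llama(model_path="./models/model.gguf", n_ctx=512, verbose=False)
out = llm.create_completion(
    "Is water wet? Answer with one word:",
    max_tokens=4,
    grammar=grammar,
)
print(out["choices"][0]["text"])  # clean stdout: just the constrained answer
```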

vendor/llama.cpp (vendored submodule)

@@ -1 +1 @@
-Subproject commit b3a7c20b5c035250257d2b62851c379b159c899a
+Subproject commit 76484fbfd355df388f71d6edaa98e1692a74de7e