|
|
|
@ -112,8 +112,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E
|
|
|
|
|
|
|
|
|
|
# define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
|
|
|
|
LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
|
|
|
|
|
# define LLAMA_SESSION_VERSION 4
|
|
|
|
|
LLAMA_SESSION_VERSION = 4
|
|
|
|
|
# define LLAMA_SESSION_VERSION 3
|
|
|
|
|
LLAMA_SESSION_VERSION = 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# struct llama_model;
|
|
|
|
@ -179,9 +179,6 @@ LLAMA_TOKEN_TYPE_BYTE = 6
|
|
|
|
|
# LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
|
|
|
|
|
# LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
|
|
|
|
|
# LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
|
|
|
|
|
# LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
|
|
|
|
|
# LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
|
|
|
|
|
# LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
|
|
|
|
|
|
|
|
|
# LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
|
|
|
|
# };
|
|
|
|
@ -202,9 +199,6 @@ LLAMA_FTYPE_MOSTLY_Q4_K_M = 15
|
|
|
|
|
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16
|
|
|
|
|
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17
|
|
|
|
|
LLAMA_FTYPE_MOSTLY_Q6_K = 18
|
|
|
|
|
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19
|
|
|
|
|
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20
|
|
|
|
|
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21
|
|
|
|
|
LLAMA_FTYPE_GUESSED = 1024
|
|
|
|
|
|
|
|
|
|
# enum llama_rope_scaling_type {
|
|
|
|
@ -220,14 +214,6 @@ LLAMA_ROPE_SCALING_LINEAR = 1
|
|
|
|
|
LLAMA_ROPE_SCALING_YARN = 2
|
|
|
|
|
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN
|
|
|
|
|
|
|
|
|
|
# enum llama_split_mode {
|
|
|
|
|
# LLAMA_SPLIT_NONE = 0, // single GPU
|
|
|
|
|
# LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
|
|
|
|
# LLAMA_SPLIT_ROW = 2, // split rows across GPUs
|
|
|
|
|
# };
|
|
|
|
|
LLAMA_SPLIT_NONE = 0
|
|
|
|
|
LLAMA_SPLIT_LAYER = 1
|
|
|
|
|
LLAMA_SPLIT_ROW = 2
|
|
|
|
|
|
|
|
|
|
# typedef struct llama_token_data {
|
|
|
|
|
# llama_token id; // token id
|
|
|
|
@ -241,7 +227,6 @@ class llama_token_data(Structure):
|
|
|
|
|
id (llama_token): token id
|
|
|
|
|
logit (float): log-odds of the token
|
|
|
|
|
p (float): probability of the token"""
|
|
|
|
|
|
|
|
|
|
_fields_ = [
|
|
|
|
|
("id", llama_token),
|
|
|
|
|
("logit", c_float),
|
|
|
|
@ -264,7 +249,6 @@ class llama_token_data_array(Structure):
|
|
|
|
|
data (ctypes.Array[llama_token_data]): token data
|
|
|
|
|
size (int): size of the array
|
|
|
|
|
sorted (bool): whether the array is sorted"""
|
|
|
|
|
|
|
|
|
|
_fields_ = [
|
|
|
|
|
("data", llama_token_data_p),
|
|
|
|
|
("size", c_size_t),
|
|
|
|
@ -319,8 +303,7 @@ class llama_batch(Structure):
|
|
|
|
|
token (ctypes.Array[llama_token]): the token ids of the input (used when embd is NULL)
|
|
|
|
|
embd (ctypes.Array[ctypes.c_float]): token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
|
|
|
|
|
pos (ctypes.Array[ctypes.Array[llama_pos]]): the positions of the respective token in the sequence
|
|
|
|
|
seq_id (ctypes.Array[ctypes.Array[llama_seq_id]]): the sequence to which the respective token belongs
|
|
|
|
|
"""
|
|
|
|
|
seq_id (ctypes.Array[ctypes.Array[llama_seq_id]]): the sequence to which the respective token belongs"""
|
|
|
|
|
|
|
|
|
|
_fields_ = [
|
|
|
|
|
("n_tokens", c_int32),
|
|
|
|
@ -335,7 +318,6 @@ class llama_batch(Structure):
|
|
|
|
|
("all_seq_id", llama_seq_id),
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# enum llama_model_kv_override_type {
|
|
|
|
|
# LLAMA_KV_OVERRIDE_INT,
|
|
|
|
|
# LLAMA_KV_OVERRIDE_FLOAT,
|
|
|
|
@ -345,7 +327,6 @@ LLAMA_KV_OVERRIDE_INT = 0
|
|
|
|
|
LLAMA_KV_OVERRIDE_FLOAT = 1
|
|
|
|
|
LLAMA_KV_OVERRIDE_BOOL = 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# struct llama_model_kv_override {
|
|
|
|
|
# char key[128];
|
|
|
|
|
# enum llama_model_kv_override_type tag;
|
|
|
|
@ -362,7 +343,6 @@ class llama_model_kv_override_value(CtypesUnion):
|
|
|
|
|
("bool_value", c_bool),
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class llama_model_kv_override(Structure):
|
|
|
|
|
_fields_ = [
|
|
|
|
|
("key", ctypes.c_char * 128),
|
|
|
|
@ -370,25 +350,15 @@ class llama_model_kv_override(Structure):
|
|
|
|
|
("value", llama_model_kv_override_value),
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# struct llama_model_params {
|
|
|
|
|
# int32_t n_gpu_layers; // number of layers to store in VRAM
|
|
|
|
|
# enum llama_split_mode split_mode; // how to split the model across multiple GPUs
|
|
|
|
|
|
|
|
|
|
# // main_gpu interpretation depends on split_mode:
|
|
|
|
|
# // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
|
|
|
|
|
# // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
|
|
|
|
|
# // LLAMA_SPLIT_LAYER: ignored
|
|
|
|
|
# int32_t main_gpu;
|
|
|
|
|
|
|
|
|
|
# // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
|
|
|
|
|
# const float * tensor_split;
|
|
|
|
|
# int32_t main_gpu; // the GPU that is used for scratch and small tensors
|
|
|
|
|
# const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
|
|
|
|
|
|
|
|
|
|
# // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
|
|
|
|
|
# // If the provided progress_callback returns true, model loading continues.
|
|
|
|
|
# // If it returns false, model loading is immediately aborted.
|
|
|
|
|
# llama_progress_callback progress_callback;
|
|
|
|
|
|
|
|
|
|
# // context pointer passed to the progress callback
|
|
|
|
|
# void * progress_callback_user_data;
|
|
|
|
|
|
|
|
|
@ -405,19 +375,16 @@ class llama_model_params(Structure):
|
|
|
|
|
|
|
|
|
|
Attributes:
|
|
|
|
|
n_gpu_layers (int): number of layers to store in VRAM
|
|
|
|
|
split_mode (int): how to split the model across multiple GPUs
|
|
|
|
|
main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
|
|
|
|
|
tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
|
|
|
|
|
main_gpu (int): the GPU that is used for scratch and small tensors
|
|
|
|
|
tensor_split (ctypes.Array[ctypes.c_float]): how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
|
|
|
|
|
progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
|
|
|
|
|
progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
|
|
|
|
|
kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
|
|
|
|
|
vocab_only (bool): only load the vocabulary, no weights
|
|
|
|
|
use_mmap (bool): use mmap if possible
|
|
|
|
|
use_mlock (bool): force system to keep model in RAM"""
|
|
|
|
|
|
|
|
|
|
_fields_ = [
|
|
|
|
|
("n_gpu_layers", c_int32),
|
|
|
|
|
("split_mode", c_int),
|
|
|
|
|
("main_gpu", c_int32),
|
|
|
|
|
("tensor_split", c_float_p),
|
|
|
|
|
("progress_callback", llama_progress_callback),
|
|
|
|
@ -449,7 +416,6 @@ class llama_model_params(Structure):
|
|
|
|
|
# enum ggml_type type_k; // data type for K cache
|
|
|
|
|
# enum ggml_type type_v; // data type for V cache
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# // Keep the booleans together to avoid misalignment during copy-by-value.
|
|
|
|
|
# bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
|
|
|
|
|
# bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
|
|
|
@ -478,9 +444,7 @@ class llama_context_params(Structure):
|
|
|
|
|
mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
|
|
|
|
|
logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
|
|
|
|
embedding (bool): embedding mode only
|
|
|
|
|
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU"""
|
|
|
|
|
_fields_ = [
|
|
|
|
|
("seed", c_uint32),
|
|
|
|
|
("n_ctx", c_uint32),
|
|
|
|
@ -536,9 +500,7 @@ class llama_model_quantize_params(Structure):
|
|
|
|
|
allow_requantize (bool): allow quantizing non-f32/f16 tensors
|
|
|
|
|
quantize_output_tensor (bool): quantize output.weight
|
|
|
|
|
only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
|
|
|
|
pure (bool): disable k-quant mixtures and quantize all tensors to the same type
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
pure (bool): disable k-quant mixtures and quantize all tensors to the same type"""
|
|
|
|
|
_fields_ = [
|
|
|
|
|
("nthread", c_int32),
|
|
|
|
|
("ftype", c_int),
|
|
|
|
@ -783,16 +745,13 @@ def llama_n_ctx(ctx: llama_context_p) -> int:
|
|
|
|
|
_lib.llama_n_ctx.argtypes = [llama_context_p]
|
|
|
|
|
_lib.llama_n_ctx.restype = c_uint32
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
|
|
|
|
|
def llama_n_batch(ctx: llama_context_p) -> int:
    """Call the native ``llama_n_batch`` for ``ctx`` and return its result.

    The C API declares the return type as ``uint32_t``.
    """
    return _lib.llama_n_batch(ctx)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ctypes signature for llama_n_batch: one context pointer in, uint32_t out.
_lib.llama_n_batch.argtypes = [
    llama_context_p,
]
_lib.llama_n_batch.restype = c_uint32
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
|
|
|
|
|
def llama_vocab_type(model: llama_model_p) -> int:
    """Call the native ``llama_vocab_type`` for ``model`` and return its result.

    The C API declares the return type as ``enum llama_vocab_type``.
    """
    return _lib.llama_vocab_type(model)
|
|
|
|
@ -1292,40 +1251,6 @@ _lib.llama_kv_cache_seq_shift.argtypes = [
|
|
|
|
|
]
|
|
|
|
|
_lib.llama_kv_cache_seq_shift.restype = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# // Integer division of the positions by factor of `d > 1`
|
|
|
|
|
# // If the KV cache is RoPEd, the KV data is updated accordingly
|
|
|
|
|
# // p0 < 0 : [0, p1]
|
|
|
|
|
# // p1 < 0 : [p0, inf)
|
|
|
|
|
# LLAMA_API void llama_kv_cache_seq_div(
|
|
|
|
|
# struct llama_context * ctx,
|
|
|
|
|
# llama_seq_id seq_id,
|
|
|
|
|
# llama_pos p0,
|
|
|
|
|
# llama_pos p1,
|
|
|
|
|
# int d);
|
|
|
|
|
def llama_kv_cache_seq_div(
    ctx: llama_context_p,
    seq_id: Union[llama_seq_id, int],
    p0: Union[llama_pos, int],
    p1: Union[llama_pos, int],
    d: Union[c_int, int],
):
    """Integer-divide the positions of a sequence by a factor ``d > 1``.

    If the KV cache is RoPEd, the KV data is updated accordingly.

    Range conventions (from the C header):
        p0 < 0 : [0, p1]
        p1 < 0 : [p0, inf)
    """
    return _lib.llama_kv_cache_seq_div(ctx, seq_id, p0, p1, d)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ctypes signature for llama_kv_cache_seq_div, mirroring the C prototype
# (ctx, seq_id, p0, p1, d); the native function returns void.
_lib.llama_kv_cache_seq_div.argtypes = [
    llama_context_p,  # struct llama_context * ctx
    llama_seq_id,  # llama_seq_id seq_id
    llama_pos,  # llama_pos p0
    llama_pos,  # llama_pos p1
    c_int,  # int d
]
_lib.llama_kv_cache_seq_div.restype = None
|
|
|
|
|
|
|
|
|
|
# //
|
|
|
|
|
# // State / sessions
|
|
|
|
|
# //
|
|
|
|
@ -2141,8 +2066,7 @@ def llama_sample_temp(
|
|
|
|
|
|
|
|
|
|
Parameters:
|
|
|
|
|
candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
|
|
|
|
temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
|
|
|
|
"""
|
|
|
|
|
temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text."""
|
|
|
|
|
return _lib.llama_sample_temp(ctx, candidates, temp)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -2190,8 +2114,7 @@ def llama_sample_grammar(
|
|
|
|
|
|
|
|
|
|
Parameters:
|
|
|
|
|
candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
|
|
|
|
grammar: A grammar object containing the rules and constraints to apply to the generated text.
|
|
|
|
|
"""
|
|
|
|
|
grammar: A grammar object containing the rules and constraints to apply to the generated text."""
|
|
|
|
|
return _lib.llama_sample_grammar(ctx, candidates, grammar)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -2231,8 +2154,7 @@ def llama_sample_token_mirostat(
|
|
|
|
|
tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
|
|
|
|
eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
|
|
|
|
m: The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
|
|
|
|
|
mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
|
|
|
|
"""
|
|
|
|
|
mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal."""
|
|
|
|
|
return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -2271,8 +2193,7 @@ def llama_sample_token_mirostat_v2(
|
|
|
|
|
candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
|
|
|
|
tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
|
|
|
|
eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
|
|
|
|
mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
|
|
|
|
"""
|
|
|
|
|
mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal."""
|
|
|
|
|
return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|