Update llama.cpp
commit f1edc66b21
parent f3b844ed0a

2 changed files with 46 additions and 7 deletions
@@ -103,8 +103,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E
 # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 2
-LLAMA_SESSION_VERSION = 2
+# define LLAMA_SESSION_VERSION 3
+LLAMA_SESSION_VERSION = 3


 # struct llama_model;
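The session-file version bump from 2 to 3 means state files saved by older builds will be rejected on load. A minimal sketch for checking a session file's version up front, assuming the usual llama.cpp layout of a 4-byte magic followed by a uint32 version, both little-endian (an assumption about the on-disk format, not something this diff shows):

import struct

def session_file_version(path: str) -> int:
    # Assumed layout: 4-byte magic (LLAMA_FILE_MAGIC_GGSN, "ggsn")
    # followed by a uint32 version, both little-endian.
    with open(path, "rb") as f:
        magic, version = struct.unpack("<II", f.read(8))
    if magic != 0x6767736E:  # LLAMA_FILE_MAGIC_GGSN
        raise ValueError(f"{path} is not a llama.cpp session file")
    return version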
@@ -309,6 +309,35 @@ class llama_batch(Structure):
         ("all_seq_id", llama_seq_id),
     ]

+# enum llama_model_kv_override_type {
+#     LLAMA_KV_OVERRIDE_INT,
+#     LLAMA_KV_OVERRIDE_FLOAT,
+#     LLAMA_KV_OVERRIDE_BOOL,
+# };
+class llama_model_kv_override_type(Structure):
+    _fields_ = [
+        ("LLAMA_KV_OVERRIDE_INT", c_int),
+        ("LLAMA_KV_OVERRIDE_FLOAT", c_int),
+        ("LLAMA_KV_OVERRIDE_BOOL", c_int),
+    ]
+
+# struct llama_model_kv_override {
+#     char key[128];
+#     enum llama_model_kv_override_type tag;
+#     union {
+#         int64_t int_value;
+#         double float_value;
+#         bool bool_value;
+#     };
+# };
+class llama_model_kv_override(Structure):
+    _fields_ = [
+        ("key", ctypes.c_char * 128),
+        ("tag", llama_model_kv_override_type),
+        ("int_value", ctypes.c_int64),
+        ("float_value", c_double),
+        ("bool_value", c_bool),
+    ]

 # struct llama_model_params {
 #     int32_t n_gpu_layers; // number of layers to store in VRAM
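Note that the added binding flattens the C union into three sibling fields and models the C enum as a Structure. For comparison, a self-contained ctypes mirror that keeps the union and uses a plain c_int tag; this is purely a sketch of the C layout quoted in the comment above, with local names that are not part of the binding:

import ctypes

class KvOverrideValue(ctypes.Union):
    # union { int64_t int_value; double float_value; bool bool_value; };
    _fields_ = [
        ("int_value", ctypes.c_int64),
        ("float_value", ctypes.c_double),
        ("bool_value", ctypes.c_bool),
    ]

class KvOverride(ctypes.Structure):
    # struct llama_model_kv_override { char key[128]; enum ... tag; union { ... }; };
    _anonymous_ = ("value",)
    _fields_ = [
        ("key", ctypes.c_char * 128),
        ("tag", ctypes.c_int),  # 0=INT, 1=FLOAT, 2=BOOL (default C enum numbering)
        ("value", KvOverrideValue),
    ]

ov = KvOverride(key=b"example.key", tag=0)  # hypothetical key, INT override
ov.int_value = 42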
@@ -320,6 +349,8 @@ class llama_batch(Structure):
 #     // context pointer passed to the progress callback
 #     void * progress_callback_user_data;

+#     // override key-value pairs of the model meta data
+#     const struct llama_model_kv_override * kv_overrides;

 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool vocab_only; // only load the vocabulary, no weights
@@ -335,6 +366,7 @@ class llama_model_params(Structure):
         tensor_split (ctypes.Array[ctypes.c_float]): how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
         progress_callback (llama_progress_callback): called with a progress value between 0 and 1, pass NULL to disable
         progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
+        kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
         vocab_only (bool): only load the vocabulary, no weights
         use_mmap (bool): use mmap if possible
         use_mlock (bool): force system to keep model in RAM"""
@@ -344,6 +376,7 @@ class llama_model_params(Structure):
         ("tensor_split", c_float_p),
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", c_void_p),
+        ("kv_overrides", POINTER(llama_model_kv_override)),
         ("vocab_only", c_bool),
         ("use_mmap", c_bool),
         ("use_mlock", c_bool),
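With the pointer field in place, a caller can hand the loader an override array. A hedged sketch: the key name is made up, and the convention that the array ends with a zero-filled entry (empty key) is an assumption about how llama.cpp walks the list, not something this diff states. The zero-initialized tag bytes correspond to LLAMA_KV_OVERRIDE_INT, the first enumerator.

overrides = (llama_model_kv_override * 2)()  # second, zeroed entry terminates the list
overrides[0].key = b"some.metadata.key"      # hypothetical key
overrides[0].int_value = 1                   # tag left zeroed == LLAMA_KV_OVERRIDE_INT

params = llama_model_default_params()
params.kv_overrides = overrides              # ctypes converts the array to the POINTER field
# params would then be passed to llama_load_model_from_file(...)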
@@ -367,12 +400,14 @@ class llama_model_params(Structure):
 #     float    yarn_beta_slow;  // YaRN high correction dim
 #     uint32_t yarn_orig_ctx;   // YaRN original context size

+#     enum ggml_type type_k; // data type for K cache
+#     enum ggml_type type_v; // data type for V cache
+
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-#     bool f16_kv;      // use fp16 for KV cache, fp32 otherwise
 #     bool logits_all;  // the llama_eval() call computes all logits, not just the last one
 #     bool embedding;   // embedding mode only
+#     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
 # };
 class llama_context_params(Structure):
     """Parameters for llama_context
@@ -391,6 +426,8 @@ class llama_context_params(Structure):
         yarn_beta_fast (float): YaRN low correction dim
         yarn_beta_slow (float): YaRN high correction dim
         yarn_orig_ctx (int): YaRN original context size
+        type_k (int): data type for K cache
+        type_v (int): data type for V cache
         mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         f16_kv (bool): use fp16 for KV cache, fp32 otherwise
         logits_all (bool): the llama_eval() call computes all logits, not just the last one
@@ -409,6 +446,8 @@ class llama_context_params(Structure):
         ("yarn_beta_fast", c_float),
         ("yarn_beta_slow", c_float),
         ("yarn_orig_ctx", c_uint32),
+        ("type_k", c_int),
+        ("type_v", c_int),
         ("mul_mat_q", c_bool),
         ("f16_kv", c_bool),
         ("logits_all", c_bool),
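A sketch of what the new context fields enable: quantizing the KV cache. The ggml_type values below are assumptions taken from ggml.h of roughly this vintage rather than from this diff, so verify them against your header before relying on them.

GGML_TYPE_F16 = 1   # assumed ggml.h enum value
GGML_TYPE_Q8_0 = 8  # assumed ggml.h enum value

ctx_params = llama_context_default_params()
ctx_params.type_k = GGML_TYPE_Q8_0  # 8-bit quantized K cache
ctx_params.type_v = GGML_TYPE_F16   # fp16 V cache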
vendor/llama.cpp (vendored submodule, 2 changes)

@@ -1 +1 @@
-Subproject commit 1f5cd83275fabb43f2ae92c30033b384a3eb37b4
+Subproject commit 8a7b2fa528f130631a5f43648481596ab320ed5a