feat: Update llama.cpp
This commit is contained in:
parent
cf1fdd8a9a
commit
f062a7f51d
3 changed files with 4 additions and 12 deletions
|
@ -86,7 +86,6 @@ class Llama:
|
||||||
yarn_beta_fast: float = 32.0,
|
yarn_beta_fast: float = 32.0,
|
||||||
yarn_beta_slow: float = 1.0,
|
yarn_beta_slow: float = 1.0,
|
||||||
yarn_orig_ctx: int = 0,
|
yarn_orig_ctx: int = 0,
|
||||||
mul_mat_q: bool = True,
|
|
||||||
logits_all: bool = False,
|
logits_all: bool = False,
|
||||||
embedding: bool = False,
|
embedding: bool = False,
|
||||||
offload_kqv: bool = True,
|
offload_kqv: bool = True,
|
||||||
|
@ -291,7 +290,6 @@ class Llama:
|
||||||
yarn_beta_slow if yarn_beta_slow != 0.0 else 0
|
yarn_beta_slow if yarn_beta_slow != 0.0 else 0
|
||||||
)
|
)
|
||||||
self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
|
self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
|
||||||
self.context_params.mul_mat_q = mul_mat_q
|
|
||||||
self.context_params.logits_all = (
|
self.context_params.logits_all = (
|
||||||
logits_all if draft_model is None else True
|
logits_all if draft_model is None else True
|
||||||
) # Must be set to True for speculative decoding
|
) # Must be set to True for speculative decoding
|
||||||
|
@ -1724,7 +1722,6 @@ class Llama:
|
||||||
yarn_beta_fast=self.context_params.yarn_beta_fast,
|
yarn_beta_fast=self.context_params.yarn_beta_fast,
|
||||||
yarn_beta_slow=self.context_params.yarn_beta_slow,
|
yarn_beta_slow=self.context_params.yarn_beta_slow,
|
||||||
yarn_orig_ctx=self.context_params.yarn_orig_ctx,
|
yarn_orig_ctx=self.context_params.yarn_orig_ctx,
|
||||||
mul_mat_q=self.context_params.mul_mat_q,
|
|
||||||
logits_all=self.context_params.logits_all,
|
logits_all=self.context_params.logits_all,
|
||||||
embedding=self.context_params.embedding,
|
embedding=self.context_params.embedding,
|
||||||
# Sampling Params
|
# Sampling Params
|
||||||
|
@ -1768,7 +1765,6 @@ class Llama:
|
||||||
yarn_beta_fast=state["yarn_beta_fast"],
|
yarn_beta_fast=state["yarn_beta_fast"],
|
||||||
yarn_beta_slow=state["yarn_beta_slow"],
|
yarn_beta_slow=state["yarn_beta_slow"],
|
||||||
yarn_orig_ctx=state["yarn_orig_ctx"],
|
yarn_orig_ctx=state["yarn_orig_ctx"],
|
||||||
mul_mat_q=state["mul_mat_q"],
|
|
||||||
logits_all=state["logits_all"],
|
logits_all=state["logits_all"],
|
||||||
embedding=state["embedding"],
|
embedding=state["embedding"],
|
||||||
# Sampling Params
|
# Sampling Params
|
||||||
|
|
|
@ -559,9 +559,7 @@ class llama_model_params(ctypes.Structure):
|
||||||
# enum ggml_type type_k; // data type for K cache
|
# enum ggml_type type_k; // data type for K cache
|
||||||
# enum ggml_type type_v; // data type for V cache
|
# enum ggml_type type_v; // data type for V cache
|
||||||
|
|
||||||
|
|
||||||
# // Keep the booleans together to avoid misalignment during copy-by-value.
|
# // Keep the booleans together to avoid misalignment during copy-by-value.
|
||||||
# bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
|
|
||||||
# bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
# bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
||||||
# bool embedding; // embedding mode only
|
# bool embedding; // embedding mode only
|
||||||
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||||
|
@ -589,7 +587,6 @@ class llama_context_params(ctypes.Structure):
|
||||||
cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
|
cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
|
||||||
type_k (int): data type for K cache
|
type_k (int): data type for K cache
|
||||||
type_v (int): data type for V cache
|
type_v (int): data type for V cache
|
||||||
mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
|
|
||||||
logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
||||||
embedding (bool): embedding mode only
|
embedding (bool): embedding mode only
|
||||||
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
|
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
|
||||||
|
@ -615,7 +612,6 @@ class llama_context_params(ctypes.Structure):
|
||||||
("cb_eval_user_data", ctypes.c_void_p),
|
("cb_eval_user_data", ctypes.c_void_p),
|
||||||
("type_k", ctypes.c_int),
|
("type_k", ctypes.c_int),
|
||||||
("type_v", ctypes.c_int),
|
("type_v", ctypes.c_int),
|
||||||
("mul_mat_q", ctypes.c_bool),
|
|
||||||
("logits_all", ctypes.c_bool),
|
("logits_all", ctypes.c_bool),
|
||||||
("embedding", ctypes.c_bool),
|
("embedding", ctypes.c_bool),
|
||||||
("offload_kqv", ctypes.c_bool),
|
("offload_kqv", ctypes.c_bool),
|
||||||
|
@ -1519,11 +1515,11 @@ def llama_copy_state_data(
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
# Set the state reading from the specified address
|
# // Set the state reading from the specified address
|
||||||
# Returns the number of bytes read
|
# // Returns the number of bytes read
|
||||||
# LLAMA_API size_t llama_set_state_data(
|
# LLAMA_API size_t llama_set_state_data(
|
||||||
# struct llama_context * ctx,
|
# struct llama_context * ctx,
|
||||||
# uint8_t * src);
|
# const uint8_t * src);
|
||||||
@ctypes_function(
|
@ctypes_function(
|
||||||
"llama_set_state_data",
|
"llama_set_state_data",
|
||||||
[llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8)],
|
[llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8)],
|
||||||
|
|
2
vendor/llama.cpp
vendored
2
vendor/llama.cpp
vendored
|
@ -1 +1 @@
|
||||||
Subproject commit 08c5ee87e4cceb603ecceac90734fcdade57311b
|
Subproject commit c2224f003bf9cf558b1a3c57033563e11a4de9a5
|
Loading…
Reference in a new issue