feat: Update llama.cpp
parent cf1fdd8a9a
commit f062a7f51d
3 changed files with 4 additions and 12 deletions
llama_cpp/llama.py

@@ -86,7 +86,6 @@ class Llama:
         yarn_beta_fast: float = 32.0,
         yarn_beta_slow: float = 1.0,
         yarn_orig_ctx: int = 0,
-        mul_mat_q: bool = True,
         logits_all: bool = False,
         embedding: bool = False,
         offload_kqv: bool = True,
@@ -291,7 +290,6 @@ class Llama:
             yarn_beta_slow if yarn_beta_slow != 0.0 else 0
         )
         self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
-        self.context_params.mul_mat_q = mul_mat_q
         self.context_params.logits_all = (
             logits_all if draft_model is None else True
         )  # Must be set to True for speculative decoding
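The inline comment in the hunk above explains why `logits_all` gets forced on whenever a draft model is supplied: speculative decoding has to score the drafted tokens, not just the final position. A minimal sketch of a call site that takes that branch, assuming the `LlamaPromptLookupDecoding` draft helper that ships with these bindings and an illustrative model path (neither is part of this commit):

import llama_cpp
from llama_cpp.llama_speculative import LlamaPromptLookupDecoding

# draft_model is not None, so the constructor forces logits_all=True
llm = llama_cpp.Llama(
    model_path="models/7B/model.gguf",        # illustrative path
    draft_model=LlamaPromptLookupDecoding(),  # prompt-lookup decoding as the draft
)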
@@ -1724,7 +1722,6 @@ class Llama:
             yarn_beta_fast=self.context_params.yarn_beta_fast,
             yarn_beta_slow=self.context_params.yarn_beta_slow,
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
-            mul_mat_q=self.context_params.mul_mat_q,
             logits_all=self.context_params.logits_all,
             embedding=self.context_params.embedding,
             # Sampling Params
@@ -1768,7 +1765,6 @@ class Llama:
             yarn_beta_fast=state["yarn_beta_fast"],
             yarn_beta_slow=state["yarn_beta_slow"],
             yarn_orig_ctx=state["yarn_orig_ctx"],
-            mul_mat_q=state["mul_mat_q"],
             logits_all=state["logits_all"],
             embedding=state["embedding"],
             # Sampling Params
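Taken together, the four hunks above remove mul_mat_q from the high-level API: the constructor parameter, the context-params assignment, and both sides of the pickling state. Upstream llama.cpp dropped the flag because, per the deprecation note, the mul_mat_q kernels are always used now. A minimal before/after sketch for callers (the model path is illustrative):

from llama_cpp import Llama

# Before this commit a caller could pass the deprecated flag:
#   llm = Llama(model_path="models/7B/model.gguf", mul_mat_q=True)
# After it, simply drop the argument; the remaining keywords are unchanged:
llm = Llama(
    model_path="models/7B/model.gguf",  # illustrative path
    logits_all=False,
    embedding=False,
    offload_kqv=True,
)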
llama_cpp/llama_cpp.py

@@ -559,9 +559,7 @@ class llama_model_params(ctypes.Structure):
 #     enum ggml_type type_k; // data type for K cache
 #     enum ggml_type type_v; // data type for V cache
 
-
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
-#     bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
 #     bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 #     bool embedding;   // embedding mode only
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
@@ -589,7 +587,6 @@ class llama_context_params(ctypes.Structure):
         cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
         type_k (int): data type for K cache
         type_v (int): data type for V cache
-        mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embedding (bool): embedding mode only
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
@@ -615,7 +612,6 @@ class llama_context_params(ctypes.Structure):
         ("cb_eval_user_data", ctypes.c_void_p),
         ("type_k", ctypes.c_int),
         ("type_v", ctypes.c_int),
-        ("mul_mat_q", ctypes.c_bool),
         ("logits_all", ctypes.c_bool),
         ("embedding", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
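The _fields_ hunk is the one that matters at runtime: ctypes computes each field's byte offset from the order and size of the entries before it, so once llama.h drops bool mul_mat_q the Python mirror must drop it too, or every later field (logits_all, embedding, offload_kqv) would be read one byte off — exactly the misalignment the "keep the booleans together" comment warns about. A toy sketch of that offset arithmetic, using made-up structs rather than the real bindings:

import ctypes

class StaleMirror(ctypes.Structure):
    # keeps a bool the C side no longer has
    _fields_ = [
        ("type_v", ctypes.c_int),
        ("mul_mat_q", ctypes.c_bool),
        ("logits_all", ctypes.c_bool),
    ]

class CorrectMirror(ctypes.Structure):
    # matches the C layout after this commit
    _fields_ = [
        ("type_v", ctypes.c_int),
        ("logits_all", ctypes.c_bool),
    ]

print(StaleMirror.logits_all.offset)    # 5: shifted by the stale bool
print(CorrectMirror.logits_all.offset)  # 4: matches the C struct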
@@ -1519,11 +1515,11 @@ def llama_copy_state_data(
     ...
 
 
-# Set the state reading from the specified address
-# Returns the number of bytes read
+# // Set the state reading from the specified address
+# // Returns the number of bytes read
 # LLAMA_API size_t llama_set_state_data(
 #         struct llama_context * ctx,
-#         uint8_t * src);
+#         const uint8_t * src);
 @ctypes_function(
     "llama_set_state_data",
     [llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8)],
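The signature change here is cosmetic on the Python side — the C const qualifier does not alter the ctypes argtype, which stays POINTER(ctypes.c_uint8). Still, the save/restore pair around this hunk deserves a usage note. A minimal sketch of snapshotting and restoring context state through these low-level bindings, assuming ctx is a llama_context_p created via llama_cpp.llama_new_context_with_model (not shown here):

import ctypes
import llama_cpp

def snapshot(ctx) -> bytes:
    # Allocate the maximum state size, then keep only the bytes written.
    n = llama_cpp.llama_get_state_size(ctx)
    buf = (ctypes.c_uint8 * n)()
    written = llama_cpp.llama_copy_state_data(ctx, buf)
    return bytes(buf[:written])

def restore(ctx, blob: bytes) -> None:
    # llama_set_state_data reads back the bytes produced by snapshot().
    buf = (ctypes.c_uint8 * len(blob)).from_buffer_copy(blob)
    llama_cpp.llama_set_state_data(ctx, buf)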
vendor/llama.cpp (vendored submodule, 2 changes)

@@ -1 +1 @@
-Subproject commit 08c5ee87e4cceb603ecceac90734fcdade57311b
+Subproject commit c2224f003bf9cf558b1a3c57033563e11a4de9a5