feat: Update llama.cpp

This commit is contained in:
Andrei Betlen 2024-03-01 12:57:16 -05:00
parent cf1fdd8a9a
commit f062a7f51d
3 changed files with 4 additions and 12 deletions

View file

@ -86,7 +86,6 @@ class Llama:
yarn_beta_fast: float = 32.0, yarn_beta_fast: float = 32.0,
yarn_beta_slow: float = 1.0, yarn_beta_slow: float = 1.0,
yarn_orig_ctx: int = 0, yarn_orig_ctx: int = 0,
mul_mat_q: bool = True,
logits_all: bool = False, logits_all: bool = False,
embedding: bool = False, embedding: bool = False,
offload_kqv: bool = True, offload_kqv: bool = True,
@ -291,7 +290,6 @@ class Llama:
yarn_beta_slow if yarn_beta_slow != 0.0 else 0 yarn_beta_slow if yarn_beta_slow != 0.0 else 0
) )
self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
self.context_params.mul_mat_q = mul_mat_q
self.context_params.logits_all = ( self.context_params.logits_all = (
logits_all if draft_model is None else True logits_all if draft_model is None else True
) # Must be set to True for speculative decoding ) # Must be set to True for speculative decoding
@ -1724,7 +1722,6 @@ class Llama:
yarn_beta_fast=self.context_params.yarn_beta_fast, yarn_beta_fast=self.context_params.yarn_beta_fast,
yarn_beta_slow=self.context_params.yarn_beta_slow, yarn_beta_slow=self.context_params.yarn_beta_slow,
yarn_orig_ctx=self.context_params.yarn_orig_ctx, yarn_orig_ctx=self.context_params.yarn_orig_ctx,
mul_mat_q=self.context_params.mul_mat_q,
logits_all=self.context_params.logits_all, logits_all=self.context_params.logits_all,
embedding=self.context_params.embedding, embedding=self.context_params.embedding,
# Sampling Params # Sampling Params
@ -1768,7 +1765,6 @@ class Llama:
yarn_beta_fast=state["yarn_beta_fast"], yarn_beta_fast=state["yarn_beta_fast"],
yarn_beta_slow=state["yarn_beta_slow"], yarn_beta_slow=state["yarn_beta_slow"],
yarn_orig_ctx=state["yarn_orig_ctx"], yarn_orig_ctx=state["yarn_orig_ctx"],
mul_mat_q=state["mul_mat_q"],
logits_all=state["logits_all"], logits_all=state["logits_all"],
embedding=state["embedding"], embedding=state["embedding"],
# Sampling Params # Sampling Params

View file

@ -559,9 +559,7 @@ class llama_model_params(ctypes.Structure):
# enum ggml_type type_k; // data type for K cache # enum ggml_type type_k; // data type for K cache
# enum ggml_type type_v; // data type for V cache # enum ggml_type type_v; // data type for V cache
# // Keep the booleans together to avoid misalignment during copy-by-value. # // Keep the booleans together to avoid misalignment during copy-by-value.
# bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
# bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) # bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
# bool embedding; // embedding mode only # bool embedding; // embedding mode only
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
@ -589,7 +587,6 @@ class llama_context_params(ctypes.Structure):
cb_eval_user_data (ctypes.c_void_p): user data for cb_eval cb_eval_user_data (ctypes.c_void_p): user data for cb_eval
type_k (int): data type for K cache type_k (int): data type for K cache
type_v (int): data type for V cache type_v (int): data type for V cache
mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
embedding (bool): embedding mode only embedding (bool): embedding mode only
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
@ -615,7 +612,6 @@ class llama_context_params(ctypes.Structure):
("cb_eval_user_data", ctypes.c_void_p), ("cb_eval_user_data", ctypes.c_void_p),
("type_k", ctypes.c_int), ("type_k", ctypes.c_int),
("type_v", ctypes.c_int), ("type_v", ctypes.c_int),
("mul_mat_q", ctypes.c_bool),
("logits_all", ctypes.c_bool), ("logits_all", ctypes.c_bool),
("embedding", ctypes.c_bool), ("embedding", ctypes.c_bool),
("offload_kqv", ctypes.c_bool), ("offload_kqv", ctypes.c_bool),
@ -1519,11 +1515,11 @@ def llama_copy_state_data(
... ...
# Set the state reading from the specified address # // Set the state reading from the specified address
# Returns the number of bytes read # // Returns the number of bytes read
# LLAMA_API size_t llama_set_state_data( # LLAMA_API size_t llama_set_state_data(
# struct llama_context * ctx, # struct llama_context * ctx,
# uint8_t * src); # const uint8_t * src);
@ctypes_function( @ctypes_function(
"llama_set_state_data", "llama_set_state_data",
[llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8)], [llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8)],

2
vendor/llama.cpp vendored

@ -1 +1 @@
Subproject commit 08c5ee87e4cceb603ecceac90734fcdade57311b Subproject commit c2224f003bf9cf558b1a3c57033563e11a4de9a5