feat: Update llama.cpp
parent cf1fdd8a9a
commit f062a7f51d
3 changed files with 4 additions and 12 deletions
llama_cpp/llama.py

@@ -86,7 +86,6 @@ class Llama:
         yarn_beta_fast: float = 32.0,
         yarn_beta_slow: float = 1.0,
         yarn_orig_ctx: int = 0,
-        mul_mat_q: bool = True,
         logits_all: bool = False,
         embedding: bool = False,
         offload_kqv: bool = True,
@@ -291,7 +290,6 @@ class Llama:
             yarn_beta_slow if yarn_beta_slow != 0.0 else 0
         )
         self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
-        self.context_params.mul_mat_q = mul_mat_q
         self.context_params.logits_all = (
             logits_all if draft_model is None else True
         )  # Must be set to True for speculative decoding
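The inline comment in the hunk above explains why `logits_all` gets forced on whenever a draft model is supplied: speculative decoding has to score the drafted tokens, not just the final position. A minimal sketch of a call site that takes that branch, assuming the `LlamaPromptLookupDecoding` draft helper that ships with these bindings and an illustrative model path (neither is part of this commit):

import llama_cpp
from llama_cpp.llama_speculative import LlamaPromptLookupDecoding

# draft_model is not None, so the constructor forces logits_all=True
llm = llama_cpp.Llama(
    model_path="models/7B/model.gguf",        # illustrative path
    draft_model=LlamaPromptLookupDecoding(),  # prompt-lookup decoding as the draft
)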
@@ -1724,7 +1722,6 @@ class Llama:
             yarn_beta_fast=self.context_params.yarn_beta_fast,
             yarn_beta_slow=self.context_params.yarn_beta_slow,
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
-            mul_mat_q=self.context_params.mul_mat_q,
             logits_all=self.context_params.logits_all,
             embedding=self.context_params.embedding,
             # Sampling Params
@@ -1768,7 +1765,6 @@ class Llama:
             yarn_beta_fast=state["yarn_beta_fast"],
             yarn_beta_slow=state["yarn_beta_slow"],
             yarn_orig_ctx=state["yarn_orig_ctx"],
-            mul_mat_q=state["mul_mat_q"],
             logits_all=state["logits_all"],
             embedding=state["embedding"],
             # Sampling Params
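Taken together, the four hunks above remove mul_mat_q from the high-level API: the constructor parameter, the context-params assignment, and both sides of the pickling state. Upstream llama.cpp dropped the flag because, per the deprecation note, the mul_mat_q kernels are always used now. A minimal before/after sketch for callers (the model path is illustrative):

from llama_cpp import Llama

# Before this commit a caller could pass the deprecated flag:
#   llm = Llama(model_path="models/7B/model.gguf", mul_mat_q=True)
# After it, simply drop the argument; the remaining keywords are unchanged:
llm = Llama(
    model_path="models/7B/model.gguf",  # illustrative path
    logits_all=False,
    embedding=False,
    offload_kqv=True,
)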
llama_cpp/llama_cpp.py

@@ -559,9 +559,7 @@ class llama_model_params(ctypes.Structure):
 #     enum ggml_type type_k; // data type for K cache
 #     enum ggml_type type_v; // data type for V cache
 
-
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
-#     bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
 #     bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 #     bool embedding;   // embedding mode only
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
@@ -589,7 +587,6 @@ class llama_context_params(ctypes.Structure):
         cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
         type_k (int): data type for K cache
         type_v (int): data type for V cache
-        mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embedding (bool): embedding mode only
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
@@ -615,7 +612,6 @@ class llama_context_params(ctypes.Structure):
         ("cb_eval_user_data", ctypes.c_void_p),
         ("type_k", ctypes.c_int),
         ("type_v", ctypes.c_int),
-        ("mul_mat_q", ctypes.c_bool),
         ("logits_all", ctypes.c_bool),
         ("embedding", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
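The _fields_ hunk is the one that matters at runtime: ctypes computes each field's byte offset from the order and size of the entries before it, so once llama.h drops bool mul_mat_q the Python mirror must drop it too, or every later field (logits_all, embedding, offload_kqv) would be read one byte off — exactly the misalignment the "keep the booleans together" comment warns about. A toy sketch of that offset arithmetic, using made-up structs rather than the real bindings:

import ctypes

class StaleMirror(ctypes.Structure):
    # keeps a bool the C side no longer has
    _fields_ = [
        ("type_v", ctypes.c_int),
        ("mul_mat_q", ctypes.c_bool),
        ("logits_all", ctypes.c_bool),
    ]

class CorrectMirror(ctypes.Structure):
    # matches the C layout after this commit
    _fields_ = [
        ("type_v", ctypes.c_int),
        ("logits_all", ctypes.c_bool),
    ]

print(StaleMirror.logits_all.offset)    # 5: shifted by the stale bool
print(CorrectMirror.logits_all.offset)  # 4: matches the C struct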
@@ -1519,11 +1515,11 @@ def llama_copy_state_data(
     ...
 
 
-# Set the state reading from the specified address
-# Returns the number of bytes read
+# // Set the state reading from the specified address
+# // Returns the number of bytes read
 # LLAMA_API size_t llama_set_state_data(
 #         struct llama_context * ctx,
-#         uint8_t * src);
+#         const uint8_t * src);
 @ctypes_function(
     "llama_set_state_data",
     [llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8)],
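The signature change here is cosmetic on the Python side — the C const qualifier does not alter the ctypes argtype, which stays POINTER(ctypes.c_uint8). Still, the save/restore pair around this hunk deserves a usage note. A minimal sketch of snapshotting and restoring context state through these low-level bindings, assuming ctx is a llama_context_p created via llama_cpp.llama_new_context_with_model (not shown here):

import ctypes
import llama_cpp

def snapshot(ctx) -> bytes:
    # Allocate the maximum state size, then keep only the bytes written.
    n = llama_cpp.llama_get_state_size(ctx)
    buf = (ctypes.c_uint8 * n)()
    written = llama_cpp.llama_copy_state_data(ctx, buf)
    return bytes(buf[:written])

def restore(ctx, blob: bytes) -> None:
    # llama_set_state_data reads back the bytes produced by snapshot().
    buf = (ctypes.c_uint8 * len(blob)).from_buffer_copy(blob)
    llama_cpp.llama_set_state_data(ctx, buf)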
vendor/llama.cpp (vendored submodule, 2 changes)

@@ -1 +1 @@
-Subproject commit 08c5ee87e4cceb603ecceac90734fcdade57311b
+Subproject commit c2224f003bf9cf558b1a3c57033563e11a4de9a5