Update llama.cpp

2023-12-11 10:21:35 -05:00 · 2023-12-11 10:21:35 -05:00 · f1edc66b21
commit f1edc66b21
parent f3b844ed0a
2 changed files with 46 additions and 7 deletions
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@ -103,8 +103,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E

 # define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 2
-LLAMA_SESSION_VERSION = 2
+# define LLAMA_SESSION_VERSION 3
+LLAMA_SESSION_VERSION = 3


 # struct llama_model;
@ -309,6 +309,35 @@ class llama_batch(Structure):
        ("all_seq_id", llama_seq_id),
    ]

+# enum llama_model_kv_override_type {
+#     LLAMA_KV_OVERRIDE_INT,
+#     LLAMA_KV_OVERRIDE_FLOAT,
+#     LLAMA_KV_OVERRIDE_BOOL,
+# };
+class llama_model_kv_override_type(Structure):
+    _fields_ = [
+        ("LLAMA_KV_OVERRIDE_INT", c_int),
+        ("LLAMA_KV_OVERRIDE_FLOAT", c_int),
+        ("LLAMA_KV_OVERRIDE_BOOL", c_int),
+    ]
+
+# struct llama_model_kv_override {
+#     char key[128];
+#     enum llama_model_kv_override_type tag;
+#     union {
+#         int64_t int_value;
+#         double float_value;
+#         bool bool_value;
+#     };
+# };
+class llama_model_kv_override(Structure):
+    _fields_ = [
+        ("key", ctypes.c_char * 128),
+        ("tag", llama_model_kv_override_type),
+        ("int_value", ctypes.c_int64),
+        ("float_value", c_double),
+        ("bool_value", c_bool),
+    ]

 # struct llama_model_params {
 #     int32_t n_gpu_layers; // number of layers to store in VRAM
@ -320,6 +349,8 @@ class llama_batch(Structure):
 #     // context pointer passed to the progress callback
 #     void * progress_callback_user_data;

+#     // override key-value pairs of the model meta data
+#     const struct llama_model_kv_override * kv_overrides;

 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool vocab_only; // only load the vocabulary, no weights
@ -335,6 +366,7 @@ class llama_model_params(Structure):
        tensor_split (ctypes.Array[ctypes.c_float]): how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
        progress_callback (llama_progress_callback): called with a progress value between 0 and 1, pass NULL to disable
        progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
+        kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
        vocab_only (bool): only load the vocabulary, no weights
        use_mmap (bool): use mmap if possible
        use_mlock (bool): force system to keep model in RAM"""
@ -344,6 +376,7 @@ class llama_model_params(Structure):
        ("tensor_split", c_float_p),
        ("progress_callback", llama_progress_callback),
        ("progress_callback_user_data", c_void_p),
+        ("kv_overrides", POINTER(llama_model_kv_override)),
        ("vocab_only", c_bool),
        ("use_mmap", c_bool),
        ("use_mlock", c_bool),
@ -367,12 +400,14 @@ class llama_model_params(Structure):
 #     float    yarn_beta_slow;   // YaRN high correction dim
 #     uint32_t yarn_orig_ctx;    // YaRN original context size

+#     enum ggml_type type_k; // data type for K cache
+#     enum ggml_type type_v; // data type for V cache

 #     // Keep the booleans together to avoid misalignment during copy-by-value.
-#     bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-#     bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
-#     bool logits_all; // the llama_eval() call computes all logits, not just the last one
-#     bool embedding;  // embedding mode only
+#     bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+#     bool logits_all;  // the llama_eval() call computes all logits, not just the last one
+#     bool embedding;   // embedding mode only
+#     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
 # };
 class llama_context_params(Structure):
    """Parameters for llama_context
@ -391,6 +426,8 @@ class llama_context_params(Structure):
        yarn_beta_fast (float): YaRN low correction dim
        yarn_beta_slow (float): YaRN high correction dim
        yarn_orig_ctx (int): YaRN original context size
+        type_k (int): data type for K cache
+        type_v (int): data type for V cache
        mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
        f16_kv (bool): use fp16 for KV cache, fp32 otherwise
        logits_all (bool): the llama_eval() call computes all logits, not just the last one
@ -409,6 +446,8 @@ class llama_context_params(Structure):
        ("yarn_beta_fast", c_float),
        ("yarn_beta_slow", c_float),
        ("yarn_orig_ctx", c_uint32),
+        ("type_k", c_int),
+        ("type_v", c_int),
        ("mul_mat_q", c_bool),
        ("f16_kv", c_bool),
        ("logits_all", c_bool),
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@ -1 +1 @@
-Subproject commit 1f5cd83275fabb43f2ae92c30033b384a3eb37b4
+Subproject commit 8a7b2fa528f130631a5f43648481596ab320ed5a