From f1edc66b21f05a0b593a0cd4a478b8a8a06ae69a Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 11 Dec 2023 10:21:35 -0500
Subject: [PATCH] Update llama.cpp

---
 llama_cpp/llama_cpp.py | 51 +++++++++++++++++++++++++++++++++++++-----
 vendor/llama.cpp       |  2 +-
 2 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 4e4c870..6c442d0 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -103,8 +103,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E
 
 # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 2
-LLAMA_SESSION_VERSION = 2
+# define LLAMA_SESSION_VERSION 3
+LLAMA_SESSION_VERSION = 3
 
 
 # struct llama_model;
@@ -309,6 +309,35 @@ class llama_batch(Structure):
         ("all_seq_id", llama_seq_id),
     ]
 
+# enum llama_model_kv_override_type {
+#     LLAMA_KV_OVERRIDE_INT,
+#     LLAMA_KV_OVERRIDE_FLOAT,
+#     LLAMA_KV_OVERRIDE_BOOL,
+# };
+class llama_model_kv_override_type(Structure):
+    _fields_ = [
+        ("LLAMA_KV_OVERRIDE_INT", c_int),
+        ("LLAMA_KV_OVERRIDE_FLOAT", c_int),
+        ("LLAMA_KV_OVERRIDE_BOOL", c_int),
+    ]
+
+# struct llama_model_kv_override {
+#     char key[128];
+#     enum llama_model_kv_override_type tag;
+#     union {
+#         int64_t int_value;
+#         double float_value;
+#         bool bool_value;
+#     };
+# };
+class llama_model_kv_override(Structure):
+    _fields_ = [
+        ("key", ctypes.c_char * 128),
+        ("tag", llama_model_kv_override_type),
+        ("int_value", ctypes.c_int64),
+        ("float_value", c_double),
+        ("bool_value", c_bool),
+    ]
 
 # struct llama_model_params {
 #     int32_t n_gpu_layers; // number of layers to store in VRAM
@@ -320,6 +349,8 @@ class llama_batch(Structure):
 #     // context pointer passed to the progress callback
 #     void * progress_callback_user_data;
 
+#     // override key-value pairs of the model meta data
+#     const struct llama_model_kv_override * kv_overrides;
 
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool vocab_only; // only load the vocabulary, no weights
@@ -335,6 +366,7 @@ class llama_model_params(Structure):
         tensor_split (ctypes.Array[ctypes.c_float]): how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
         progress_callback (llama_progress_callback): called with a progress value between 0 and 1, pass NULL to disable
         progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
+        kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
         vocab_only (bool): only load the vocabulary, no weights
         use_mmap (bool): use mmap if possible
         use_mlock (bool): force system to keep model in RAM"""
@@ -344,6 +376,7 @@ class llama_model_params(Structure):
         ("tensor_split", c_float_p),
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", c_void_p),
+        ("kv_overrides", POINTER(llama_model_kv_override)),
         ("vocab_only", c_bool),
         ("use_mmap", c_bool),
         ("use_mlock", c_bool),
@@ -367,12 +400,14 @@ class llama_model_params(Structure):
 #     float    yarn_beta_slow;   // YaRN high correction dim
 #     uint32_t yarn_orig_ctx;    // YaRN original context size
 
+#     enum ggml_type type_k; // data type for K cache
+#     enum ggml_type type_v; // data type for V cache
 
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
-#     bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-#     bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
-#     bool logits_all; // the llama_eval() call computes all logits, not just the last one
-#     bool embedding;  // embedding mode only
+#     bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+#     bool logits_all;  // the llama_eval() call computes all logits, not just the last one
+#     bool embedding;   // embedding mode only
+#     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
 # };
 class llama_context_params(Structure):
     """Parameters for llama_context
@@ -391,6 +426,8 @@ class llama_context_params(Structure):
         yarn_beta_fast (float): YaRN low correction dim
         yarn_beta_slow (float): YaRN high correction dim
         yarn_orig_ctx (int): YaRN original context size
+        type_k (int): data type for K cache
+        type_v (int): data type for V cache
         mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         f16_kv (bool): use fp16 for KV cache, fp32 otherwise
         logits_all (bool): the llama_eval() call computes all logits, not just the last one
@@ -409,6 +446,8 @@ class llama_context_params(Structure):
         ("yarn_beta_fast", c_float),
         ("yarn_beta_slow", c_float),
         ("yarn_orig_ctx", c_uint32),
+        ("type_k", c_int),
+        ("type_v", c_int),
         ("mul_mat_q", c_bool),
         ("f16_kv", c_bool),
         ("logits_all", c_bool),
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 1f5cd83..8a7b2fa 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 1f5cd83275fabb43f2ae92c30033b384a3eb37b4
+Subproject commit 8a7b2fa528f130631a5f43648481596ab320ed5a
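
Below is a minimal usage sketch, not part of the patch itself, showing how the fields this update exposes might be reached through the low-level ctypes bindings. The model path is hypothetical, and the numeric ggml_type values used for the K/V caches (0 = F32, 1 = F16) follow ggml's enum order rather than any constant exported by llama_cpp.py.

import llama_cpp

# One-time backend init; this vintage of the bindings takes a NUMA flag.
llama_cpp.llama_backend_init(False)

model_params = llama_cpp.llama_model_default_params()
# The new kv_overrides field expects a pointer to an array of
# llama_model_kv_override records; it is left NULL here (no overrides).

ctx_params = llama_cpp.llama_context_default_params()
ctx_params.type_k = 1  # assumed GGML_TYPE_F16 for the K cache
ctx_params.type_v = 1  # assumed GGML_TYPE_F16 for the V cache

# Hypothetical model path for the sketch.
model = llama_cpp.llama_load_model_from_file(b"./models/model.gguf", model_params)
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

# ... run inference ...

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)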