From 454c9bb1cb3a58ecc37b58abeff6f245e3b8f316 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 27 May 2024 10:51:57 -0400
Subject: [PATCH] feat: Update llama.cpp

---
 llama_cpp/llama_cpp.py | 21 +++++++++++++++++----
 vendor/llama.cpp       |  2 +-
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 4284019..d9b5087 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -300,6 +300,7 @@ LLAMA_VOCAB_TYPE_WPM = 3
 # LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
 # LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
 # LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
+# LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -315,6 +316,7 @@ LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10
 LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11
 LLAMA_VOCAB_PRE_TYPE_OLMO = 12
 LLAMA_VOCAB_PRE_TYPE_DBRX = 13
+LLAMA_VOCAB_PRE_TYPE_SMAUG = 14
 
 
 # // note: these values should be synchronized with ggml_rope
@@ -718,6 +720,8 @@ class llama_model_params(ctypes.Structure):
     ]
 
 
+# // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
+# // https://github.com/ggerganov/llama.cpp/pull/7544
 # struct llama_context_params {
 #     uint32_t seed;              // RNG seed, -1 for random
 #     uint32_t n_ctx;             // text context, 0 = from model
@@ -744,15 +748,14 @@ class llama_model_params(ctypes.Structure):
 #     ggml_backend_sched_eval_callback cb_eval;
 #     void * cb_eval_user_data;
 
-#     enum ggml_type type_k; // data type for K cache
-#     enum ggml_type type_v; // data type for V cache
+#     enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
+#     enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
 
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 #     bool embeddings;  // if true, extract embeddings (together with logits)
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-#     bool flash_attn;  // whether to use flash attention
-
+#     bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
 
 #     // Abort callback
 #     // if it returns true, execution of llama_decode() will be aborted
@@ -2454,6 +2457,16 @@ def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /)
     ...
 
 
+# // Identify if Token Id is a control token or a render-able token
+# LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
+@ctypes_function(
+    "llama_token_is_control", [llama_model_p_ctypes, llama_token], ctypes.c_bool
+)
+def llama_token_is_control(model: llama_model_p, token: Union[llama_token, int], /) -> bool:
+    """Identify if Token Id is a control token or a render-able token"""
+    ...
+
+
 # // Special tokens
 
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 0df0aa8..5487593 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 0df0aa8e43c3378975269a51f9b876c8692e70da
+Subproject commit 5487593bc7ee0b65b9d2e2985b4b61dc77043101
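
Note (not part of the patch): a minimal usage sketch of the new low-level binding, assuming a placeholder GGUF model path and only functions already exposed by llama_cpp.llama_cpp at this revision (llama_backend_init, llama_model_default_params, llama_load_model_from_file, llama_token_bos, llama_token_is_eog, llama_free_model, llama_backend_free).

# Hypothetical sketch: probe the new llama_token_is_control binding alongside
# the existing llama_token_is_eog. Model path is a placeholder; error handling omitted.
import llama_cpp

llama_cpp.llama_backend_init()
params = llama_cpp.llama_model_default_params()
params.vocab_only = True  # only the vocabulary is needed for token classification
model = llama_cpp.llama_load_model_from_file(b"/path/to/model.gguf", params)

bos = llama_cpp.llama_token_bos(model)
print(llama_cpp.llama_token_is_control(model, bos))  # True for control (non-render-able) tokens
print(llama_cpp.llama_token_is_eog(model, bos))      # end-of-generation check (existing API)

llama_cpp.llama_free_model(model)
llama_cpp.llama_backend_free()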