From 454c9bb1cb3a58ecc37b58abeff6f245e3b8f316 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 27 May 2024 10:51:57 -0400
Subject: [PATCH] feat: Update llama.cpp

---
 llama_cpp/llama_cpp.py | 21 +++++++++++++++++----
 vendor/llama.cpp       |  2 +-
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 4284019..d9b5087 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -300,6 +300,7 @@ LLAMA_VOCAB_TYPE_WPM = 3
 # LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
 # LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
 # LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
+# LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -315,6 +316,7 @@ LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10
 LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11
 LLAMA_VOCAB_PRE_TYPE_OLMO = 12
 LLAMA_VOCAB_PRE_TYPE_DBRX = 13
+LLAMA_VOCAB_PRE_TYPE_SMAUG = 14
 
 
 # // note: these values should be synchronized with ggml_rope
@@ -718,6 +720,8 @@ class llama_model_params(ctypes.Structure):
     ]
 
 
+# // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
+# // https://github.com/ggerganov/llama.cpp/pull/7544
 # struct llama_context_params {
 #     uint32_t seed;              // RNG seed, -1 for random
 #     uint32_t n_ctx;             // text context, 0 = from model
@@ -744,15 +748,14 @@ class llama_model_params(ctypes.Structure):
 #     ggml_backend_sched_eval_callback cb_eval;
 #     void * cb_eval_user_data;
 
-#     enum ggml_type type_k; // data type for K cache
-#     enum ggml_type type_v; // data type for V cache
+#     enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
+#     enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
 
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 #     bool embeddings;  // if true, extract embeddings (together with logits)
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-#     bool flash_attn;  // whether to use flash attention
-
+#     bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
 
 #     // Abort callback
 #     // if it returns true, execution of llama_decode() will be aborted
@@ -2454,6 +2457,16 @@ def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /)
     ...
 
 
+# // Identify if Token Id is a control token or a render-able token
+# LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
+@ctypes_function(
+    "llama_token_is_control", [llama_model_p_ctypes, llama_token], ctypes.c_bool
+)
+def llama_token_is_control(model: llama_model_p, token: Union[llama_token, int], /) -> bool:
+    """Identify if Token Id is a control token or a render-able token"""
+    ...
+
+
 # // Special tokens
 
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 0df0aa8..5487593 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 0df0aa8e43c3378975269a51f9b876c8692e70da
+Subproject commit 5487593bc7ee0b65b9d2e2985b4b61dc77043101
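
Note (not part of the patch): a minimal usage sketch of the new low-level binding, assuming a placeholder GGUF model path and only functions already exposed by llama_cpp.llama_cpp at this revision (llama_backend_init, llama_model_default_params, llama_load_model_from_file, llama_token_bos, llama_token_is_eog, llama_free_model, llama_backend_free).

# Hypothetical sketch: probe the new llama_token_is_control binding alongside
# the existing llama_token_is_eog. Model path is a placeholder; error handling omitted.
import llama_cpp

llama_cpp.llama_backend_init()
params = llama_cpp.llama_model_default_params()
params.vocab_only = True  # only the vocabulary is needed for token classification
model = llama_cpp.llama_load_model_from_file(b"/path/to/model.gguf", params)

bos = llama_cpp.llama_token_bos(model)
print(llama_cpp.llama_token_is_control(model, bos))  # True for control (non-render-able) tokens
print(llama_cpp.llama_token_is_eog(model, bos))      # end-of-generation check (existing API)

llama_cpp.llama_free_model(model)
llama_cpp.llama_backend_free()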