From 454c9bb1cb3a58ecc37b58abeff6f245e3b8f316 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 27 May 2024 10:51:57 -0400
Subject: [PATCH 1/7] feat: Update llama.cpp

---
 llama_cpp/llama_cpp.py | 21 +++++++++++++++++----
 vendor/llama.cpp       |  2 +-
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 4284019..d9b5087 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -300,6 +300,7 @@ LLAMA_VOCAB_TYPE_WPM = 3
 # LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
 # LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
 # LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
+# LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -315,6 +316,7 @@ LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10
 LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11
 LLAMA_VOCAB_PRE_TYPE_OLMO = 12
 LLAMA_VOCAB_PRE_TYPE_DBRX = 13
+LLAMA_VOCAB_PRE_TYPE_SMAUG = 14


 # // note: these values should be synchronized with ggml_rope
@@ -718,6 +720,8 @@ class llama_model_params(ctypes.Structure):
     ]


+# // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
+# // https://github.com/ggerganov/llama.cpp/pull/7544
 # struct llama_context_params {
 #     uint32_t seed;  // RNG seed, -1 for random
 #     uint32_t n_ctx; // text context, 0 = from model
@@ -744,15 +748,14 @@ class llama_model_params(ctypes.Structure):
 #     ggml_backend_sched_eval_callback cb_eval;
 #     void * cb_eval_user_data;

-#     enum ggml_type type_k; // data type for K cache
-#     enum ggml_type type_v; // data type for V cache
+#     enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
+#     enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 #     bool embeddings;  // if true, extract embeddings (together with logits)
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-#     bool flash_attn;  // whether to use flash attention
-
+#     bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]

 #     // Abort callback
 #     // if it returns true, execution of llama_decode() will be aborted
@@ -2454,6 +2457,16 @@ def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /)
     ...


+# // Identify if Token Id is a control token or a render-able token
+# LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
+@ctypes_function(
+    "llama_token_is_control", [llama_model_p_ctypes, llama_token], ctypes.c_bool
+)
+def llama_token_is_control(model: llama_model_p, token: Union[llama_token, int], /) -> bool:
+    """Identify if Token Id is a control token or a render-able token"""
+    ...
+
+
 # // Special tokens


diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 0df0aa8..5487593 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 0df0aa8e43c3378975269a51f9b876c8692e70da
+Subproject commit 5487593bc7ee0b65b9d2e2985b4b61dc77043101
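
The `llama_token_is_control` binding added above mirrors the upstream llama.cpp declaration one-to-one. A minimal usage sketch against the low-level ctypes API follows; the GGUF path is a placeholder, and loading with `vocab_only` is just a way to keep the example light:

    import llama_cpp

    llama_cpp.llama_backend_init()
    params = llama_cpp.llama_model_default_params()
    params.vocab_only = True  # only the vocabulary is needed to classify tokens
    model = llama_cpp.llama_load_model_from_file(b"./models/model.gguf", params)

    n_vocab = llama_cpp.llama_n_vocab(model)
    control_tokens = [
        t for t in range(n_vocab) if llama_cpp.llama_token_is_control(model, t)
    ]
    print(f"{len(control_tokens)} control tokens out of {n_vocab}")

    llama_cpp.llama_free_model(model)
    llama_cpp.llama_backend_free()
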
From c564007ff6c93d3408031db9562de13d50112707 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 27 May 2024 10:57:17 -0400
Subject: [PATCH 2/7] chore(deps): bump pypa/cibuildwheel from 2.18.0 to 2.18.1 (#1472)

updated-dependencies:
- dependency-name: pypa/cibuildwheel
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Andrei
---
 .github/workflows/build-and-release.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index b50da48..957912d 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -29,7 +29,7 @@ jobs:
           python -m pip install -e .[all]

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.18.0
+        uses: pypa/cibuildwheel@v2.18.1
         env:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -56,7 +56,7 @@ jobs:
           platforms: linux/arm64

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.18.0
+        uses: pypa/cibuildwheel@v2.18.1
         env:
           CIBW_SKIP: "*musllinux* pp*"
           CIBW_REPAIR_WHEEL_COMMAND: ""

From c26004b1be20c485ea547a87b3871551f6a8774c Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 28 May 2024 22:52:03 -0400
Subject: [PATCH 3/7] feat: Update llama.cpp

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 5487593..504f0c3 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 5487593bc7ee0b65b9d2e2985b4b61dc77043101
+Subproject commit 504f0c340f6b5e04de682f6ddefdd3b81208df5d

From 2907c26906272a333ab913054f7ae7cf2bbd94ed Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 28 May 2024 22:52:28 -0400
Subject: [PATCH 4/7] misc: Update debug build to keep all debug symbols for easier gdb debugging

---
 Makefile | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 83085d2..3796d17 100644
--- a/Makefile
+++ b/Makefile
@@ -13,7 +13,13 @@ build:
 	python3 -m pip install --verbose -e .

 build.debug:
-	CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" python3 -m pip install --verbose --config-settings=cmake.verbose=true --config-settings=logging.level=INFO --config-settings=install.strip=false --editable .
+	python3 -m pip install \
+		--verbose \
+		--config-settings=cmake.verbose=true \
+		--config-settings=logging.level=INFO \
+		--config-settings=install.strip=false \
+		--config-settings=cmake.args="-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_C_FLAGS='-ggdb -O0';-DCMAKE_CXX_FLAGS='-ggdb -O0'" \
+		--editable .

 build.cuda:
 	CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .
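
The reworked `build.debug` target above routes `-ggdb -O0` through the scikit-build-core CMake args and turns off install-time stripping. A quick sanity check that the resulting extension is actually debuggable — a sketch only, assuming a Unix-like system with the `file(1)` utility on PATH and that a `libllama` shared library ends up somewhere inside the installed `llama_cpp` package:

    import pathlib
    import subprocess

    import llama_cpp

    pkg_dir = pathlib.Path(llama_cpp.__file__).parent
    for lib in pkg_dir.rglob("libllama*"):
        info = subprocess.run(["file", str(lib)], capture_output=True, text=True)
        # an unstripped debug build is expected to report "not stripped"
        print(lib.name, "->", info.stdout.strip())
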
From df45a4b3fe46e72664bda87301b318210c6d4782 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 29 May 2024 02:02:22 -0400
Subject: [PATCH 5/7] fix: fix string value kv_overrides.

Closes #1487
---
 llama_cpp/llama.py        | 13 ++++++++-----
 llama_cpp/server/model.py |  4 +++-
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 6dad650..6d872e3 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -6,6 +6,7 @@ import uuid
 import time
 import json
 import ctypes
+import typing
 import fnmatch
 import multiprocessing

@@ -249,13 +250,13 @@ class Llama:
                 self._kv_overrides_array[i].key = k.encode("utf-8")
                 if isinstance(v, bool):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
-                    self._kv_overrides_array[i].value.bool_value = v
+                    self._kv_overrides_array[i].value.val_bool = v
                 elif isinstance(v, int):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT
-                    self._kv_overrides_array[i].value.int_value = v
+                    self._kv_overrides_array[i].value.val_i64 = v
                 elif isinstance(v, float):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
-                    self._kv_overrides_array[i].value.float_value = v
+                    self._kv_overrides_array[i].value.val_f64 = v
                 elif isinstance(v, str):  # type: ignore
                     v_bytes = v.encode("utf-8")
                     if len(v_bytes) > 128:  # TODO: Make this a constant
@@ -263,10 +264,12 @@ class Llama:
                     v_bytes = v_bytes.ljust(128, b"\0")
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
                     # copy min(v_bytes, 128) to str_value
+                    address = typing.cast(int, ctypes.addressof(self._kv_overrides_array[i].value) + llama_cpp.llama_model_kv_override_value.val_str.offset)
+                    buffer_start = ctypes.cast(address, ctypes.POINTER(ctypes.c_char))
                     ctypes.memmove(
-                        self._kv_overrides_array[i].value.str_value,
+                        buffer_start,
                         v_bytes,
-                        min(len(v_bytes), 128),
+                        128,
                     )
                 else:
                     raise ValueError(f"Unknown value type for {k}: {v}")
diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index f002924..4f83716 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -183,7 +183,7 @@ class LlamaProxy:
                 num_pred_tokens=settings.draft_model_num_pred_tokens
             )

-        kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None
+        kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None
         if settings.kv_overrides is not None:
             assert isinstance(settings.kv_overrides, list)
             kv_overrides = {}
@@ -197,6 +197,8 @@ class LlamaProxy:
                         kv_overrides[key] = int(value)
                     elif value_type == "float":
                         kv_overrides[key] = float(value)
+                    elif value_type == "str":
+                        kv_overrides[key] = value
                     else:
                         raise ValueError(f"Unknown value type {value_type}")

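
With the string branch fixed, a string-valued override can round-trip from the high-level API into llama.cpp. A small usage sketch, where the model path and override keys are placeholders for illustration rather than values this patch defines:

    from llama_cpp import Llama

    llm = Llama(
        model_path="./models/model.gguf",  # placeholder path
        kv_overrides={
            "tokenizer.ggml.pre": "llama3",  # str override, the case this patch fixes
            "some.int.key": 42,              # int override
            "some.bool.key": True,           # bool override
        },
    )

Via the server settings, the same override would presumably be written as a `kv_overrides` entry of the form `tokenizer.ggml.pre=str:llama3`, which is what the new `str` branch in `LlamaProxy` handles.
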
From 91d05aba469e9ce4632995b8f41e7c3b3c3518a5 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 29 May 2024 02:28:58 -0400
Subject: [PATCH 6/7] fix: adjust kv_override member names to match llama.cpp

---
 llama_cpp/llama_cpp.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index d9b5087..1668717 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -613,17 +613,17 @@ LLAMA_KV_OVERRIDE_TYPE_STR = 3
 # };
 class llama_model_kv_override_value(ctypes.Union):
     _fields_ = [
-        ("int_value", ctypes.c_int64),
-        ("float_value", ctypes.c_double),
-        ("bool_value", ctypes.c_bool),
-        ("str_value", ctypes.c_char * 128),
+        ("val_i64", ctypes.c_int64),
+        ("val_f64", ctypes.c_double),
+        ("val_bool", ctypes.c_bool),
+        ("val_str", ctypes.c_char * 128),
     ]

     if TYPE_CHECKING:
-        int_value: int
-        float_value: float
-        bool_value: bool
-        str_value: bytes
+        val_i64: int
+        val_f64: float
+        val_bool: bool
+        val_str: bytes


 class llama_model_kv_override(ctypes.Structure):

From 165b4dc6c188f8fda2fc616154e111f710484eba Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 29 May 2024 02:29:44 -0400
Subject: [PATCH 7/7] fix: Fix typo in Llama3VisionAlphaChatHandler. Closes #1488

---
 llama_cpp/llama_chat_format.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index e2d7e27..8f3b1de 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -3098,7 +3098,7 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
         "{% endif %}"
     )

-class Llama3VisionAlpha(Llava15ChatHandler):
+class Llama3VisionAlphaChatHandler(Llava15ChatHandler):
     # question = "<image>" + q

     # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
@@ -3159,6 +3159,10 @@
         "{% endif %}"
     )

+# alias
+Llama3VisionAlpha = Llama3VisionAlphaChatHandler
+
+
 @register_chat_completion_handler("chatml-function-calling")
 def chatml_function_calling(
     llama: llama.Llama,
@@ -3193,7 +3197,6 @@ def chatml_function_calling(
         llama_types.CreateChatCompletionResponse,
         Iterator[llama_types.CreateChatCompletionStreamResponse],
     ]:
-    print(logprobs)
     function_calling_template = (
         "{% for message in messages %}"
         "<|im_start|>{{ message.role }}\n"