From 454c9bb1cb3a58ecc37b58abeff6f245e3b8f316 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 27 May 2024 10:51:57 -0400
Subject: [PATCH 1/7] feat: Update llama.cpp

---
 llama_cpp/llama_cpp.py | 21 +++++++++++++++++----
 vendor/llama.cpp       |  2 +-
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 4284019..d9b5087 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -300,6 +300,7 @@ LLAMA_VOCAB_TYPE_WPM = 3
 # LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
 # LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
 # LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
+# LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -315,6 +316,7 @@ LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10
 LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11
 LLAMA_VOCAB_PRE_TYPE_OLMO = 12
 LLAMA_VOCAB_PRE_TYPE_DBRX = 13
+LLAMA_VOCAB_PRE_TYPE_SMAUG = 14


 # // note: these values should be synchronized with ggml_rope
@@ -718,6 +720,8 @@ class llama_model_params(ctypes.Structure):
     ]


+# // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
+# // https://github.com/ggerganov/llama.cpp/pull/7544
 # struct llama_context_params {
 #     uint32_t seed;  // RNG seed, -1 for random
 #     uint32_t n_ctx; // text context, 0 = from model
@@ -744,15 +748,14 @@ class llama_model_params(ctypes.Structure):
 #     ggml_backend_sched_eval_callback cb_eval;
 #     void * cb_eval_user_data;

-#     enum ggml_type type_k; // data type for K cache
-#     enum ggml_type type_v; // data type for V cache
+#     enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
+#     enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 #     bool embeddings;  // if true, extract embeddings (together with logits)
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-#     bool flash_attn;  // whether to use flash attention
-
+#     bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]

 #     // Abort callback
 #     // if it returns true, execution of llama_decode() will be aborted
@@ -2454,6 +2457,16 @@ def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /)
     ...


+# // Identify if Token Id is a control token or a render-able token
+# LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
+@ctypes_function(
+    "llama_token_is_control", [llama_model_p_ctypes, llama_token], ctypes.c_bool
+)
+def llama_token_is_control(model: llama_model_p, token: Union[llama_token, int], /) -> bool:
+    """Identify if Token Id is a control token or a render-able token"""
+    ...
+
+
 # // Special tokens


diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 0df0aa8..5487593 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 0df0aa8e43c3378975269a51f9b876c8692e70da
+Subproject commit 5487593bc7ee0b65b9d2e2985b4b61dc77043101
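
The `llama_token_is_control` binding added above mirrors the upstream llama.cpp declaration one-to-one. A minimal usage sketch against the low-level ctypes API follows; the GGUF path is a placeholder, and loading with `vocab_only` is just a way to keep the example light:

    import llama_cpp

    llama_cpp.llama_backend_init()
    params = llama_cpp.llama_model_default_params()
    params.vocab_only = True  # only the vocabulary is needed to classify tokens
    model = llama_cpp.llama_load_model_from_file(b"./models/model.gguf", params)

    n_vocab = llama_cpp.llama_n_vocab(model)
    control_tokens = [
        t for t in range(n_vocab) if llama_cpp.llama_token_is_control(model, t)
    ]
    print(f"{len(control_tokens)} control tokens out of {n_vocab}")

    llama_cpp.llama_free_model(model)
    llama_cpp.llama_backend_free()
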
From c564007ff6c93d3408031db9562de13d50112707 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 27 May 2024 10:57:17 -0400
Subject: [PATCH 2/7] chore(deps): bump pypa/cibuildwheel from 2.18.0 to 2.18.1 (#1472)

updated-dependencies:
- dependency-name: pypa/cibuildwheel
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Andrei
---
 .github/workflows/build-and-release.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index b50da48..957912d 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -29,7 +29,7 @@ jobs:
           python -m pip install -e .[all]

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.18.0
+        uses: pypa/cibuildwheel@v2.18.1
         env:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -56,7 +56,7 @@ jobs:
           platforms: linux/arm64

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.18.0
+        uses: pypa/cibuildwheel@v2.18.1
         env:
           CIBW_SKIP: "*musllinux* pp*"
           CIBW_REPAIR_WHEEL_COMMAND: ""

From c26004b1be20c485ea547a87b3871551f6a8774c Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 28 May 2024 22:52:03 -0400
Subject: [PATCH 3/7] feat: Update llama.cpp

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 5487593..504f0c3 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 5487593bc7ee0b65b9d2e2985b4b61dc77043101
+Subproject commit 504f0c340f6b5e04de682f6ddefdd3b81208df5d

From 2907c26906272a333ab913054f7ae7cf2bbd94ed Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 28 May 2024 22:52:28 -0400
Subject: [PATCH 4/7] misc: Update debug build to keep all debug symbols for easier gdb debugging

---
 Makefile | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 83085d2..3796d17 100644
--- a/Makefile
+++ b/Makefile
@@ -13,7 +13,13 @@ build:
 	python3 -m pip install --verbose -e .

 build.debug:
-	CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" python3 -m pip install --verbose --config-settings=cmake.verbose=true --config-settings=logging.level=INFO --config-settings=install.strip=false --editable .
+	python3 -m pip install \
+		--verbose \
+		--config-settings=cmake.verbose=true \
+		--config-settings=logging.level=INFO \
+		--config-settings=install.strip=false \
+		--config-settings=cmake.args="-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_C_FLAGS='-ggdb -O0';-DCMAKE_CXX_FLAGS='-ggdb -O0'" \
+		--editable .

 build.cuda:
 	CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .
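
The reworked `build.debug` target above routes `-ggdb -O0` through the scikit-build-core CMake args and turns off install-time stripping. A quick sanity check that the resulting extension is actually debuggable — a sketch only, assuming a Unix-like system with the `file(1)` utility on PATH and that a `libllama` shared library ends up somewhere inside the installed `llama_cpp` package:

    import pathlib
    import subprocess

    import llama_cpp

    pkg_dir = pathlib.Path(llama_cpp.__file__).parent
    for lib in pkg_dir.rglob("libllama*"):
        info = subprocess.run(["file", str(lib)], capture_output=True, text=True)
        # an unstripped debug build is expected to report "not stripped"
        print(lib.name, "->", info.stdout.strip())
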
From df45a4b3fe46e72664bda87301b318210c6d4782 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 29 May 2024 02:02:22 -0400
Subject: [PATCH 5/7] fix: fix string value kv_overrides.

Closes #1487
---
 llama_cpp/llama.py        | 13 ++++++++-----
 llama_cpp/server/model.py |  4 +++-
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 6dad650..6d872e3 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -6,6 +6,7 @@ import uuid
 import time
 import json
 import ctypes
+import typing
 import fnmatch
 import multiprocessing

@@ -249,13 +250,13 @@ class Llama:
                 self._kv_overrides_array[i].key = k.encode("utf-8")
                 if isinstance(v, bool):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
-                    self._kv_overrides_array[i].value.bool_value = v
+                    self._kv_overrides_array[i].value.val_bool = v
                 elif isinstance(v, int):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT
-                    self._kv_overrides_array[i].value.int_value = v
+                    self._kv_overrides_array[i].value.val_i64 = v
                 elif isinstance(v, float):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
-                    self._kv_overrides_array[i].value.float_value = v
+                    self._kv_overrides_array[i].value.val_f64 = v
                 elif isinstance(v, str):  # type: ignore
                     v_bytes = v.encode("utf-8")
                     if len(v_bytes) > 128:  # TODO: Make this a constant
@@ -263,10 +264,12 @@ class Llama:
                     v_bytes = v_bytes.ljust(128, b"\0")
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
                     # copy min(v_bytes, 128) to str_value
+                    address = typing.cast(int, ctypes.addressof(self._kv_overrides_array[i].value) + llama_cpp.llama_model_kv_override_value.val_str.offset)
+                    buffer_start = ctypes.cast(address, ctypes.POINTER(ctypes.c_char))
                     ctypes.memmove(
-                        self._kv_overrides_array[i].value.str_value,
+                        buffer_start,
                         v_bytes,
-                        min(len(v_bytes), 128),
+                        128,
                     )
                 else:
                     raise ValueError(f"Unknown value type for {k}: {v}")
diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index f002924..4f83716 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -183,7 +183,7 @@ class LlamaProxy:
                 num_pred_tokens=settings.draft_model_num_pred_tokens
             )

-        kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None
+        kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None
         if settings.kv_overrides is not None:
             assert isinstance(settings.kv_overrides, list)
             kv_overrides = {}
@@ -197,6 +197,8 @@ class LlamaProxy:
                         kv_overrides[key] = int(value)
                     elif value_type == "float":
                         kv_overrides[key] = float(value)
+                    elif value_type == "str":
+                        kv_overrides[key] = value
                     else:
                         raise ValueError(f"Unknown value type {value_type}")

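
With the string branch fixed, a string-valued override can round-trip from the high-level API into llama.cpp. A small usage sketch, where the model path and override keys are placeholders for illustration rather than values this patch defines:

    from llama_cpp import Llama

    llm = Llama(
        model_path="./models/model.gguf",  # placeholder path
        kv_overrides={
            "tokenizer.ggml.pre": "llama3",  # str override, the case this patch fixes
            "some.int.key": 42,              # int override
            "some.bool.key": True,           # bool override
        },
    )

Via the server settings, the same override would presumably be written as a `kv_overrides` entry of the form `tokenizer.ggml.pre=str:llama3`, which is what the new `str` branch in `LlamaProxy` handles.
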
From 91d05aba469e9ce4632995b8f41e7c3b3c3518a5 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 29 May 2024 02:28:58 -0400
Subject: [PATCH 6/7] fix: adjust kv_override member names to match llama.cpp

---
 llama_cpp/llama_cpp.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index d9b5087..1668717 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -613,17 +613,17 @@ LLAMA_KV_OVERRIDE_TYPE_STR = 3
 # };
 class llama_model_kv_override_value(ctypes.Union):
     _fields_ = [
-        ("int_value", ctypes.c_int64),
-        ("float_value", ctypes.c_double),
-        ("bool_value", ctypes.c_bool),
-        ("str_value", ctypes.c_char * 128),
+        ("val_i64", ctypes.c_int64),
+        ("val_f64", ctypes.c_double),
+        ("val_bool", ctypes.c_bool),
+        ("val_str", ctypes.c_char * 128),
     ]

     if TYPE_CHECKING:
-        int_value: int
-        float_value: float
-        bool_value: bool
-        str_value: bytes
+        val_i64: int
+        val_f64: float
+        val_bool: bool
+        val_str: bytes


 class llama_model_kv_override(ctypes.Structure):

From 165b4dc6c188f8fda2fc616154e111f710484eba Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 29 May 2024 02:29:44 -0400
Subject: [PATCH 7/7] fix: Fix typo in Llama3VisionAlphaChatHandler. Closes #1488

---
 llama_cpp/llama_chat_format.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index e2d7e27..8f3b1de 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -3098,7 +3098,7 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
         "{% endif %}"
     )

-class Llama3VisionAlpha(Llava15ChatHandler):
+class Llama3VisionAlphaChatHandler(Llava15ChatHandler):
     # question = "<image>" + q

     # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
@@ -3159,6 +3159,10 @@
         "{% endif %}"
     )

+# alias
+Llama3VisionAlpha = Llama3VisionAlphaChatHandler
+
+
 @register_chat_completion_handler("chatml-function-calling")
 def chatml_function_calling(
     llama: llama.Llama,
@@ -3193,7 +3197,6 @@ def chatml_function_calling(
         llama_types.CreateChatCompletionResponse,
         Iterator[llama_types.CreateChatCompletionStreamResponse],
     ]:
-    print(logprobs)
     function_calling_template = (
         "{% for message in messages %}"
         "<|im_start|>{{ message.role }}\n"