From 5b982d0f8c6f35242c8862ffdce00e17cea0b44f Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 22 Jan 2024 08:32:48 -0500
Subject: [PATCH 1/6] fix: use both eos and bos tokens as stop sequences for hf-tokenizer-config chat format.

---
 llama_cpp/llama_chat_format.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 02bdbcf..6c274aa 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -379,7 +379,8 @@ def hf_autotokenizer_to_chat_completion_handler(
 
 
 def hf_tokenizer_config_to_chat_formatter(
-    tokenizer_config: Dict[str, Any]
+    tokenizer_config: Dict[str, Any],
+    add_generation_prompt: bool = True,
 ) -> ChatFormatter:
     assert isinstance(tokenizer_config, dict)
 
@@ -401,31 +402,34 @@ def hf_tokenizer_config_to_chat_formatter(
         lstrip_blocks=True,
     ).from_string(chat_template)
 
-    def format_autotokenizer(
+    def format_tokenizer_config(
         messages: List[llama_types.ChatCompletionRequestMessage],
         **kwargs: Any,
     ) -> ChatFormatterResponse:
         # TODO: verify this is correct
         # Add a blank assistant message to the end of the messages to prompt the model to generate a response
-        prompt = env.render(
-            messages=[
+        if add_generation_prompt:
+            messages = [
                 *messages,
                 llama_types.ChatCompletionRequestAssistantMessage(
                     role="assistant", content=""
                 ),
-            ],
+            ]
+        prompt = env.render(
+            messages=messages,
             bos_token=bos_token,
             eos_token=eos_token,
         )
-        return ChatFormatterResponse(prompt=prompt, stop=eos_token)
+        return ChatFormatterResponse(prompt=prompt, stop=[eos_token, bos_token])
 
-    return format_autotokenizer
+    return format_tokenizer_config
 
 
 def hf_tokenizer_config_to_chat_completion_handler(
     tokenizer_config: Dict[str, Any],
+    add_generation_prompt: bool = True,
 ) -> LlamaChatCompletionHandler:
-    chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config)
+    chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config, add_generation_prompt=add_generation_prompt)
     return chat_formatter_to_chat_completion_handler(chat_formatter)

From fcdf337d84591c46c0a26b8660b2e537af1fd353 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 22 Jan 2024 11:25:11 -0500
Subject: [PATCH 2/6] Update llama.cpp

---
 llama_cpp/llama_cpp.py | 2 ++
 vendor/llama.cpp       | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index ef16272..5de837f 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -188,6 +188,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
 # LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -211,6 +212,7 @@ LLAMA_FTYPE_MOSTLY_Q6_K = 18
 LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19
 LLAMA_FTYPE_MOSTLY_IQ2_XS = 20
 LLAMA_FTYPE_MOSTLY_Q2_K_S = 21
+LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22
 LLAMA_FTYPE_GUESSED = 1024
 
 # enum llama_rope_scaling_type {
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 504dc37..6f9939d 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 504dc37be8446fb09b1ede70300250ad41be32a2
+Subproject commit 6f9939d119b2d004c264952eb510bd106455531e

From 7e63928bc9eb87170cbf3d856a318e0794f2505f Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 23 Jan 2024 18:42:39 -0500
Subject: [PATCH 3/6] Update llama.cpp

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 6f9939d..26d6076 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 6f9939d119b2d004c264952eb510bd106455531e
+Subproject commit 26d607608d794efa56df3bdb6043a2f94c1d632c

From fe5d6ea6484a06021a2531c6f66c9873c1ee753d Mon Sep 17 00:00:00 2001
From: Phil H <5756783+phiharri@users.noreply.github.com>
Date: Wed, 24 Jan 2024 03:00:38 +0000
Subject: [PATCH 4/6] fix: GGUF metadata KV overrides, re #1011 (#1116)

* kv overrides another attempt

* add sentinel element, simplify array population

* ensure sentinel element is zeroed
---
 llama_cpp/llama.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 5c66bcf..538781d 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -194,16 +194,16 @@ class Llama:
         self.model_params.use_mmap = use_mmap if lora_path is None else False
         self.model_params.use_mlock = use_mlock
 
+        # kv_overrides is the original python dict
         self.kv_overrides = kv_overrides
         if kv_overrides is not None:
-            n_overrides = len(kv_overrides)
-            self._kv_overrides_array = llama_cpp.llama_model_kv_override * (n_overrides + 1)
-            self._kv_overrides_array_keys = []
-
-            for k, v in kv_overrides.items():
-                key_buf = ctypes.create_string_buffer(k.encode("utf-8"))
-                self._kv_overrides_array_keys.append(key_buf)
-                self._kv_overrides_array[i].key = key_buf
+            # _kv_overrides_array is a ctypes.Array of llama_model_kv_override Structs
+            kvo_array_len = len(kv_overrides) + 1 # for sentinel element
+            self._kv_overrides_array = (llama_cpp.llama_model_kv_override * kvo_array_len)()
+
+            for i, (k, v) in enumerate(kv_overrides.items()):
+                self._kv_overrides_array[i].key = k.encode('utf-8');
                 if isinstance(v, int):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
                     self._kv_overrides_array[i].value.int_value = v
@@ -216,10 +216,7 @@ class Llama:
                 else:
                     raise ValueError(f"Unknown value type for {k}: {v}")
 
-            self._kv_overrides_array_sentinel_key = b'\0'
-
-            # null array sentinel
-            self._kv_overrides_array[n_overrides].key = self._kv_overrides_array_sentinel_key
+            self._kv_overrides_array[-1].key = b'\0' # ensure sentinel element is zeroed
             self.model_params.kv_overrides = self._kv_overrides_array
 
         self.n_batch = min(n_ctx, n_batch) # ???
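
Note on PATCH 4/6: the fix builds a sentinel-terminated ctypes array of override structs, a pattern worth seeing in isolation. Below is a minimal, self-contained sketch of the same technique; KVOverride and KVOverrideValue are simplified, hypothetical stand-ins for llama_cpp.llama_model_kv_override (the real struct also carries a tag enum), and the override key/value are arbitrary example data.

import ctypes


class KVOverrideValue(ctypes.Union):
    # Simplified value union, analogous to the int/float/bool union in llama.cpp
    _fields_ = [
        ("int_value", ctypes.c_int64),
        ("float_value", ctypes.c_double),
        ("bool_value", ctypes.c_bool),
    ]


class KVOverride(ctypes.Structure):
    # Hypothetical stand-in for llama_model_kv_override (tag enum omitted)
    _fields_ = [
        ("key", ctypes.c_char * 128),
        ("value", KVOverrideValue),
    ]


kv_overrides = {"example.override.key": 8}

# Allocate one extra slot: ctypes zero-initializes new arrays, so the last
# element's key is already empty and serves as the null sentinel the C side
# scans for when walking the array.
kvo_array_len = len(kv_overrides) + 1
arr = (KVOverride * kvo_array_len)()

for i, (k, v) in enumerate(kv_overrides.items()):
    arr[i].key = k.encode("utf-8")
    arr[i].value.int_value = v

arr[-1].key = b"\0"  # make the sentinel explicit, as the patch does

assert arr[kvo_array_len - 1].key == b""  # zeroed key terminates the array

Because ctypes zero-initializes newly created arrays, the extra element is already a valid terminator; the explicit b'\0' assignment in the patch simply makes the sentinel unmistakable.
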
From 4d6b2f7b91a8dfd4b4e283ea73492772b3471afe Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 23 Jan 2024 22:08:27 -0500
Subject: [PATCH 5/6] fix: format

---
 llama_cpp/llama.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 538781d..3d15800 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -197,13 +197,14 @@ class Llama:
         # kv_overrides is the original python dict
         self.kv_overrides = kv_overrides
         if kv_overrides is not None:
-            # _kv_overrides_array is a ctypes.Array of llama_model_kv_override Structs
-            kvo_array_len = len(kv_overrides) + 1 # for sentinel element
-            self._kv_overrides_array = (llama_cpp.llama_model_kv_override * kvo_array_len)()
+            kvo_array_len = len(kv_overrides) + 1  # for sentinel element
+            self._kv_overrides_array = (
+                llama_cpp.llama_model_kv_override * kvo_array_len
+            )()
 
             for i, (k, v) in enumerate(kv_overrides.items()):
-                self._kv_overrides_array[i].key = k.encode('utf-8');
+                self._kv_overrides_array[i].key = k.encode("utf-8")
                 if isinstance(v, int):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
                     self._kv_overrides_array[i].value.int_value = v
@@ -216,7 +217,9 @@ class Llama:
             else:
                 raise ValueError(f"Unknown value type for {k}: {v}")
 
-            self._kv_overrides_array[-1].key = b'\0' # ensure sentinel element is zeroed
+            self._kv_overrides_array[
+                -1
+            ].key = b"\0"  # ensure sentinel element is zeroed
             self.model_params.kv_overrides = self._kv_overrides_array
 
         self.n_batch = min(n_ctx, n_batch) # ???
@@ -326,7 +329,9 @@ class Llama:
             (n_ctx, self._n_vocab), dtype=np.single
         )
 
-        self._mirostat_mu = ctypes.c_float(2.0 * 5.0) # TODO: Move this to sampling context
+        self._mirostat_mu = ctypes.c_float(
+            2.0 * 5.0
+        )  # TODO: Move this to sampling context
 
         try:
             self.metadata = self._model.metadata()
@@ -334,7 +339,7 @@ class Llama:
             self.metadata = {}
             if self.verbose:
                 print(f"Failed to load metadata: {e}", file=sys.stderr)
-        
+
         if self.verbose:
             print(f"Model metadata: {self.metadata}", file=sys.stderr)
 
@@ -534,7 +539,7 @@ class Llama:
                 candidates=self._candidates,
                 tau=mirostat_tau,
                 eta=mirostat_eta,
-                mu=ctypes.pointer(self._mirostat_mu)
+                mu=ctypes.pointer(self._mirostat_mu),
             )
         else:
             self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1)

From 9677a1f2c895979c8a8fc2b68002fca4bac8c5fb Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 23 Jan 2024 22:28:03 -0500
Subject: [PATCH 6/6] fix: Check order

---
 llama_cpp/llama.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 3d15800..74739cb 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -205,15 +205,15 @@ class Llama:
 
             for i, (k, v) in enumerate(kv_overrides.items()):
                 self._kv_overrides_array[i].key = k.encode("utf-8")
-                if isinstance(v, int):
+                if isinstance(v, bool):
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
+                    self._kv_overrides_array[i].value.bool_value = v
+                elif isinstance(v, int):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
                     self._kv_overrides_array[i].value.int_value = v
                 elif isinstance(v, float):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_FLOAT
                     self._kv_overrides_array[i].value.float_value = v
-                elif isinstance(v, bool):
-                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
-                    self._kv_overrides_array[i].value.bool_value = v
                 else:
                     raise ValueError(f"Unknown value type for {k}: {v}")
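
Note on PATCH 6/6: the reorder matters because bool is a subclass of int in Python, so isinstance(True, int) evaluates to True and an int-first isinstance chain silently tags boolean overrides as integers. A minimal sketch of the pitfall follows; classify is a hypothetical helper that mirrors the patched dispatch and returns the tag name it would assign.

def classify(v):
    # bool must be tested before int, exactly as the patch orders the checks
    if isinstance(v, bool):
        return "LLAMA_KV_OVERRIDE_BOOL"
    elif isinstance(v, int):
        return "LLAMA_KV_OVERRIDE_INT"
    elif isinstance(v, float):
        return "LLAMA_KV_OVERRIDE_FLOAT"
    raise ValueError(f"Unknown value type: {v!r}")


assert isinstance(True, int)  # the pitfall: True is also an int
assert classify(True) == "LLAMA_KV_OVERRIDE_BOOL"
assert classify(7) == "LLAMA_KV_OVERRIDE_INT"
assert classify(0.5) == "LLAMA_KV_OVERRIDE_FLOAT"

With the pre-patch int-first ordering, classify(True) would have returned "LLAMA_KV_OVERRIDE_INT", which is precisely the bug the final commit fixes.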