diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 5c66bcf..74739cb 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -194,32 +194,32 @@ class Llama:
         self.model_params.use_mmap = use_mmap if lora_path is None else False
         self.model_params.use_mlock = use_mlock
 
+        # kv_overrides is the original python dict
         self.kv_overrides = kv_overrides
         if kv_overrides is not None:
-            n_overrides = len(kv_overrides)
-            self._kv_overrides_array = llama_cpp.llama_model_kv_override * (n_overrides + 1)
-            self._kv_overrides_array_keys = []
+            # _kv_overrides_array is a ctypes.Array of llama_model_kv_override Structs
+            kvo_array_len = len(kv_overrides) + 1  # for sentinel element
+            self._kv_overrides_array = (
+                llama_cpp.llama_model_kv_override * kvo_array_len
+            )()
 
-            for k, v in kv_overrides.items():
-                key_buf = ctypes.create_string_buffer(k.encode("utf-8"))
-                self._kv_overrides_array_keys.append(key_buf)
-                self._kv_overrides_array[i].key = key_buf
-                if isinstance(v, int):
+            for i, (k, v) in enumerate(kv_overrides.items()):
+                self._kv_overrides_array[i].key = k.encode("utf-8")
+                if isinstance(v, bool):
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
+                    self._kv_overrides_array[i].value.bool_value = v
+                elif isinstance(v, int):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
                     self._kv_overrides_array[i].value.int_value = v
                 elif isinstance(v, float):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_FLOAT
                     self._kv_overrides_array[i].value.float_value = v
-                elif isinstance(v, bool):
-                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
-                    self._kv_overrides_array[i].value.bool_value = v
                 else:
                     raise ValueError(f"Unknown value type for {k}: {v}")
 
-            self._kv_overrides_array_sentinel_key = b'\0'
-
-            # null array sentinel
-            self._kv_overrides_array[n_overrides].key = self._kv_overrides_array_sentinel_key
+            self._kv_overrides_array[
+                -1
+            ].key = b"\0"  # ensure sentinel element is zeroed
             self.model_params.kv_overrides = self._kv_overrides_array
 
         self.n_batch = min(n_ctx, n_batch)  # ???
@@ -329,7 +329,9 @@ class Llama:
             (n_ctx, self._n_vocab), dtype=np.single
         )
 
-        self._mirostat_mu = ctypes.c_float(2.0 * 5.0)  # TODO: Move this to sampling context
+        self._mirostat_mu = ctypes.c_float(
+            2.0 * 5.0
+        )  # TODO: Move this to sampling context
 
         try:
             self.metadata = self._model.metadata()
@@ -337,7 +339,7 @@ class Llama:
             self.metadata = {}
             if self.verbose:
                 print(f"Failed to load metadata: {e}", file=sys.stderr)
-        
+
         if self.verbose:
             print(f"Model metadata: {self.metadata}", file=sys.stderr)
 
@@ -537,7 +539,7 @@ class Llama:
                 candidates=self._candidates,
                 tau=mirostat_tau,
                 eta=mirostat_eta,
-                mu=ctypes.pointer(self._mirostat_mu)
+                mu=ctypes.pointer(self._mirostat_mu),
             )
         else:
             self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1)
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 02bdbcf..6c274aa 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -379,7 +379,8 @@ def hf_autotokenizer_to_chat_completion_handler(
 
 
 def hf_tokenizer_config_to_chat_formatter(
-    tokenizer_config: Dict[str, Any]
+    tokenizer_config: Dict[str, Any],
+    add_generation_prompt: bool = True,
 ) -> ChatFormatter:
     assert isinstance(tokenizer_config, dict)
 
@@ -401,31 +402,34 @@ def hf_tokenizer_config_to_chat_formatter(
         lstrip_blocks=True,
     ).from_string(chat_template)
 
-    def format_autotokenizer(
+    def format_tokenizer_config(
         messages: List[llama_types.ChatCompletionRequestMessage],
         **kwargs: Any,
     ) -> ChatFormatterResponse:
         # TODO: veryify this is correct
         # Add a blank assistant message to the end of the messages to prompt the model to generate a response
-        prompt = env.render(
-            messages=[
+        if add_generation_prompt:
+            messages = [
                 *messages,
                 llama_types.ChatCompletionRequestAssistantMessage(
                     role="assistant", content=""
                 ),
-            ],
+            ]
+        prompt = env.render(
+            messages=messages,
             bos_token=bos_token,
             eos_token=eos_token,
         )
-        return ChatFormatterResponse(prompt=prompt, stop=eos_token)
+        return ChatFormatterResponse(prompt=prompt, stop=[eos_token, bos_token])
 
-    return format_autotokenizer
+    return format_tokenizer_config
 
 
 def hf_tokenizer_config_to_chat_completion_handler(
     tokenizer_config: Dict[str, Any],
+    add_generation_prompt: bool = True,
 ) -> LlamaChatCompletionHandler:
-    chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config)
+    chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config, add_generation_prompt=add_generation_prompt)
     return chat_formatter_to_chat_completion_handler(chat_formatter)
 
 
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index ef16272..5de837f 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -188,6 +188,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
 # LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
 
@@ -211,6 +212,7 @@ LLAMA_FTYPE_MOSTLY_Q6_K = 18
 LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19
 LLAMA_FTYPE_MOSTLY_IQ2_XS = 20
 LLAMA_FTYPE_MOSTLY_Q2_K_S = 21
+LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22
 LLAMA_FTYPE_GUESSED = 1024
 
 # enum llama_rope_scaling_type {
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 504dc37..26d6076 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 504dc37be8446fb09b1ede70300250ad41be32a2
+Subproject commit 26d607608d794efa56df3bdb6043a2f94c1d632c
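
Not part of the diff: a minimal usage sketch of the kv_overrides path touched above, with a placeholder model path and placeholder metadata keys. Because bool is a subclass of int in Python, the isinstance(v, bool) branch has to run before the int branch; otherwise boolean overrides would be tagged as integer overrides.

    from llama_cpp import Llama

    # Placeholder model path and override keys -- illustrative only.
    llm = Llama(
        model_path="./models/example.gguf",
        kv_overrides={
            "example.bool_key": True,   # bool branch -> LLAMA_KV_OVERRIDE_BOOL
            "example.int_key": 42,      # int branch -> LLAMA_KV_OVERRIDE_INT
            "example.float_key": 0.5,   # float branch -> LLAMA_KV_OVERRIDE_FLOAT
        },
    )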