baalajimaestro 2024-01-24 16:12:35 +05:30
commit 238d2e0290
Signed by: baalajimaestro
GPG key ID: F93C394FE9BBAFD5
4 changed files with 35 additions and 27 deletions

llama_cpp/llama.py

@@ -194,32 +194,32 @@ class Llama:
         self.model_params.use_mmap = use_mmap if lora_path is None else False
         self.model_params.use_mlock = use_mlock

+        # kv_overrides is the original python dict
         self.kv_overrides = kv_overrides
         if kv_overrides is not None:
-            n_overrides = len(kv_overrides)
-            self._kv_overrides_array = llama_cpp.llama_model_kv_override * (n_overrides + 1)
-            self._kv_overrides_array_keys = []
-
-            for k, v in kv_overrides.items():
-                key_buf = ctypes.create_string_buffer(k.encode("utf-8"))
-                self._kv_overrides_array_keys.append(key_buf)
-                self._kv_overrides_array[i].key = key_buf
-                if isinstance(v, int):
+            # _kv_overrides_array is a ctypes.Array of llama_model_kv_override Structs
+            kvo_array_len = len(kv_overrides) + 1  # for sentinel element
+            self._kv_overrides_array = (
+                llama_cpp.llama_model_kv_override * kvo_array_len
+            )()
+
+            for i, (k, v) in enumerate(kv_overrides.items()):
+                self._kv_overrides_array[i].key = k.encode("utf-8")
+                if isinstance(v, bool):
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
+                    self._kv_overrides_array[i].value.bool_value = v
+                elif isinstance(v, int):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
                     self._kv_overrides_array[i].value.int_value = v
                 elif isinstance(v, float):
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_FLOAT
                     self._kv_overrides_array[i].value.float_value = v
-                elif isinstance(v, bool):
-                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
-                    self._kv_overrides_array[i].value.bool_value = v
                 else:
                     raise ValueError(f"Unknown value type for {k}: {v}")

-            self._kv_overrides_array_sentinel_key = b'\0'
-
-            # null array sentinel
-            self._kv_overrides_array[n_overrides].key = self._kv_overrides_array_sentinel_key
+            self._kv_overrides_array[
+                -1
+            ].key = b"\0"  # ensure sentinel element is zeroed
+
             self.model_params.kv_overrides = self._kv_overrides_array

         self.n_batch = min(n_ctx, n_batch)  # ???
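Note: the fix above relies on llama.cpp's convention that model_params.kv_overrides points to a C array of llama_model_kv_override structs terminated by a zero-filled sentinel element. A minimal, self-contained sketch of that pattern follows; the struct layout, field sizes, tag constants, and override keys here are illustrative stand-ins, not the exact llama_cpp definitions.

import ctypes

class KVOverrideValue(ctypes.Union):
    # Illustrative value union: one slot reused for int/float/bool payloads.
    _fields_ = [
        ("int_value", ctypes.c_int64),
        ("float_value", ctypes.c_double),
        ("bool_value", ctypes.c_bool),
    ]

class KVOverride(ctypes.Structure):
    # Illustrative stand-in for llama_model_kv_override.
    _fields_ = [
        ("key", ctypes.c_char * 128),
        ("tag", ctypes.c_int),
        ("value", KVOverrideValue),
    ]

KV_OVERRIDE_INT, KV_OVERRIDE_FLOAT, KV_OVERRIDE_BOOL = 0, 1, 2  # hypothetical tag values

def build_kv_overrides(overrides: dict) -> ctypes.Array:
    # One extra slot for the zero-filled sentinel the C side scans for.
    arr = (KVOverride * (len(overrides) + 1))()
    for i, (k, v) in enumerate(overrides.items()):
        arr[i].key = k.encode("utf-8")
        # bool must be tested before int, because bool is a subclass of int in Python.
        if isinstance(v, bool):
            arr[i].tag = KV_OVERRIDE_BOOL
            arr[i].value.bool_value = v
        elif isinstance(v, int):
            arr[i].tag = KV_OVERRIDE_INT
            arr[i].value.int_value = v
        elif isinstance(v, float):
            arr[i].tag = KV_OVERRIDE_FLOAT
            arr[i].value.float_value = v
        else:
            raise ValueError(f"Unknown value type for {k}: {v}")
    arr[-1].key = b"\0"  # sentinel: an empty (all-zero) key marks the end of the array
    return arr

kv = build_kv_overrides({"example.bool_key": True, "example.int_key": 3})  # example keys only

Keeping the array reachable from Python (here via the returned reference, in the patch via self._kv_overrides_array) matters: if it were garbage-collected, model_params.kv_overrides would dangle while llama.cpp still reads it.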
@@ -329,7 +329,9 @@ class Llama:
             (n_ctx, self._n_vocab), dtype=np.single
         )

-        self._mirostat_mu = ctypes.c_float(2.0 * 5.0)  # TODO: Move this to sampling context
+        self._mirostat_mu = ctypes.c_float(
+            2.0 * 5.0
+        )  # TODO: Move this to sampling context

         try:
             self.metadata = self._model.metadata()
@@ -537,7 +539,7 @@ class Llama:
                 candidates=self._candidates,
                 tau=mirostat_tau,
                 eta=mirostat_eta,
-                mu=ctypes.pointer(self._mirostat_mu)
+                mu=ctypes.pointer(self._mirostat_mu),
             )
         else:
             self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1)
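For context on the mu pointer above: self._mirostat_mu is kept as a ctypes.c_float on the instance because the mirostat sampler updates it in place through ctypes.pointer() on every call, carrying feedback state from token to token. A rough pure-Python approximation of a Mirostat v2 step, just to illustrate why mu must be a persistent, mutable value (this is not the library's implementation):

import ctypes
import math
import numpy as np

def mirostat_v2_step(probs: np.ndarray, mu: ctypes.c_float, tau: float, eta: float) -> int:
    # Drop tokens whose surprise (-log2 p) exceeds the current mu threshold.
    surprise = -np.log2(probs)
    keep = surprise <= mu.value
    if not keep.any():
        keep[int(np.argmax(probs))] = True  # always keep the most likely token
    p = np.where(keep, probs, 0.0)
    p = p / p.sum()
    token = int(np.random.choice(len(p), p=p))
    # Feedback update: steer the observed surprise toward the target tau.
    mu.value -= eta * (-math.log2(probs[token]) - tau)
    return token

mu = ctypes.c_float(2.0 * 5.0)  # same initial value as self._mirostat_mu, i.e. 2 * tau
probs = np.array([0.5, 0.3, 0.15, 0.05])
next_token = mirostat_v2_step(probs, mu, tau=5.0, eta=0.1)

Because the update happens through the pointer, recreating the c_float on every call would reset the controller; that is why the value lives on the Llama instance (with the TODO noting it should eventually move into a sampling context).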

llama_cpp/llama_chat_format.py

@@ -379,7 +379,8 @@ def hf_autotokenizer_to_chat_completion_handler(


 def hf_tokenizer_config_to_chat_formatter(
-    tokenizer_config: Dict[str, Any]
+    tokenizer_config: Dict[str, Any],
+    add_generation_prompt: bool = True,
 ) -> ChatFormatter:
     assert isinstance(tokenizer_config, dict)
@@ -401,31 +402,34 @@ def hf_tokenizer_config_to_chat_formatter(
         lstrip_blocks=True,
     ).from_string(chat_template)

-    def format_autotokenizer(
+    def format_tokenizer_config(
         messages: List[llama_types.ChatCompletionRequestMessage],
         **kwargs: Any,
     ) -> ChatFormatterResponse:
         # TODO: veryify this is correct
         # Add a blank assistant message to the end of the messages to prompt the model to generate a response
-        prompt = env.render(
-            messages=[
+        if add_generation_prompt:
+            messages = [
                 *messages,
                 llama_types.ChatCompletionRequestAssistantMessage(
                     role="assistant", content=""
                 ),
-            ],
+            ]
+        prompt = env.render(
+            messages=messages,
             bos_token=bos_token,
             eos_token=eos_token,
         )
-        return ChatFormatterResponse(prompt=prompt, stop=eos_token)
+        return ChatFormatterResponse(prompt=prompt, stop=[eos_token, bos_token])

-    return format_autotokenizer
+    return format_tokenizer_config


 def hf_tokenizer_config_to_chat_completion_handler(
     tokenizer_config: Dict[str, Any],
+    add_generation_prompt: bool = True,
 ) -> LlamaChatCompletionHandler:
-    chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config)
+    chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config, add_generation_prompt=add_generation_prompt)
     return chat_formatter_to_chat_completion_handler(chat_formatter)
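For reference, the behaviour the new add_generation_prompt flag controls: when enabled, an empty assistant turn is appended before rendering, so the prompt ends exactly where the model should start writing. A minimal sketch with a made-up Jinja2 template; real templates and special tokens come from the model's tokenizer_config.json, not from the literals below.

import jinja2

# Illustrative template only; hf_tokenizer_config_to_chat_formatter reads the real
# one from tokenizer_config["chat_template"].
CHAT_TEMPLATE = (
    "{{ bos_token }}"
    "{% for message in messages %}"
    "<|{{ message['role'] }}|>\n{{ message['content'] }}{{ eos_token }}\n"
    "{% endfor %}"
)

def render_prompt(messages, add_generation_prompt=True, bos_token="<s>", eos_token="</s>"):
    template = jinja2.Environment(
        trim_blocks=True, lstrip_blocks=True
    ).from_string(CHAT_TEMPLATE)
    if add_generation_prompt:
        # Empty assistant message: the rendered text ends with the assistant header,
        # cueing the model to produce the assistant's content next.
        messages = [*messages, {"role": "assistant", "content": ""}]
    return template.render(
        messages=messages, bos_token=bos_token, eos_token=eos_token
    )

print(render_prompt([{"role": "user", "content": "Hello!"}]))

The other change above, returning both eos_token and bos_token as stop strings, presumably makes generation halt if the model starts emitting either special token as literal text.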

llama_cpp/llama_cpp.py

@@ -188,6 +188,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
 # LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -211,6 +212,7 @@ LLAMA_FTYPE_MOSTLY_Q6_K = 18
 LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19
 LLAMA_FTYPE_MOSTLY_IQ2_XS = 20
 LLAMA_FTYPE_MOSTLY_Q2_K_S = 21
+LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22
 LLAMA_FTYPE_GUESSED = 1024

 # enum llama_rope_scaling_type {

vendor/llama.cpp vendored

@@ -1 +1 @@
-Subproject commit 504dc37be8446fb09b1ede70300250ad41be32a2
+Subproject commit 26d607608d794efa56df3bdb6043a2f94c1d632c