This commit is contained in:
baalajimaestro 2024-01-24 16:12:35 +05:30
commit 238d2e0290
Signed by: baalajimaestro
GPG key ID: F93C394FE9BBAFD5
4 changed files with 35 additions and 27 deletions

View file

@ -194,32 +194,32 @@ class Llama:
self.model_params.use_mmap = use_mmap if lora_path is None else False
self.model_params.use_mlock = use_mlock
# kv_overrides is the original python dict
self.kv_overrides = kv_overrides
if kv_overrides is not None:
n_overrides = len(kv_overrides)
self._kv_overrides_array = llama_cpp.llama_model_kv_override * (n_overrides + 1)
self._kv_overrides_array_keys = []
# _kv_overrides_array is a ctypes.Array of llama_model_kv_override Structs
kvo_array_len = len(kv_overrides) + 1 # for sentinel element
self._kv_overrides_array = (
llama_cpp.llama_model_kv_override * kvo_array_len
)()
for k, v in kv_overrides.items():
key_buf = ctypes.create_string_buffer(k.encode("utf-8"))
self._kv_overrides_array_keys.append(key_buf)
self._kv_overrides_array[i].key = key_buf
if isinstance(v, int):
for i, (k, v) in enumerate(kv_overrides.items()):
self._kv_overrides_array[i].key = k.encode("utf-8")
if isinstance(v, bool):
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
self._kv_overrides_array[i].value.bool_value = v
elif isinstance(v, int):
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
self._kv_overrides_array[i].value.int_value = v
elif isinstance(v, float):
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_FLOAT
self._kv_overrides_array[i].value.float_value = v
elif isinstance(v, bool):
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
self._kv_overrides_array[i].value.bool_value = v
else:
raise ValueError(f"Unknown value type for {k}: {v}")
self._kv_overrides_array_sentinel_key = b'\0'
# null array sentinel
self._kv_overrides_array[n_overrides].key = self._kv_overrides_array_sentinel_key
self._kv_overrides_array[
-1
].key = b"\0" # ensure sentinel element is zeroed
self.model_params.kv_overrides = self._kv_overrides_array
self.n_batch = min(n_ctx, n_batch) # ???
@ -329,7 +329,9 @@ class Llama:
(n_ctx, self._n_vocab), dtype=np.single
)
self._mirostat_mu = ctypes.c_float(2.0 * 5.0) # TODO: Move this to sampling context
self._mirostat_mu = ctypes.c_float(
2.0 * 5.0
) # TODO: Move this to sampling context
try:
self.metadata = self._model.metadata()
@ -337,7 +339,7 @@ class Llama:
self.metadata = {}
if self.verbose:
print(f"Failed to load metadata: {e}", file=sys.stderr)
if self.verbose:
print(f"Model metadata: {self.metadata}", file=sys.stderr)
@ -537,7 +539,7 @@ class Llama:
candidates=self._candidates,
tau=mirostat_tau,
eta=mirostat_eta,
mu=ctypes.pointer(self._mirostat_mu)
mu=ctypes.pointer(self._mirostat_mu),
)
else:
self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1)

View file

@ -379,7 +379,8 @@ def hf_autotokenizer_to_chat_completion_handler(
def hf_tokenizer_config_to_chat_formatter(
tokenizer_config: Dict[str, Any]
tokenizer_config: Dict[str, Any],
add_generation_prompt: bool = True,
) -> ChatFormatter:
assert isinstance(tokenizer_config, dict)
@ -401,31 +402,34 @@ def hf_tokenizer_config_to_chat_formatter(
lstrip_blocks=True,
).from_string(chat_template)
def format_autotokenizer(
def format_tokenizer_config(
messages: List[llama_types.ChatCompletionRequestMessage],
**kwargs: Any,
) -> ChatFormatterResponse:
# TODO: verify this is correct
# Add a blank assistant message to the end of the messages to prompt the model to generate a response
prompt = env.render(
messages=[
if add_generation_prompt:
messages = [
*messages,
llama_types.ChatCompletionRequestAssistantMessage(
role="assistant", content=""
),
],
]
prompt = env.render(
messages=messages,
bos_token=bos_token,
eos_token=eos_token,
)
return ChatFormatterResponse(prompt=prompt, stop=eos_token)
return ChatFormatterResponse(prompt=prompt, stop=[eos_token, bos_token])
return format_autotokenizer
return format_tokenizer_config
def hf_tokenizer_config_to_chat_completion_handler(
tokenizer_config: Dict[str, Any],
add_generation_prompt: bool = True,
) -> LlamaChatCompletionHandler:
chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config)
chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config, add_generation_prompt=add_generation_prompt)
return chat_formatter_to_chat_completion_handler(chat_formatter)

View file

@ -188,6 +188,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
# LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
# LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
# };
@ -211,6 +212,7 @@ LLAMA_FTYPE_MOSTLY_Q6_K = 18
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22
LLAMA_FTYPE_GUESSED = 1024
# enum llama_rope_scaling_type {

2
vendor/llama.cpp vendored

@ -1 +1 @@
Subproject commit 504dc37be8446fb09b1ede70300250ad41be32a2
Subproject commit 26d607608d794efa56df3bdb6043a2f94c1d632c