This commit is contained in:
commit
238d2e0290
4 changed files with 35 additions and 27 deletions
|
@ -194,32 +194,32 @@ class Llama:
|
|||
self.model_params.use_mmap = use_mmap if lora_path is None else False
|
||||
self.model_params.use_mlock = use_mlock
|
||||
|
||||
# kv_overrides is the original python dict
|
||||
self.kv_overrides = kv_overrides
|
||||
if kv_overrides is not None:
|
||||
n_overrides = len(kv_overrides)
|
||||
self._kv_overrides_array = llama_cpp.llama_model_kv_override * (n_overrides + 1)
|
||||
self._kv_overrides_array_keys = []
|
||||
# _kv_overrides_array is a ctypes.Array of llama_model_kv_override Structs
|
||||
kvo_array_len = len(kv_overrides) + 1 # for sentinel element
|
||||
self._kv_overrides_array = (
|
||||
llama_cpp.llama_model_kv_override * kvo_array_len
|
||||
)()
|
||||
|
||||
for k, v in kv_overrides.items():
|
||||
key_buf = ctypes.create_string_buffer(k.encode("utf-8"))
|
||||
self._kv_overrides_array_keys.append(key_buf)
|
||||
self._kv_overrides_array[i].key = key_buf
|
||||
if isinstance(v, int):
|
||||
for i, (k, v) in enumerate(kv_overrides.items()):
|
||||
self._kv_overrides_array[i].key = k.encode("utf-8")
|
||||
if isinstance(v, bool):
|
||||
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
|
||||
self._kv_overrides_array[i].value.bool_value = v
|
||||
elif isinstance(v, int):
|
||||
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
|
||||
self._kv_overrides_array[i].value.int_value = v
|
||||
elif isinstance(v, float):
|
||||
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_FLOAT
|
||||
self._kv_overrides_array[i].value.float_value = v
|
||||
elif isinstance(v, bool):
|
||||
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
|
||||
self._kv_overrides_array[i].value.bool_value = v
|
||||
else:
|
||||
raise ValueError(f"Unknown value type for {k}: {v}")
|
||||
|
||||
self._kv_overrides_array_sentinel_key = b'\0'
|
||||
|
||||
# null array sentinel
|
||||
self._kv_overrides_array[n_overrides].key = self._kv_overrides_array_sentinel_key
|
||||
self._kv_overrides_array[
|
||||
-1
|
||||
].key = b"\0" # ensure sentinel element is zeroed
|
||||
self.model_params.kv_overrides = self._kv_overrides_array
|
||||
|
||||
self.n_batch = min(n_ctx, n_batch) # ???
|
||||
|
@ -329,7 +329,9 @@ class Llama:
|
|||
(n_ctx, self._n_vocab), dtype=np.single
|
||||
)
|
||||
|
||||
self._mirostat_mu = ctypes.c_float(2.0 * 5.0) # TODO: Move this to sampling context
|
||||
self._mirostat_mu = ctypes.c_float(
|
||||
2.0 * 5.0
|
||||
) # TODO: Move this to sampling context
|
||||
|
||||
try:
|
||||
self.metadata = self._model.metadata()
|
||||
|
@ -537,7 +539,7 @@ class Llama:
|
|||
candidates=self._candidates,
|
||||
tau=mirostat_tau,
|
||||
eta=mirostat_eta,
|
||||
mu=ctypes.pointer(self._mirostat_mu)
|
||||
mu=ctypes.pointer(self._mirostat_mu),
|
||||
)
|
||||
else:
|
||||
self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1)
|
||||
|
|
|
@ -379,7 +379,8 @@ def hf_autotokenizer_to_chat_completion_handler(
|
|||
|
||||
|
||||
def hf_tokenizer_config_to_chat_formatter(
|
||||
tokenizer_config: Dict[str, Any]
|
||||
tokenizer_config: Dict[str, Any],
|
||||
add_generation_prompt: bool = True,
|
||||
) -> ChatFormatter:
|
||||
assert isinstance(tokenizer_config, dict)
|
||||
|
||||
|
@ -401,31 +402,34 @@ def hf_tokenizer_config_to_chat_formatter(
|
|||
lstrip_blocks=True,
|
||||
).from_string(chat_template)
|
||||
|
||||
def format_autotokenizer(
|
||||
def format_tokenizer_config(
|
||||
messages: List[llama_types.ChatCompletionRequestMessage],
|
||||
**kwargs: Any,
|
||||
) -> ChatFormatterResponse:
|
||||
# TODO: verify this is correct
|
||||
# Add a blank assistant message to the end of the messages to prompt the model to generate a response
|
||||
prompt = env.render(
|
||||
messages=[
|
||||
if add_generation_prompt:
|
||||
messages = [
|
||||
*messages,
|
||||
llama_types.ChatCompletionRequestAssistantMessage(
|
||||
role="assistant", content=""
|
||||
),
|
||||
],
|
||||
]
|
||||
prompt = env.render(
|
||||
messages=messages,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
)
|
||||
return ChatFormatterResponse(prompt=prompt, stop=eos_token)
|
||||
return ChatFormatterResponse(prompt=prompt, stop=[eos_token, bos_token])
|
||||
|
||||
return format_autotokenizer
|
||||
return format_tokenizer_config
|
||||
|
||||
|
||||
def hf_tokenizer_config_to_chat_completion_handler(
|
||||
tokenizer_config: Dict[str, Any],
|
||||
add_generation_prompt: bool = True,
|
||||
) -> LlamaChatCompletionHandler:
|
||||
chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config)
|
||||
chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config, add_generation_prompt=add_generation_prompt)
|
||||
return chat_formatter_to_chat_completion_handler(chat_formatter)
|
||||
|
||||
|
||||
|
|
|
@ -188,6 +188,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
|
|||
# LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
|
||||
# LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
|
||||
# LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
||||
# LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
|
||||
|
||||
# LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
# };
|
||||
|
@ -211,6 +212,7 @@ LLAMA_FTYPE_MOSTLY_Q6_K = 18
|
|||
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19
|
||||
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20
|
||||
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22
|
||||
LLAMA_FTYPE_GUESSED = 1024
|
||||
|
||||
# enum llama_rope_scaling_type {
|
||||
|
|
2
vendor/llama.cpp
vendored
2
vendor/llama.cpp
vendored
|
@ -1 +1 @@
|
|||
Subproject commit 504dc37be8446fb09b1ede70300250ad41be32a2
|
||||
Subproject commit 26d607608d794efa56df3bdb6043a2f94c1d632c
|
Loading…
Reference in a new issue