This commit is contained in:
commit
238d2e0290
4 changed files with 35 additions and 27 deletions
|
@ -194,32 +194,32 @@ class Llama:
|
||||||
self.model_params.use_mmap = use_mmap if lora_path is None else False
|
self.model_params.use_mmap = use_mmap if lora_path is None else False
|
||||||
self.model_params.use_mlock = use_mlock
|
self.model_params.use_mlock = use_mlock
|
||||||
|
|
||||||
|
# kv_overrides is the original python dict
|
||||||
self.kv_overrides = kv_overrides
|
self.kv_overrides = kv_overrides
|
||||||
if kv_overrides is not None:
|
if kv_overrides is not None:
|
||||||
n_overrides = len(kv_overrides)
|
# _kv_overrides_array is a ctypes.Array of llama_model_kv_override Structs
|
||||||
self._kv_overrides_array = llama_cpp.llama_model_kv_override * (n_overrides + 1)
|
kvo_array_len = len(kv_overrides) + 1 # for sentinel element
|
||||||
self._kv_overrides_array_keys = []
|
self._kv_overrides_array = (
|
||||||
|
llama_cpp.llama_model_kv_override * kvo_array_len
|
||||||
|
)()
|
||||||
|
|
||||||
for k, v in kv_overrides.items():
|
for i, (k, v) in enumerate(kv_overrides.items()):
|
||||||
key_buf = ctypes.create_string_buffer(k.encode("utf-8"))
|
self._kv_overrides_array[i].key = k.encode("utf-8")
|
||||||
self._kv_overrides_array_keys.append(key_buf)
|
if isinstance(v, bool):
|
||||||
self._kv_overrides_array[i].key = key_buf
|
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
|
||||||
if isinstance(v, int):
|
self._kv_overrides_array[i].value.bool_value = v
|
||||||
|
elif isinstance(v, int):
|
||||||
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
|
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
|
||||||
self._kv_overrides_array[i].value.int_value = v
|
self._kv_overrides_array[i].value.int_value = v
|
||||||
elif isinstance(v, float):
|
elif isinstance(v, float):
|
||||||
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_FLOAT
|
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_FLOAT
|
||||||
self._kv_overrides_array[i].value.float_value = v
|
self._kv_overrides_array[i].value.float_value = v
|
||||||
elif isinstance(v, bool):
|
|
||||||
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
|
|
||||||
self._kv_overrides_array[i].value.bool_value = v
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown value type for {k}: {v}")
|
raise ValueError(f"Unknown value type for {k}: {v}")
|
||||||
|
|
||||||
self._kv_overrides_array_sentinel_key = b'\0'
|
self._kv_overrides_array[
|
||||||
|
-1
|
||||||
# null array sentinel
|
].key = b"\0" # ensure sentinel element is zeroed
|
||||||
self._kv_overrides_array[n_overrides].key = self._kv_overrides_array_sentinel_key
|
|
||||||
self.model_params.kv_overrides = self._kv_overrides_array
|
self.model_params.kv_overrides = self._kv_overrides_array
|
||||||
|
|
||||||
self.n_batch = min(n_ctx, n_batch) # ???
|
self.n_batch = min(n_ctx, n_batch) # ???
|
||||||
|
@ -329,7 +329,9 @@ class Llama:
|
||||||
(n_ctx, self._n_vocab), dtype=np.single
|
(n_ctx, self._n_vocab), dtype=np.single
|
||||||
)
|
)
|
||||||
|
|
||||||
self._mirostat_mu = ctypes.c_float(2.0 * 5.0) # TODO: Move this to sampling context
|
self._mirostat_mu = ctypes.c_float(
|
||||||
|
2.0 * 5.0
|
||||||
|
) # TODO: Move this to sampling context
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.metadata = self._model.metadata()
|
self.metadata = self._model.metadata()
|
||||||
|
@ -537,7 +539,7 @@ class Llama:
|
||||||
candidates=self._candidates,
|
candidates=self._candidates,
|
||||||
tau=mirostat_tau,
|
tau=mirostat_tau,
|
||||||
eta=mirostat_eta,
|
eta=mirostat_eta,
|
||||||
mu=ctypes.pointer(self._mirostat_mu)
|
mu=ctypes.pointer(self._mirostat_mu),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1)
|
self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1)
|
||||||
|
|
|
@ -379,7 +379,8 @@ def hf_autotokenizer_to_chat_completion_handler(
|
||||||
|
|
||||||
|
|
||||||
def hf_tokenizer_config_to_chat_formatter(
|
def hf_tokenizer_config_to_chat_formatter(
|
||||||
tokenizer_config: Dict[str, Any]
|
tokenizer_config: Dict[str, Any],
|
||||||
|
add_generation_prompt: bool = True,
|
||||||
) -> ChatFormatter:
|
) -> ChatFormatter:
|
||||||
assert isinstance(tokenizer_config, dict)
|
assert isinstance(tokenizer_config, dict)
|
||||||
|
|
||||||
|
@ -401,31 +402,34 @@ def hf_tokenizer_config_to_chat_formatter(
|
||||||
lstrip_blocks=True,
|
lstrip_blocks=True,
|
||||||
).from_string(chat_template)
|
).from_string(chat_template)
|
||||||
|
|
||||||
def format_autotokenizer(
|
def format_tokenizer_config(
|
||||||
messages: List[llama_types.ChatCompletionRequestMessage],
|
messages: List[llama_types.ChatCompletionRequestMessage],
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> ChatFormatterResponse:
|
) -> ChatFormatterResponse:
|
||||||
# TODO: verify this is correct
|
# TODO: verify this is correct
|
||||||
# Add a blank assistant message to the end of the messages to prompt the model to generate a response
|
# Add a blank assistant message to the end of the messages to prompt the model to generate a response
|
||||||
prompt = env.render(
|
if add_generation_prompt:
|
||||||
messages = [
|
messages = [
|
||||||
*messages,
|
*messages,
|
||||||
llama_types.ChatCompletionRequestAssistantMessage(
|
llama_types.ChatCompletionRequestAssistantMessage(
|
||||||
role="assistant", content=""
|
role="assistant", content=""
|
||||||
),
|
),
|
||||||
],
|
]
|
||||||
|
prompt = env.render(
|
||||||
|
messages=messages,
|
||||||
bos_token=bos_token,
|
bos_token=bos_token,
|
||||||
eos_token=eos_token,
|
eos_token=eos_token,
|
||||||
)
|
)
|
||||||
return ChatFormatterResponse(prompt=prompt, stop=eos_token)
|
return ChatFormatterResponse(prompt=prompt, stop=[eos_token, bos_token])
|
||||||
|
|
||||||
return format_autotokenizer
|
return format_tokenizer_config
|
||||||
|
|
||||||
|
|
||||||
def hf_tokenizer_config_to_chat_completion_handler(
|
def hf_tokenizer_config_to_chat_completion_handler(
|
||||||
tokenizer_config: Dict[str, Any],
|
tokenizer_config: Dict[str, Any],
|
||||||
|
add_generation_prompt: bool = True,
|
||||||
) -> LlamaChatCompletionHandler:
|
) -> LlamaChatCompletionHandler:
|
||||||
chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config)
|
chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config, add_generation_prompt=add_generation_prompt)
|
||||||
return chat_formatter_to_chat_completion_handler(chat_formatter)
|
return chat_formatter_to_chat_completion_handler(chat_formatter)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -188,6 +188,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
|
||||||
# LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
|
# LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
|
||||||
# LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
|
# LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
|
||||||
# LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
# LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
||||||
|
# LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
|
||||||
|
|
||||||
# LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
# LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||||
# };
|
# };
|
||||||
|
@ -211,6 +212,7 @@ LLAMA_FTYPE_MOSTLY_Q6_K = 18
|
||||||
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19
|
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19
|
||||||
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20
|
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20
|
||||||
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21
|
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21
|
||||||
|
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22
|
||||||
LLAMA_FTYPE_GUESSED = 1024
|
LLAMA_FTYPE_GUESSED = 1024
|
||||||
|
|
||||||
# enum llama_rope_scaling_type {
|
# enum llama_rope_scaling_type {
|
||||||
|
|
2
vendor/llama.cpp
vendored
2
vendor/llama.cpp
vendored
|
@ -1 +1 @@
|
||||||
Subproject commit 504dc37be8446fb09b1ede70300250ad41be32a2
|
Subproject commit 26d607608d794efa56df3bdb6043a2f94c1d632c
|
Loading…
Reference in a new issue