Format
This commit is contained in:
parent
422ebc89ce
commit
7a3f87846b
3 changed files with 104 additions and 71 deletions
|
@ -1036,9 +1036,9 @@ class Llama:
|
||||||
offset = (
|
offset = (
|
||||||
0 if self.context_params.logits_all else n_tokens - 1
|
0 if self.context_params.logits_all else n_tokens - 1
|
||||||
) # NOTE: Only save the last token logits if logits_all is False
|
) # NOTE: Only save the last token logits if logits_all is False
|
||||||
self.scores[n_past + offset : n_past + n_tokens, :].reshape(
|
self.scores[n_past + offset : n_past + n_tokens, :].reshape(-1)[
|
||||||
-1
|
:
|
||||||
)[:] = self._ctx.get_logits()[offset * cols: rows * cols]
|
] = self._ctx.get_logits()[offset * cols : rows * cols]
|
||||||
# Update n_tokens
|
# Update n_tokens
|
||||||
self.n_tokens += n_tokens
|
self.n_tokens += n_tokens
|
||||||
|
|
||||||
|
@ -1135,7 +1135,9 @@ class Llama:
|
||||||
else:
|
else:
|
||||||
self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1)
|
self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1)
|
||||||
self._ctx.sample_tail_free(candidates=self._candidates, z=tfs_z, min_keep=1)
|
self._ctx.sample_tail_free(candidates=self._candidates, z=tfs_z, min_keep=1)
|
||||||
self._ctx.sample_typical(candidates=self._candidates, p=typical_p, min_keep=1)
|
self._ctx.sample_typical(
|
||||||
|
candidates=self._candidates, p=typical_p, min_keep=1
|
||||||
|
)
|
||||||
self._ctx.sample_top_p(candidates=self._candidates, p=top_p, min_keep=1)
|
self._ctx.sample_top_p(candidates=self._candidates, p=top_p, min_keep=1)
|
||||||
self._ctx.sample_min_p(candidates=self._candidates, p=min_p, min_keep=1)
|
self._ctx.sample_min_p(candidates=self._candidates, p=min_p, min_keep=1)
|
||||||
self._ctx.sample_temp(candidates=self._candidates, temp=temp)
|
self._ctx.sample_temp(candidates=self._candidates, temp=temp)
|
||||||
|
|
|
@ -532,6 +532,7 @@ def format_phind(
|
||||||
_prompt = _format_add_colon_single(_system_message, _messages, _sep)
|
_prompt = _format_add_colon_single(_system_message, _messages, _sep)
|
||||||
return ChatFormatterResponse(prompt=_prompt)
|
return ChatFormatterResponse(prompt=_prompt)
|
||||||
|
|
||||||
|
|
||||||
@register_chat_format("intel")
|
@register_chat_format("intel")
|
||||||
def format_intel(
|
def format_intel(
|
||||||
messages: List[llama_types.ChatCompletionRequestMessage],
|
messages: List[llama_types.ChatCompletionRequestMessage],
|
||||||
|
@ -588,6 +589,7 @@ def format_mistrallite(
|
||||||
_prompt = _format_no_colon_single(system_message, _messages, _sep)
|
_prompt = _format_no_colon_single(system_message, _messages, _sep)
|
||||||
return ChatFormatterResponse(prompt=_prompt)
|
return ChatFormatterResponse(prompt=_prompt)
|
||||||
|
|
||||||
|
|
||||||
@register_chat_format("chatml")
|
@register_chat_format("chatml")
|
||||||
def format_chatml(
|
def format_chatml(
|
||||||
messages: List[llama_types.ChatCompletionRequestMessage],
|
messages: List[llama_types.ChatCompletionRequestMessage],
|
||||||
|
@ -604,6 +606,7 @@ def format_chatml(
|
||||||
_prompt = _format_chatml(system_message, _messages, _sep)
|
_prompt = _format_chatml(system_message, _messages, _sep)
|
||||||
return ChatFormatterResponse(prompt=_prompt, stop=_sep)
|
return ChatFormatterResponse(prompt=_prompt, stop=_sep)
|
||||||
|
|
||||||
|
|
||||||
@register_chat_format("openchat")
|
@register_chat_format("openchat")
|
||||||
def format_openchat(
|
def format_openchat(
|
||||||
messages: List[llama_types.ChatCompletionRequestMessage],
|
messages: List[llama_types.ChatCompletionRequestMessage],
|
||||||
|
@ -612,7 +615,9 @@ def format_openchat(
|
||||||
system_template = "{system_message}<|end_of_turn|>"
|
system_template = "{system_message}<|end_of_turn|>"
|
||||||
system_message = _get_system_message(messages)
|
system_message = _get_system_message(messages)
|
||||||
system_message = system_template.format(system_message=system_message)
|
system_message = system_template.format(system_message=system_message)
|
||||||
_roles = dict(user="GPT4 Correct User: ", assistant="<|end_of_turn|>GPT4 Correct Assistant: ")
|
_roles = dict(
|
||||||
|
user="GPT4 Correct User: ", assistant="<|end_of_turn|>GPT4 Correct Assistant: "
|
||||||
|
)
|
||||||
_sep = "<|end_of_turn|>"
|
_sep = "<|end_of_turn|>"
|
||||||
_messages = _map_roles(messages, _roles)
|
_messages = _map_roles(messages, _roles)
|
||||||
_messages.append((_roles["assistant"], None))
|
_messages.append((_roles["assistant"], None))
|
||||||
|
@ -651,46 +656,60 @@ def functionary_chat_handler(
|
||||||
) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
|
) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
|
||||||
SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
|
SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
|
||||||
|
|
||||||
def generate_type_definition(param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs) -> str:
|
def generate_type_definition(
|
||||||
indent = ' ' * indent_level
|
param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs
|
||||||
if '$ref' in param:
|
) -> str:
|
||||||
|
indent = " " * indent_level
|
||||||
|
if "$ref" in param:
|
||||||
# Reference to a shared definition
|
# Reference to a shared definition
|
||||||
ref_name = param['$ref'].split('/')[-1] # Extract the type name from the reference
|
ref_name = param["$ref"].split("/")[
|
||||||
|
-1
|
||||||
|
] # Extract the type name from the reference
|
||||||
return ref_name
|
return ref_name
|
||||||
elif param.get('type') == 'array':
|
elif param.get("type") == "array":
|
||||||
items = param.get('items', {})
|
items = param.get("items", {})
|
||||||
item_type = generate_type_definition(items, indent_level + 1, shared_defs)
|
item_type = generate_type_definition(items, indent_level + 1, shared_defs)
|
||||||
return f"Array<{item_type}>"
|
return f"Array<{item_type}>"
|
||||||
elif param.get('type') == 'object':
|
elif param.get("type") == "object":
|
||||||
properties = param.get('properties', {})
|
properties = param.get("properties", {})
|
||||||
nested_schema = "{\n"
|
nested_schema = "{\n"
|
||||||
for nested_param_name, nested_param in properties.items():
|
for nested_param_name, nested_param in properties.items():
|
||||||
nested_param_type = generate_type_definition(nested_param, indent_level + 1, shared_defs)
|
nested_param_type = generate_type_definition(
|
||||||
nested_schema += f"{indent} {nested_param_name}: {nested_param_type},\n"
|
nested_param, indent_level + 1, shared_defs
|
||||||
|
)
|
||||||
|
nested_schema += (
|
||||||
|
f"{indent} {nested_param_name}: {nested_param_type},\n"
|
||||||
|
)
|
||||||
nested_schema += indent + "}"
|
nested_schema += indent + "}"
|
||||||
return nested_schema
|
return nested_schema
|
||||||
elif 'enum' in param:
|
elif "enum" in param:
|
||||||
# Enum type
|
# Enum type
|
||||||
return " | ".join([f'"{enum_value}"' for enum_value in param['enum']])
|
return " | ".join([f'"{enum_value}"' for enum_value in param["enum"]])
|
||||||
else:
|
else:
|
||||||
# Simple type
|
# Simple type
|
||||||
return param.get('type', 'any')
|
return param.get("type", "any")
|
||||||
|
|
||||||
def generate_shared_definitions(shared_defs, indent_level: int) -> str:
|
def generate_shared_definitions(shared_defs, indent_level: int) -> str:
|
||||||
indent = ' ' * indent_level
|
indent = " " * indent_level
|
||||||
shared_definitions = ""
|
shared_definitions = ""
|
||||||
for def_name, def_properties in shared_defs.items():
|
for def_name, def_properties in shared_defs.items():
|
||||||
shared_definitions += f"{indent}type {def_name} = "
|
shared_definitions += f"{indent}type {def_name} = "
|
||||||
if def_properties.get('type') == 'object':
|
if def_properties.get("type") == "object":
|
||||||
shared_definitions += generate_type_definition(def_properties, indent_level, shared_defs)
|
shared_definitions += generate_type_definition(
|
||||||
elif 'enum' in def_properties:
|
def_properties, indent_level, shared_defs
|
||||||
|
)
|
||||||
|
elif "enum" in def_properties:
|
||||||
# Enum type
|
# Enum type
|
||||||
shared_definitions += " | ".join([f'"{enum_value}"' for enum_value in def_properties['enum']])
|
shared_definitions += " | ".join(
|
||||||
|
[f'"{enum_value}"' for enum_value in def_properties["enum"]]
|
||||||
|
)
|
||||||
shared_definitions += ";\n"
|
shared_definitions += ";\n"
|
||||||
return shared_definitions
|
return shared_definitions
|
||||||
|
|
||||||
def generate_schema_from_functions(functions, namespace="functions") -> str:
|
def generate_schema_from_functions(functions, namespace="functions") -> str:
|
||||||
schema = "// Supported function definitions that should be called when necessary.\n"
|
schema = (
|
||||||
|
"// Supported function definitions that should be called when necessary.\n"
|
||||||
|
)
|
||||||
schema += f"namespace {namespace} {{\n\n"
|
schema += f"namespace {namespace} {{\n\n"
|
||||||
|
|
||||||
# Generate shared definitions
|
# Generate shared definitions
|
||||||
|
@ -737,9 +756,14 @@ def functionary_chat_handler(
|
||||||
if tools is not None:
|
if tools is not None:
|
||||||
all_messages.append(
|
all_messages.append(
|
||||||
llama_types.ChatCompletionRequestSystemMessage(
|
llama_types.ChatCompletionRequestSystemMessage(
|
||||||
role="system", content=generate_schema_from_functions(
|
role="system",
|
||||||
[tool["function"] for tool in tools if tool["type"] == "function"]
|
content=generate_schema_from_functions(
|
||||||
)
|
[
|
||||||
|
tool["function"]
|
||||||
|
for tool in tools
|
||||||
|
if tool["type"] == "function"
|
||||||
|
]
|
||||||
|
),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -790,7 +814,9 @@ def functionary_chat_handler(
|
||||||
elif "function_call" in msg:
|
elif "function_call" in msg:
|
||||||
return f"assistant to={msg['function_call']['name']}:\n{msg['function_call']['arguments']}</s>\n"
|
return f"assistant to={msg['function_call']['name']}:\n{msg['function_call']['arguments']}</s>\n"
|
||||||
elif "tool_calls" in msg and len(msg["tool_calls"]) > 0:
|
elif "tool_calls" in msg and len(msg["tool_calls"]) > 0:
|
||||||
for tool_call in msg["tool_calls"]: # NOTE: probably doesn't work with the functionary model
|
for tool_call in msg[
|
||||||
|
"tool_calls"
|
||||||
|
]: # NOTE: probably doesn't work with the functionary model
|
||||||
return f"assistant to={tool_call['id']}:\n{tool_call['function']['arguments']}</s>\n"
|
return f"assistant to={tool_call['id']}:\n{tool_call['function']['arguments']}</s>\n"
|
||||||
elif msg["content"] is None:
|
elif msg["content"] is None:
|
||||||
return "assistant"
|
return "assistant"
|
||||||
|
@ -805,7 +831,9 @@ def functionary_chat_handler(
|
||||||
functions = [tool["function"] for tool in tools if tool["type"] == "function"]
|
functions = [tool["function"] for tool in tools if tool["type"] == "function"]
|
||||||
|
|
||||||
if tool_choice is not None:
|
if tool_choice is not None:
|
||||||
function_call = tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
|
function_call = (
|
||||||
|
tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
|
||||||
|
)
|
||||||
|
|
||||||
prompt = prepare_messages_for_inference(messages, functions, tools)
|
prompt = prepare_messages_for_inference(messages, functions, tools)
|
||||||
|
|
||||||
|
@ -865,15 +893,23 @@ def functionary_chat_handler(
|
||||||
if function_body is not None:
|
if function_body is not None:
|
||||||
try:
|
try:
|
||||||
with suppress_stdout_stderr(disable=llama.verbose):
|
with suppress_stdout_stderr(disable=llama.verbose):
|
||||||
grammar_text = llama_grammar.json_schema_to_gbnf(json.dumps(function_body))
|
grammar_text = llama_grammar.json_schema_to_gbnf(
|
||||||
grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.json_schema_to_gbnf(json.dumps(function_body)))
|
json.dumps(function_body)
|
||||||
|
)
|
||||||
|
grammar = llama_grammar.LlamaGrammar.from_string(
|
||||||
|
llama_grammar.json_schema_to_gbnf(json.dumps(function_body))
|
||||||
|
)
|
||||||
print(grammar_text)
|
print(grammar_text)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if llama.verbose:
|
if llama.verbose:
|
||||||
print("Failed to parse function body as JSON schema, falling back to default grammar")
|
print(
|
||||||
|
"Failed to parse function body as JSON schema, falling back to default grammar"
|
||||||
|
)
|
||||||
print(e)
|
print(e)
|
||||||
with suppress_stdout_stderr(disable=llama.verbose):
|
with suppress_stdout_stderr(disable=llama.verbose):
|
||||||
grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
|
grammar = llama_grammar.LlamaGrammar.from_string(
|
||||||
|
llama_grammar.JSON_GBNF
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
with suppress_stdout_stderr(disable=llama.verbose):
|
with suppress_stdout_stderr(disable=llama.verbose):
|
||||||
grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
|
grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
|
||||||
|
@ -929,9 +965,9 @@ def functionary_chat_handler(
|
||||||
"function": {
|
"function": {
|
||||||
"name": function_call,
|
"name": function_call,
|
||||||
"arguments": completion["choices"][0]["text"],
|
"arguments": completion["choices"][0]["text"],
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
]
|
],
|
||||||
},
|
},
|
||||||
"finish_reason": "tool_calls",
|
"finish_reason": "tool_calls",
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,7 +30,7 @@ import numpy.typing as npt
|
||||||
|
|
||||||
|
|
||||||
# Disable warning for model and model_alias settings
|
# Disable warning for model and model_alias settings
|
||||||
BaseSettings.model_config['protected_namespaces'] = ()
|
BaseSettings.model_config["protected_namespaces"] = ()
|
||||||
|
|
||||||
|
|
||||||
class Settings(BaseSettings):
|
class Settings(BaseSettings):
|
||||||
|
@ -68,7 +68,9 @@ class Settings(BaseSettings):
|
||||||
description="Use mlock.",
|
description="Use mlock.",
|
||||||
)
|
)
|
||||||
# Context Params
|
# Context Params
|
||||||
seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.")
|
seed: int = Field(
|
||||||
|
default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."
|
||||||
|
)
|
||||||
n_ctx: int = Field(default=2048, ge=1, description="The context size.")
|
n_ctx: int = Field(default=2048, ge=1, description="The context size.")
|
||||||
n_batch: int = Field(
|
n_batch: int = Field(
|
||||||
default=512, ge=1, description="The batch size to use per eval."
|
default=512, ge=1, description="The batch size to use per eval."
|
||||||
|
@ -83,30 +85,16 @@ class Settings(BaseSettings):
|
||||||
ge=0,
|
ge=0,
|
||||||
description="The number of threads to use when batch processing.",
|
description="The number of threads to use when batch processing.",
|
||||||
)
|
)
|
||||||
rope_scaling_type: int = Field(
|
rope_scaling_type: int = Field(default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED)
|
||||||
default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED
|
rope_freq_base: float = Field(default=0.0, description="RoPE base frequency")
|
||||||
)
|
|
||||||
rope_freq_base: float = Field(
|
|
||||||
default=0.0, description="RoPE base frequency"
|
|
||||||
)
|
|
||||||
rope_freq_scale: float = Field(
|
rope_freq_scale: float = Field(
|
||||||
default=0.0, description="RoPE frequency scaling factor"
|
default=0.0, description="RoPE frequency scaling factor"
|
||||||
)
|
)
|
||||||
yarn_ext_factor: float = Field(
|
yarn_ext_factor: float = Field(default=-1.0)
|
||||||
default=-1.0
|
yarn_attn_factor: float = Field(default=1.0)
|
||||||
)
|
yarn_beta_fast: float = Field(default=32.0)
|
||||||
yarn_attn_factor: float = Field(
|
yarn_beta_slow: float = Field(default=1.0)
|
||||||
default=1.0
|
yarn_orig_ctx: int = Field(default=0)
|
||||||
)
|
|
||||||
yarn_beta_fast: float = Field(
|
|
||||||
default=32.0
|
|
||||||
)
|
|
||||||
yarn_beta_slow: float = Field(
|
|
||||||
default=1.0
|
|
||||||
)
|
|
||||||
yarn_orig_ctx: int = Field(
|
|
||||||
default=0
|
|
||||||
)
|
|
||||||
mul_mat_q: bool = Field(
|
mul_mat_q: bool = Field(
|
||||||
default=True, description="if true, use experimental mul_mat_q kernels"
|
default=True, description="if true, use experimental mul_mat_q kernels"
|
||||||
)
|
)
|
||||||
|
@ -122,7 +110,7 @@ class Settings(BaseSettings):
|
||||||
# LoRA Params
|
# LoRA Params
|
||||||
lora_base: Optional[str] = Field(
|
lora_base: Optional[str] = Field(
|
||||||
default=None,
|
default=None,
|
||||||
description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model."
|
description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.",
|
||||||
)
|
)
|
||||||
lora_path: Optional[str] = Field(
|
lora_path: Optional[str] = Field(
|
||||||
default=None,
|
default=None,
|
||||||
|
@ -384,7 +372,9 @@ def create_app(settings: Optional[Settings] = None):
|
||||||
chat_handler = None
|
chat_handler = None
|
||||||
if settings.chat_format == "llava-1-5":
|
if settings.chat_format == "llava-1-5":
|
||||||
assert settings.clip_model_path is not None
|
assert settings.clip_model_path is not None
|
||||||
chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(clip_model_path=settings.clip_model_path, verbose=settings.verbose)
|
chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(
|
||||||
|
clip_model_path=settings.clip_model_path, verbose=settings.verbose
|
||||||
|
)
|
||||||
##
|
##
|
||||||
|
|
||||||
llama = llama_cpp.Llama(
|
llama = llama_cpp.Llama(
|
||||||
|
@ -587,9 +577,10 @@ mirostat_eta_field = Field(
|
||||||
|
|
||||||
grammar = Field(
|
grammar = Field(
|
||||||
default=None,
|
default=None,
|
||||||
description="A CBNF grammar (as string) to be used for formatting the model's output."
|
description="A CBNF grammar (as string) to be used for formatting the model's output.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class CreateCompletionRequest(BaseModel):
|
class CreateCompletionRequest(BaseModel):
|
||||||
prompt: Union[str, List[str]] = Field(
|
prompt: Union[str, List[str]] = Field(
|
||||||
default="", description="The prompt to generate completions for."
|
default="", description="The prompt to generate completions for."
|
||||||
|
@ -690,7 +681,8 @@ async def create_completion(
|
||||||
kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
|
kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
|
||||||
|
|
||||||
iterator_or_completion: Union[
|
iterator_or_completion: Union[
|
||||||
llama_cpp.CreateCompletionResponse, Iterator[llama_cpp.CreateCompletionStreamResponse]
|
llama_cpp.CreateCompletionResponse,
|
||||||
|
Iterator[llama_cpp.CreateCompletionStreamResponse],
|
||||||
] = await run_in_threadpool(llama, **kwargs)
|
] = await run_in_threadpool(llama, **kwargs)
|
||||||
|
|
||||||
if isinstance(iterator_or_completion, Iterator):
|
if isinstance(iterator_or_completion, Iterator):
|
||||||
|
@ -748,7 +740,9 @@ class ChatCompletionRequestMessage(BaseModel):
|
||||||
role: Literal["system", "user", "assistant", "function"] = Field(
|
role: Literal["system", "user", "assistant", "function"] = Field(
|
||||||
default="user", description="The role of the message."
|
default="user", description="The role of the message."
|
||||||
)
|
)
|
||||||
content: Optional[str] = Field(default="", description="The content of the message.")
|
content: Optional[str] = Field(
|
||||||
|
default="", description="The content of the message."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class CreateChatCompletionRequest(BaseModel):
|
class CreateChatCompletionRequest(BaseModel):
|
||||||
|
@ -770,9 +764,10 @@ class CreateChatCompletionRequest(BaseModel):
|
||||||
tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field(
|
tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field(
|
||||||
default=None,
|
default=None,
|
||||||
description="A tool to apply to the generated completions.",
|
description="A tool to apply to the generated completions.",
|
||||||
) # TODO: verify
|
) # TODO: verify
|
||||||
max_tokens: Optional[int] = Field(
|
max_tokens: Optional[int] = Field(
|
||||||
default=None, description="The maximum number of tokens to generate. Defaults to inf"
|
default=None,
|
||||||
|
description="The maximum number of tokens to generate. Defaults to inf",
|
||||||
)
|
)
|
||||||
temperature: float = temperature_field
|
temperature: float = temperature_field
|
||||||
top_p: float = top_p_field
|
top_p: float = top_p_field
|
||||||
|
|
Loading…
Reference in a new issue