Andrei Betlen 2023-11-21 04:02:20 -05:00
parent 422ebc89ce
commit 7a3f87846b
3 changed files with 104 additions and 71 deletions


@@ -1036,9 +1036,9 @@ class Llama:
             offset = (
                 0 if self.context_params.logits_all else n_tokens - 1
             )  # NOTE: Only save the last token logits if logits_all is False
-            self.scores[n_past + offset : n_past + n_tokens, :].reshape(
-                -1
-            )[:] = self._ctx.get_logits()[offset * cols: rows * cols]
+            self.scores[n_past + offset : n_past + n_tokens, :].reshape(-1)[
+                :
+            ] = self._ctx.get_logits()[offset * cols : rows * cols]
             # Update n_tokens
             self.n_tokens += n_tokens
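
For reference, a minimal NumPy sketch (stand-in names, not from this commit) showing that the re-wrapped statement is the same in-place copy of the new tokens' logits into the scores buffer:

import numpy as np

# Hypothetical stand-ins for self.scores and self._ctx.get_logits()
n_vocab = 3
scores = np.zeros((4, n_vocab), dtype=np.single)
logits = np.arange(2 * n_vocab, dtype=np.single)  # logits for 2 new tokens

n_past, n_tokens, offset = 1, 2, 0
rows, cols = n_tokens, n_vocab

# Same assignment as in the diff; only the line wrapping changed.
scores[n_past + offset : n_past + n_tokens, :].reshape(-1)[
    :
] = logits[offset * cols : rows * cols]

assert (scores[1:3].ravel() == logits).all()
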
@@ -1135,7 +1135,9 @@ class Llama:
         else:
             self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1)
             self._ctx.sample_tail_free(candidates=self._candidates, z=tfs_z, min_keep=1)
-            self._ctx.sample_typical(candidates=self._candidates, p=typical_p, min_keep=1)
+            self._ctx.sample_typical(
+                candidates=self._candidates, p=typical_p, min_keep=1
+            )
             self._ctx.sample_top_p(candidates=self._candidates, p=top_p, min_keep=1)
             self._ctx.sample_min_p(candidates=self._candidates, p=min_p, min_keep=1)
             self._ctx.sample_temp(candidates=self._candidates, temp=temp)
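
For orientation, a rough pure-Python sketch of what a top-k filter followed by a top-p (nucleus) cutoff does to a candidate list; this illustrates the idea behind the sampler chain above, not the library's implementation:

import math

def top_k_top_p(logits, k=40, top_p=0.95):
    # Keep the k highest-scoring candidates, then keep the smallest prefix
    # whose softmax probability mass reaches top_p.
    candidates = sorted(enumerate(logits), key=lambda t: t[1], reverse=True)[:k]
    exps = [math.exp(lv - candidates[0][1]) for _, lv in candidates]
    total = sum(exps)
    kept, mass = [], 0.0
    for (token, logit), e in zip(candidates, exps):
        kept.append((token, logit))
        mass += e / total
        if mass >= top_p:
            break
    return kept

print(top_k_top_p([0.1, 2.0, 1.5, -1.0, 0.3], k=3, top_p=0.9))
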


@@ -532,6 +532,7 @@ def format_phind(
     _prompt = _format_add_colon_single(_system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt)
 
+
 @register_chat_format("intel")
 def format_intel(
     messages: List[llama_types.ChatCompletionRequestMessage],
@@ -588,6 +589,7 @@ def format_mistrallite(
     _prompt = _format_no_colon_single(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt)
 
+
 @register_chat_format("chatml")
 def format_chatml(
     messages: List[llama_types.ChatCompletionRequestMessage],
@@ -604,6 +606,7 @@ def format_chatml(
     _prompt = _format_chatml(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)
 
+
 @register_chat_format("openchat")
 def format_openchat(
     messages: List[llama_types.ChatCompletionRequestMessage],
@@ -612,7 +615,9 @@ def format_openchat(
     system_template = "{system_message}<|end_of_turn|>"
     system_message = _get_system_message(messages)
     system_message = system_template.format(system_message=system_message)
-    _roles = dict(user="GPT4 Correct User: ", assistant="<|end_of_turn|>GPT4 Correct Assistant: ")
+    _roles = dict(
+        user="GPT4 Correct User: ", assistant="<|end_of_turn|>GPT4 Correct Assistant: "
+    )
     _sep = "<|end_of_turn|>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
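
The hunks above all follow the same registration pattern. As a sketch (not part of this commit; the format name and role strings are made up), a new format would be added inside the same module the same way, reusing the helpers that appear in this diff:

@register_chat_format("my-demo")  # hypothetical format name
def format_my_demo(
    messages: List[llama_types.ChatCompletionRequestMessage],
    **kwargs: Any,
) -> ChatFormatterResponse:
    _roles = dict(user="USER: ", assistant="ASSISTANT: ")
    _sep = "\n"
    system_message = _get_system_message(messages)
    _messages = _map_roles(messages, _roles)
    _messages.append((_roles["assistant"], None))
    _prompt = _format_no_colon_single(system_message, _messages, _sep)
    return ChatFormatterResponse(prompt=_prompt, stop=_sep)
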
@@ -651,46 +656,60 @@ def functionary_chat_handler(
 ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
     SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
 
-    def generate_type_definition(param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs) -> str:
-        indent = ' ' * indent_level
-        if '$ref' in param:
+    def generate_type_definition(
+        param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs
+    ) -> str:
+        indent = " " * indent_level
+        if "$ref" in param:
             # Reference to a shared definition
-            ref_name = param['$ref'].split('/')[-1]  # Extract the type name from the reference
+            ref_name = param["$ref"].split("/")[
+                -1
+            ]  # Extract the type name from the reference
             return ref_name
-        elif param.get('type') == 'array':
-            items = param.get('items', {})
+        elif param.get("type") == "array":
+            items = param.get("items", {})
             item_type = generate_type_definition(items, indent_level + 1, shared_defs)
             return f"Array<{item_type}>"
-        elif param.get('type') == 'object':
-            properties = param.get('properties', {})
+        elif param.get("type") == "object":
+            properties = param.get("properties", {})
             nested_schema = "{\n"
             for nested_param_name, nested_param in properties.items():
-                nested_param_type = generate_type_definition(nested_param, indent_level + 1, shared_defs)
-                nested_schema += f"{indent}  {nested_param_name}: {nested_param_type},\n"
+                nested_param_type = generate_type_definition(
+                    nested_param, indent_level + 1, shared_defs
+                )
+                nested_schema += (
+                    f"{indent}  {nested_param_name}: {nested_param_type},\n"
+                )
             nested_schema += indent + "}"
             return nested_schema
-        elif 'enum' in param:
+        elif "enum" in param:
             # Enum type
-            return " | ".join([f'"{enum_value}"' for enum_value in param['enum']])
+            return " | ".join([f'"{enum_value}"' for enum_value in param["enum"]])
         else:
             # Simple type
-            return param.get('type', 'any')
+            return param.get("type", "any")
 
     def generate_shared_definitions(shared_defs, indent_level: int) -> str:
-        indent = ' ' * indent_level
+        indent = " " * indent_level
         shared_definitions = ""
         for def_name, def_properties in shared_defs.items():
             shared_definitions += f"{indent}type {def_name} = "
-            if def_properties.get('type') == 'object':
-                shared_definitions += generate_type_definition(def_properties, indent_level, shared_defs)
-            elif 'enum' in def_properties:
+            if def_properties.get("type") == "object":
+                shared_definitions += generate_type_definition(
+                    def_properties, indent_level, shared_defs
+                )
+            elif "enum" in def_properties:
                 # Enum type
-                shared_definitions += " | ".join([f'"{enum_value}"' for enum_value in def_properties['enum']])
+                shared_definitions += " | ".join(
+                    [f'"{enum_value}"' for enum_value in def_properties["enum"]]
+                )
             shared_definitions += ";\n"
         return shared_definitions
 
     def generate_schema_from_functions(functions, namespace="functions") -> str:
-        schema = "// Supported function definitions that should be called when necessary.\n"
+        schema = (
+            "// Supported function definitions that should be called when necessary.\n"
+        )
         schema += f"namespace {namespace} {{\n\n"
 
         # Generate shared definitions
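
To make the recursion above concrete, here is a trace of the visible generate_type_definition logic on a small, made-up JSON-schema fragment (the helper is nested inside the handler, so this is an illustration rather than an importable call):

param = {
    "type": "object",
    "properties": {
        "unit": {"enum": ["celsius", "fahrenheit"]},
        "days": {"type": "array", "items": {"type": "integer"}},
    },
}
# generate_type_definition(param, 2, {}) renders roughly (whitespace approximate):
#
# {
#     unit: "celsius" | "fahrenheit",
#     days: Array<integer>,
#   }
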
@@ -706,10 +725,10 @@ def functionary_chat_handler(
             description = function.get("description", "")
             parameters = function.get("parameters", {})
             required_params = parameters.get("required", [])
 
             schema += f"  // {description}\n"
             schema += f"  type {function_name} = (_: {{\n"
 
             for param_name, param in parameters.get("properties", {}).items():
                 param_description = param.get("description", "")
                 param_type = generate_type_definition(param, 2, shared_definitions)
@@ -733,13 +752,18 @@ def functionary_chat_handler(
                     role="system", content=generate_schema_from_functions(functions)
                 )
             )
 
         if tools is not None:
             all_messages.append(
                 llama_types.ChatCompletionRequestSystemMessage(
-                    role="system", content=generate_schema_from_functions(
-                        [tool["function"] for tool in tools if tool["type"] == "function"]
-                    )
+                    role="system",
+                    content=generate_schema_from_functions(
+                        [
+                            tool["function"]
+                            for tool in tools
+                            if tool["type"] == "function"
+                        ]
+                    ),
                 )
             )
@@ -790,7 +814,9 @@ def functionary_chat_handler(
             elif "function_call" in msg:
                 return f"assistant to={msg['function_call']['name']}:\n{msg['function_call']['arguments']}</s>\n"
             elif "tool_calls" in msg and len(msg["tool_calls"]) > 0:
-                for tool_call in msg["tool_calls"]:  # NOTE: probably doesn't work with the functionary model
+                for tool_call in msg[
+                    "tool_calls"
+                ]:  # NOTE: probably doesn't work with the functionary model
                     return f"assistant to={tool_call['id']}:\n{tool_call['function']['arguments']}</s>\n"
             elif msg["content"] is None:
                 return "assistant"
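
Tracing the function_call branch shown above on a made-up assistant message gives an idea of the serialized turn (illustration only):

msg = {
    "role": "assistant",
    "content": None,
    "function_call": {"name": "get_weather", "arguments": '{"city": "Paris"}'},
}
line = (
    f"assistant to={msg['function_call']['name']}:\n"
    f"{msg['function_call']['arguments']}</s>\n"
)
print(line)
# assistant to=get_weather:
# {"city": "Paris"}</s>
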
@@ -800,12 +826,14 @@ def functionary_chat_handler(
                 raise ValueError(f"Unsupported role: {msg['role']}")
 
         return "".join([message_to_str(msg) for msg in all_messages])
 
     if tools is not None:
         functions = [tool["function"] for tool in tools if tool["type"] == "function"]
 
     if tool_choice is not None:
-        function_call = tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
+        function_call = (
+            tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
+        )
 
     prompt = prepare_messages_for_inference(messages, functions, tools)
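
The re-wrapped assignment above normalizes tool_choice, which may be a plain string or an object naming a specific function. A small sketch with a made-up tool name:

tool_choice = {"type": "function", "function": {"name": "get_weather"}}
function_call = (
    tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
)
print(function_call)  # {'name': 'get_weather'}
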
@@ -861,19 +889,27 @@ def functionary_chat_handler(
             if tool["type"] == "function" and tool["function"]["name"] == function_call:
                 function_body = tool["function"]["parameters"]
                 break
 
     if function_body is not None:
         try:
             with suppress_stdout_stderr(disable=llama.verbose):
-                grammar_text = llama_grammar.json_schema_to_gbnf(json.dumps(function_body))
-                grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.json_schema_to_gbnf(json.dumps(function_body)))
+                grammar_text = llama_grammar.json_schema_to_gbnf(
+                    json.dumps(function_body)
+                )
+                grammar = llama_grammar.LlamaGrammar.from_string(
+                    llama_grammar.json_schema_to_gbnf(json.dumps(function_body))
+                )
                 print(grammar_text)
         except Exception as e:
             if llama.verbose:
-                print("Failed to parse function body as JSON schema, falling back to default grammar")
+                print(
+                    "Failed to parse function body as JSON schema, falling back to default grammar"
+                )
                 print(e)
             with suppress_stdout_stderr(disable=llama.verbose):
-                grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
+                grammar = llama_grammar.LlamaGrammar.from_string(
+                    llama_grammar.JSON_GBNF
+                )
     else:
         with suppress_stdout_stderr(disable=llama.verbose):
             grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
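
Outside the handler, the same constrained-output pattern looks roughly like this (a sketch using the calls visible in the hunk; the schema is a made-up example):

import json

from llama_cpp import llama_grammar

schema = {"type": "object", "properties": {"city": {"type": "string"}}}
try:
    # Convert the JSON schema to a GBNF grammar and compile it.
    gbnf = llama_grammar.json_schema_to_gbnf(json.dumps(schema))
    grammar = llama_grammar.LlamaGrammar.from_string(gbnf)
except Exception:
    # Fall back to the generic JSON grammar if conversion fails.
    grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
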
@@ -929,9 +965,9 @@ def functionary_chat_handler(
                             "function": {
                                 "name": function_call,
                                 "arguments": completion["choices"][0]["text"],
-                            }
+                            },
                         }
-                    ]
+                    ],
                },
                "finish_reason": "tool_calls",
            }


@@ -30,7 +30,7 @@ import numpy.typing as npt
 
 # Disable warning for model and model_alias settings
-BaseSettings.model_config['protected_namespaces'] = ()
+BaseSettings.model_config["protected_namespaces"] = ()
 
 
 class Settings(BaseSettings):
@@ -68,7 +68,9 @@ class Settings(BaseSettings):
         description="Use mlock.",
     )
     # Context Params
-    seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.")
+    seed: int = Field(
+        default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."
+    )
     n_ctx: int = Field(default=2048, ge=1, description="The context size.")
     n_batch: int = Field(
         default=512, ge=1, description="The batch size to use per eval."
@@ -83,30 +85,16 @@ class Settings(BaseSettings):
         ge=0,
         description="The number of threads to use when batch processing.",
     )
-    rope_scaling_type: int = Field(
-        default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED
-    )
-    rope_freq_base: float = Field(
-        default=0.0, description="RoPE base frequency"
-    )
+    rope_scaling_type: int = Field(default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED)
+    rope_freq_base: float = Field(default=0.0, description="RoPE base frequency")
     rope_freq_scale: float = Field(
         default=0.0, description="RoPE frequency scaling factor"
     )
-    yarn_ext_factor: float = Field(
-        default=-1.0
-    )
-    yarn_attn_factor: float = Field(
-        default=1.0
-    )
-    yarn_beta_fast: float = Field(
-        default=32.0
-    )
-    yarn_beta_slow: float = Field(
-        default=1.0
-    )
-    yarn_orig_ctx: int = Field(
-        default=0
-    )
+    yarn_ext_factor: float = Field(default=-1.0)
+    yarn_attn_factor: float = Field(default=1.0)
+    yarn_beta_fast: float = Field(default=32.0)
+    yarn_beta_slow: float = Field(default=1.0)
+    yarn_orig_ctx: int = Field(default=0)
     mul_mat_q: bool = Field(
         default=True, description="if true, use experimental mul_mat_q kernels"
     )
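
The collapsed Field declarations behave exactly as before. As a sketch (the import path is assumed from context and the model path is a placeholder), the settings can be constructed directly or populated from environment variables, as with any pydantic BaseSettings:

from llama_cpp.server.app import Settings  # import path assumed from context

settings = Settings(
    model="models/llama-2-7b.Q4_K_M.gguf",  # placeholder path
    n_ctx=4096,
    rope_freq_base=0.0,   # 0.0 means "use the model's default"
    rope_freq_scale=0.0,
)
print(settings.yarn_beta_fast)  # 32.0, per the default in this diff
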
@@ -122,7 +110,7 @@ class Settings(BaseSettings):
     # LoRA Params
     lora_base: Optional[str] = Field(
         default=None,
-        description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model."
+        description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.",
     )
     lora_path: Optional[str] = Field(
         default=None,
@@ -384,7 +372,9 @@ def create_app(settings: Optional[Settings] = None):
     chat_handler = None
     if settings.chat_format == "llava-1-5":
         assert settings.clip_model_path is not None
-        chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(clip_model_path=settings.clip_model_path, verbose=settings.verbose)
+        chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(
+            clip_model_path=settings.clip_model_path, verbose=settings.verbose
+        )
     ##
     llama = llama_cpp.Llama(
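
A sketch of constructing the same handler directly from Python (model and clip paths are placeholders; passing it to Llama via the chat_handler argument is an assumption about how the server wires it into the instance created just below):

from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

chat_handler = Llava15ChatHandler(
    clip_model_path="models/mmproj-model-f16.gguf",  # placeholder path
    verbose=False,
)
llm = Llama(
    model_path="models/llava-v1.5-7b.Q4_K_M.gguf",  # placeholder path
    chat_handler=chat_handler,
    n_ctx=2048,
)
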
@@ -587,9 +577,10 @@ mirostat_eta_field = Field(
 grammar = Field(
     default=None,
-    description="A CBNF grammar (as string) to be used for formatting the model's output."
+    description="A CBNF grammar (as string) to be used for formatting the model's output.",
 )
 
+
 class CreateCompletionRequest(BaseModel):
     prompt: Union[str, List[str]] = Field(
         default="", description="The prompt to generate completions for."
@@ -690,7 +681,8 @@ async def create_completion(
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
 
     iterator_or_completion: Union[
-        llama_cpp.CreateCompletionResponse, Iterator[llama_cpp.CreateCompletionStreamResponse]
+        llama_cpp.CreateCompletionResponse,
+        Iterator[llama_cpp.CreateCompletionStreamResponse],
     ] = await run_in_threadpool(llama, **kwargs)
 
     if isinstance(iterator_or_completion, Iterator):
@@ -748,7 +740,9 @@ class ChatCompletionRequestMessage(BaseModel):
     role: Literal["system", "user", "assistant", "function"] = Field(
         default="user", description="The role of the message."
     )
-    content: Optional[str] = Field(default="", description="The content of the message.")
+    content: Optional[str] = Field(
+        default="", description="The content of the message."
+    )
 
 
 class CreateChatCompletionRequest(BaseModel):
@@ -770,9 +764,10 @@ class CreateChatCompletionRequest(BaseModel):
     tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field(
         default=None,
         description="A tool to apply to the generated completions.",
     )  # TODO: verify
     max_tokens: Optional[int] = Field(
-        default=None, description="The maximum number of tokens to generate. Defaults to inf"
+        default=None,
+        description="The maximum number of tokens to generate. Defaults to inf",
    )
     temperature: float = temperature_field
     top_p: float = top_p_field
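
For context, a request body exercising the fields in this model might look like the following (field names are taken from the diff; the endpoint and transport are assumptions about the OpenAI-compatible server, not shown here):

import json

payload = {
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": None,     # None leaves the cap unset, per the field's default
    "temperature": 0.8,
    "top_p": 0.95,
}
print(json.dumps(payload, indent=2))
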