diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 7d3dc76..1532646 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1036,9 +1036,9 @@ class Llama:
             offset = (
                 0 if self.context_params.logits_all else n_tokens - 1
             )  # NOTE: Only save the last token logits if logits_all is False
-            self.scores[n_past + offset : n_past + n_tokens, :].reshape(
-                -1
-            )[:] = self._ctx.get_logits()[offset * cols: rows * cols]
+            self.scores[n_past + offset : n_past + n_tokens, :].reshape(-1)[
+                :
+            ] = self._ctx.get_logits()[offset * cols : rows * cols]
         # Update n_tokens
         self.n_tokens += n_tokens

@@ -1135,7 +1135,9 @@ class Llama:
         else:
             self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1)
             self._ctx.sample_tail_free(candidates=self._candidates, z=tfs_z, min_keep=1)
-            self._ctx.sample_typical(candidates=self._candidates, p=typical_p, min_keep=1)
+            self._ctx.sample_typical(
+                candidates=self._candidates, p=typical_p, min_keep=1
+            )
             self._ctx.sample_top_p(candidates=self._candidates, p=top_p, min_keep=1)
             self._ctx.sample_min_p(candidates=self._candidates, p=min_p, min_keep=1)
             self._ctx.sample_temp(candidates=self._candidates, temp=temp)
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 8efbaae..5017ca7 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -532,6 +532,7 @@ def format_phind(
     _prompt = _format_add_colon_single(_system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt)

+
 @register_chat_format("intel")
 def format_intel(
     messages: List[llama_types.ChatCompletionRequestMessage],
@@ -588,6 +589,7 @@ def format_mistrallite(
     _prompt = _format_no_colon_single(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt)

+
 @register_chat_format("chatml")
 def format_chatml(
     messages: List[llama_types.ChatCompletionRequestMessage],
@@ -604,6 +606,7 @@ def format_chatml(
     _prompt = _format_chatml(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)

+
 @register_chat_format("openchat")
 def format_openchat(
     messages: List[llama_types.ChatCompletionRequestMessage],
@@ -612,7 +615,9 @@ def format_openchat(
     system_template = "{system_message}<|end_of_turn|>"
     system_message = _get_system_message(messages)
     system_message = system_template.format(system_message=system_message)
-    _roles = dict(user="GPT4 Correct User: ", assistant="<|end_of_turn|>GPT4 Correct Assistant: ")
+    _roles = dict(
+        user="GPT4 Correct User: ", assistant="<|end_of_turn|>GPT4 Correct Assistant: "
+    )
     _sep = "<|end_of_turn|>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
@@ -651,46 +656,60 @@ def functionary_chat_handler(
 ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
     SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""

-    def generate_type_definition(param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs) -> str:
-        indent = ' ' * indent_level
-        if '$ref' in param:
+    def generate_type_definition(
+        param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs
+    ) -> str:
+        indent = " " * indent_level
+        if "$ref" in param:
             # Reference to a shared definition
-            ref_name = param['$ref'].split('/')[-1]  # Extract the type name from the reference
+            ref_name = param["$ref"].split("/")[
+                -1
+            ]  # Extract the type name from the reference
             return ref_name
-        elif param.get('type') == 'array':
-            items = param.get('items', {})
+        elif param.get("type") == "array":
+            items = param.get("items", {})
             item_type = generate_type_definition(items, indent_level + 1, shared_defs)
             return f"Array<{item_type}>"
-        elif param.get('type') == 'object':
-            properties = param.get('properties', {})
+        elif param.get("type") == "object":
+            properties = param.get("properties", {})
             nested_schema = "{\n"
             for nested_param_name, nested_param in properties.items():
-                nested_param_type = generate_type_definition(nested_param, indent_level + 1, shared_defs)
-                nested_schema += f"{indent}  {nested_param_name}: {nested_param_type},\n"
+                nested_param_type = generate_type_definition(
+                    nested_param, indent_level + 1, shared_defs
+                )
+                nested_schema += (
+                    f"{indent}  {nested_param_name}: {nested_param_type},\n"
+                )
             nested_schema += indent + "}"
             return nested_schema
-        elif 'enum' in param:
+        elif "enum" in param:
             # Enum type
-            return " | ".join([f'"{enum_value}"' for enum_value in param['enum']])
+            return " | ".join([f'"{enum_value}"' for enum_value in param["enum"]])
         else:
             # Simple type
-            return param.get('type', 'any')
+            return param.get("type", "any")

     def generate_shared_definitions(shared_defs, indent_level: int) -> str:
-        indent = ' ' * indent_level
+        indent = " " * indent_level
         shared_definitions = ""
         for def_name, def_properties in shared_defs.items():
             shared_definitions += f"{indent}type {def_name} = "
-            if def_properties.get('type') == 'object':
-                shared_definitions += generate_type_definition(def_properties, indent_level, shared_defs)
-            elif 'enum' in def_properties:
+            if def_properties.get("type") == "object":
+                shared_definitions += generate_type_definition(
+                    def_properties, indent_level, shared_defs
+                )
+            elif "enum" in def_properties:
                 # Enum type
-                shared_definitions += " | ".join([f'"{enum_value}"' for enum_value in def_properties['enum']])
+                shared_definitions += " | ".join(
+                    [f'"{enum_value}"' for enum_value in def_properties["enum"]]
+                )
             shared_definitions += ";\n"
         return shared_definitions

     def generate_schema_from_functions(functions, namespace="functions") -> str:
-        schema = "// Supported function definitions that should be called when necessary.\n"
+        schema = (
+            "// Supported function definitions that should be called when necessary.\n"
+        )
         schema += f"namespace {namespace} {{\n\n"

         # Generate shared definitions
@@ -706,10 +725,10 @@ def functionary_chat_handler(
             description = function.get("description", "")
             parameters = function.get("parameters", {})
             required_params = parameters.get("required", [])
-            
+
             schema += f"    // {description}\n"
             schema += f"    type {function_name} = (_: {{\n"
-            
+
             for param_name, param in parameters.get("properties", {}).items():
                 param_description = param.get("description", "")
                 param_type = generate_type_definition(param, 2, shared_definitions)
@@ -733,13 +752,18 @@ def functionary_chat_handler(
                     role="system", content=generate_schema_from_functions(functions)
                 )
             )
-        
+
         if tools is not None:
             all_messages.append(
                 llama_types.ChatCompletionRequestSystemMessage(
-                    role="system", content=generate_schema_from_functions(
-                        [tool["function"] for tool in tools if tool["type"] == "function"]
-                    )
+                    role="system",
+                    content=generate_schema_from_functions(
+                        [
+                            tool["function"]
+                            for tool in tools
+                            if tool["type"] == "function"
+                        ]
+                    ),
                 )
             )

@@ -790,7 +814,9 @@ def functionary_chat_handler(
                 elif "function_call" in msg:
                     return f"assistant to={msg['function_call']['name']}:\n{msg['function_call']['arguments']}\n"
                 elif "tool_calls" in msg and len(msg["tool_calls"]) > 0:
-                    for tool_call in msg["tool_calls"]:  # NOTE: probably doesn't work with the functionary model
+                    for tool_call in msg[
+                        "tool_calls"
+                    ]:  # NOTE: probably doesn't work with the functionary model
                         return f"assistant to={tool_call['id']}:\n{tool_call['function']['arguments']}\n"
                 elif msg["content"] is None:
                     return "assistant"
@@ -800,12 +826,14 @@ def functionary_chat_handler(
                 raise ValueError(f"Unsupported role: {msg['role']}")

         return "".join([message_to_str(msg) for msg in all_messages])
-    
+
     if tools is not None:
         functions = [tool["function"] for tool in tools if tool["type"] == "function"]
-    
+
     if tool_choice is not None:
-        function_call = tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
+        function_call = (
+            tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
+        )

     prompt = prepare_messages_for_inference(messages, functions, tools)

@@ -861,19 +889,27 @@ def functionary_chat_handler(
         if tool["type"] == "function" and tool["function"]["name"] == function_call:
             function_body = tool["function"]["parameters"]
             break
-    
+
     if function_body is not None:
         try:
             with suppress_stdout_stderr(disable=llama.verbose):
-                grammar_text = llama_grammar.json_schema_to_gbnf(json.dumps(function_body))
-                grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.json_schema_to_gbnf(json.dumps(function_body)))
+                grammar_text = llama_grammar.json_schema_to_gbnf(
+                    json.dumps(function_body)
+                )
+                grammar = llama_grammar.LlamaGrammar.from_string(
+                    llama_grammar.json_schema_to_gbnf(json.dumps(function_body))
+                )
                 print(grammar_text)
         except Exception as e:
             if llama.verbose:
-                print("Failed to parse function body as JSON schema, falling back to default grammar")
+                print(
+                    "Failed to parse function body as JSON schema, falling back to default grammar"
+                )
                 print(e)
             with suppress_stdout_stderr(disable=llama.verbose):
-                grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
+                grammar = llama_grammar.LlamaGrammar.from_string(
+                    llama_grammar.JSON_GBNF
+                )
     else:
         with suppress_stdout_stderr(disable=llama.verbose):
             grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
@@ -929,9 +965,9 @@ def functionary_chat_handler(
                             "function": {
                                 "name": function_call,
                                 "arguments": completion["choices"][0]["text"],
-                            }
+                            },
                         }
-                    ]
+                    ],
                 },
                 "finish_reason": "tool_calls",
             }
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 9262b20..a316aaf 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -30,7 +30,7 @@ import numpy.typing as npt


 # Disable warning for model and model_alias settings
-BaseSettings.model_config['protected_namespaces'] = ()
+BaseSettings.model_config["protected_namespaces"] = ()


 class Settings(BaseSettings):
@@ -68,7 +68,9 @@ class Settings(BaseSettings):
         description="Use mlock.",
     )
     # Context Params
-    seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.")
+    seed: int = Field(
+        default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."
+    )
     n_ctx: int = Field(default=2048, ge=1, description="The context size.")
     n_batch: int = Field(
         default=512, ge=1, description="The batch size to use per eval."
@@ -83,30 +85,16 @@ class Settings(BaseSettings):
         ge=0,
         description="The number of threads to use when batch processing.",
     )
-    rope_scaling_type: int = Field(
-        default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED
-    )
-    rope_freq_base: float = Field(
-        default=0.0, description="RoPE base frequency"
-    )
+    rope_scaling_type: int = Field(default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED)
+    rope_freq_base: float = Field(default=0.0, description="RoPE base frequency")
     rope_freq_scale: float = Field(
         default=0.0, description="RoPE frequency scaling factor"
     )
-    yarn_ext_factor: float = Field(
-        default=-1.0
-    )
-    yarn_attn_factor: float = Field(
-        default=1.0
-    )
-    yarn_beta_fast: float = Field(
-        default=32.0
-    )
-    yarn_beta_slow: float = Field(
-        default=1.0
-    )
-    yarn_orig_ctx: int = Field(
-        default=0
-    )
+    yarn_ext_factor: float = Field(default=-1.0)
+    yarn_attn_factor: float = Field(default=1.0)
+    yarn_beta_fast: float = Field(default=32.0)
+    yarn_beta_slow: float = Field(default=1.0)
+    yarn_orig_ctx: int = Field(default=0)
     mul_mat_q: bool = Field(
         default=True, description="if true, use experimental mul_mat_q kernels"
     )
@@ -122,7 +110,7 @@ class Settings(BaseSettings):
     # LoRA Params
     lora_base: Optional[str] = Field(
         default=None,
-        description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model."
+        description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.",
     )
     lora_path: Optional[str] = Field(
         default=None,
@@ -384,7 +372,9 @@ def create_app(settings: Optional[Settings] = None):
     chat_handler = None
     if settings.chat_format == "llava-1-5":
         assert settings.clip_model_path is not None
-        chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(clip_model_path=settings.clip_model_path, verbose=settings.verbose)
+        chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(
+            clip_model_path=settings.clip_model_path, verbose=settings.verbose
+        )

     ##
     llama = llama_cpp.Llama(
@@ -587,9 +577,10 @@ mirostat_eta_field = Field(

 grammar = Field(
     default=None,
-    description="A CBNF grammar (as string) to be used for formatting the model's output."
+    description="A CBNF grammar (as string) to be used for formatting the model's output.",
 )

+
 class CreateCompletionRequest(BaseModel):
     prompt: Union[str, List[str]] = Field(
         default="", description="The prompt to generate completions for."
@@ -690,7 +681,8 @@ async def create_completion(
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)

     iterator_or_completion: Union[
-        llama_cpp.CreateCompletionResponse, Iterator[llama_cpp.CreateCompletionStreamResponse]
+        llama_cpp.CreateCompletionResponse,
+        Iterator[llama_cpp.CreateCompletionStreamResponse],
     ] = await run_in_threadpool(llama, **kwargs)

     if isinstance(iterator_or_completion, Iterator):
@@ -748,7 +740,9 @@ class ChatCompletionRequestMessage(BaseModel):
     role: Literal["system", "user", "assistant", "function"] = Field(
         default="user", description="The role of the message."
     )
-    content: Optional[str] = Field(default="", description="The content of the message.")
+    content: Optional[str] = Field(
+        default="", description="The content of the message."
+    )


 class CreateChatCompletionRequest(BaseModel):
@@ -770,9 +764,10 @@ class CreateChatCompletionRequest(BaseModel):
     tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field(
         default=None,
         description="A tool to apply to the generated completions.",
-    ) # TODO: verify
+    )  # TODO: verify
     max_tokens: Optional[int] = Field(
-        default=None, description="The maximum number of tokens to generate. Defaults to inf"
+        default=None,
+        description="The maximum number of tokens to generate. Defaults to inf",
     )
     temperature: float = temperature_field
     top_p: float = top_p_field