From 5e3e67af47908919968f06b738321c32e646a97b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 6 Feb 2024 12:44:07 -0500 Subject: [PATCH 01/38] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 098f6d7..b08f22c 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 098f6d737b65134cf220d12b9b706e8cfc5e4610 +Subproject commit b08f22c882a1443e6b97081f3ce718a4d1a741f8 From 34f31040f610925552a66b3a033e31320b6f6ad8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 6 Feb 2024 12:47:59 -0500 Subject: [PATCH 02/38] Bump version --- CHANGELOG.md | 7 ++++++- llama_cpp/__init__.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9632210..5ce0b43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,11 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.39] + +- feat: Update llama.cpp to ggerganov/llama.cpp@b08f22c882a1443e6b97081f3ce718a4d1a741f8 +- fix: Fix destructor logging bugs by using llama_log_callback to avoid suppress_stdout_stderr by @abetlen in 59760c85eddc72dfcc1839f43760ef72c23d6874 + ## [0.2.38] - feat: Update llama.cpp to ggerganov/llama.cpp@1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915 - feat: Add speculative decoding by @abetlen in #1120 -- fix: Pass raise_exception and add_generation_prompt to jinja2 chat template 078cca0361bf5a94d2cf52ed04980d20e32d6f95 +- fix: Pass raise_exception and add_generation_prompt to jinja2 chat template by @abetlen in 078cca0361bf5a94d2cf52ed04980d20e32d6f95 ## [0.2.37] diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 94cd401..837e3c9 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.38" \ No newline at end of file +__version__ = "0.2.39" \ No newline at end of file From ce1277549012a33e5c2360f42bf53aaf1b95e528 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 6 Feb 2024 18:50:56 -0500 Subject: [PATCH 03/38] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b08f22c..213d143 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b08f22c882a1443e6b97081f3ce718a4d1a741f8 +Subproject commit 213d1439fadefe182f69c5f7e8dd3b4b6572ebcb From 901827013b732d74f1f67033062d13a6204a62bd Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Thu, 8 Feb 2024 09:07:03 +0800 Subject: [PATCH 04/38] feat: Integrate functionary v1.4 and v2 models + add custom tokenizer support to Llama class (#1078) * convert functionary-v1 chat handler to use hf autotokenizer * add hf_tokenizer + inteegrate functionary-v1.4 prompt template * integrate functionary v2 prompt template * update readme * set up parallel function calling wip * set up parallel function calling * Update README.md * Update README.md * refactor tokenizers * include old functionary handler for backward compatibility * add hf_tokenizer_path in server ModelSettings * convert functionary-v1 chat handler to use hf autotokenizer * add hf_tokenizer + inteegrate functionary-v1.4 prompt template * integrate functionary v2 prompt template * update readme * set up parallel function calling wip * resolve merge conflict * Update README.md * Update README.md * refactor tokenizers * include old functionary handler for backward compatibility * add 
hf_tokenizer_path in server ModelSettings * Cleanup PR, fix breaking changes * Use hf_pretrained_model_name_or_path for tokenizer * fix hf tokenizer in streaming * update README * refactor offset mapping --------- Co-authored-by: Andrei --- README.md | 19 +- llama_cpp/llama.py | 101 ++++++-- llama_cpp/llama_chat_format.py | 433 ++++++++++++++++++++++++++++++++- llama_cpp/server/model.py | 6 + 4 files changed, 525 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 4131bb3..bddef64 100644 --- a/README.md +++ b/README.md @@ -293,19 +293,16 @@ To constrain the response to a specific JSON Schema, you can use the `schema` pr The high-level API also provides a simple interface for function calling. -Note that the only model that supports full function calling at this time is "functionary". -The gguf-converted files for this model can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF) +The only set of models that supports full function calling at this time is [functionary](https://github.com/MeetKai/functionary). The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). Functionary is able to intelligently call functions and also analyze any provided function outputs to generate coherent responses. All v2 models of functionary supports **parallel function calling**. You can provide either `functionary-v1` or `functionary-v2` for the `chat_format` when initializing the Llama class. + +Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide HF Tokenizer for functionary. The `LlamaHFTokenizer` class can be initialized and passed into the Llama class. This will override the default llama.cpp tokenizer used in Llama class. The tokenizer files are already included in the respective HF repositories hosting the gguf files. ```python ->>> from llama_cpp import Llama ->>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", chat_format="functionary") +>>> from llama_cpp import Llama, LlamaHFTokenizer +>>> tokenizer = LlamaHFTokenizer.from_pretrained("path/to/functionary/") +>>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", tokenizer=tokenizer, chat_format="functionary-v2") >>> llm.create_chat_completion( messages = [ - { - "role": "system", - "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary" - - }, { "role": "user", "content": "Extract Jason is 25 years old" @@ -332,12 +329,12 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h } } }], - tool_choice=[{ + tool_choice={ "type": "function", "function": { "name": "UserDetail" } - }] + }, ) ``` diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 85943db..bad75df 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2,6 +2,7 @@ from __future__ import annotations import os import sys +import abc import uuid import time import multiprocessing @@ -14,11 +15,14 @@ from typing import ( Iterator, Deque, Callable, + Any, ) from collections import deque import ctypes +from llama_cpp.llama_types import List + from .llama_types import * from .llama_grammar import LlamaGrammar from .llama_cache import ( @@ -95,6 +99,8 @@ class Llama: chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None, # Speculative Decoding draft_model: Optional[LlamaDraftModel] = None, + # Tokenizer Override + tokenizer: Optional[BaseLlamaTokenizer] = None, # Misc verbose: bool = True, # Extra Params @@ -159,6 +165,7 @@ class Llama: chat_format: String specifying the chat format to use when calling create_chat_completion. chat_handler: Optional chat handler to use when calling create_chat_completion. draft_model: Optional draft model to use for speculative decoding. + tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp. verbose: Print verbose output to stderr. Raises: @@ -235,6 +242,7 @@ class Llama: self.n_threads_batch = n_threads_batch or max( multiprocessing.cpu_count() // 2, 1 ) + # Context Params self.context_params = llama_cpp.llama_context_default_params() self.context_params.seed = seed @@ -286,6 +294,10 @@ class Llama: self._model = _LlamaModel( path_model=self.model_path, params=self.model_params, verbose=self.verbose ) + + # Override tokenizer + self.tokenizer_ = tokenizer or LlamaTokenizer(self) + # Set the default value for the context and correct the batch if n_ctx == 0: n_ctx = self._model.n_ctx_train() @@ -431,18 +443,19 @@ class Llama: Returns: A list of tokens. """ - return self._model.tokenize(text, add_bos, special) + return self.tokenizer_.tokenize(text, add_bos, special) - def detokenize(self, tokens: List[int]) -> bytes: + def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: """Detokenize a list of tokens. Args: tokens: The list of tokens to detokenize. + prev_tokens: The list of previous tokens. Offset mapping will be performed if provided Returns: The detokenized string. """ - return self._model.detokenize(tokens) + return self.tokenizer_.detokenize(tokens, prev_tokens) def set_cache(self, cache: Optional[BaseLlamaCache]): """Set the cache. 
@@ -935,7 +948,8 @@ class Llama: if stream: remaining_tokens = completion_tokens[returned_tokens:] - remaining_text = self.detokenize(remaining_tokens) + prev_tokens = completion_tokens[:returned_tokens] + remaining_text = self.detokenize(completion_tokens, prev_tokens) remaining_length = len(remaining_text) # We want to avoid yielding any characters from @@ -957,13 +971,13 @@ class Llama: for token in remaining_tokens: if token == self.token_bos(): continue - token_end_position += len(self.detokenize([token])) + token_end_position += len(remaining_text) # Check if stop sequence is in the token if token_end_position > ( remaining_length - first_stop_position ): break - token_str = self.detokenize([token]).decode( + token_str = remaining_text.decode( "utf-8", errors="ignore" ) text_offset = len(prompt) + len( @@ -988,11 +1002,7 @@ class Llama: } top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { - "tokens": [ - self.detokenize([token]).decode( - "utf-8", errors="ignore" - ) - ], + "tokens": [token_str], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], "top_logprobs": [top_logprob], @@ -1005,9 +1015,7 @@ class Llama: "model": model_name, "choices": [ { - "text": self.detokenize([token]).decode( - "utf-8", errors="ignore" - ), + "text": token_str, "index": 0, "logprobs": logprobs_or_none, "finish_reason": None, @@ -1019,7 +1027,7 @@ class Llama: decode_success = False for i in range(1, len(remaining_tokens) + 1): try: - bs = self.detokenize(remaining_tokens[:i]) + bs = remaining_text ts = bs.decode("utf-8") decode_success = True break @@ -1055,6 +1063,7 @@ class Llama: if len(completion_tokens) >= max_tokens: text = self.detokenize(completion_tokens) + finish_reason = "length" break @@ -1693,8 +1702,8 @@ class Llama: """Return the vocabulary size.""" return self._model.n_vocab() - def tokenizer(self) -> "LlamaTokenizer": - """Return the tokenizer for this model.""" + def tokenizer(self) -> LlamaTokenizer: + """Return the llama tokenizer for this model.""" return LlamaTokenizer(self) def token_eos(self) -> int: @@ -1738,23 +1747,71 @@ class Llama: return longest_prefix -class LlamaTokenizer: +class BaseLlamaTokenizer(abc.ABC): + @abc.abstractmethod + def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: + raise NotImplementedError + + @abc.abstractmethod + def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: + raise NotImplementedError + + +class LlamaTokenizer(BaseLlamaTokenizer): def __init__(self, llama: Llama): self.llama = llama + self._model = llama._model # type: ignore - def encode(self, text: str, add_bos: bool = True) -> List[int]: - return self.llama.tokenize( - text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=True + def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: + return self._model.tokenize(text, add_bos=add_bos, special=special) + + def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: + if prev_tokens is not None: + return self._model.detokenize(tokens[len(prev_tokens):]) + else: + return self._model.detokenize(tokens) + + def encode(self, text: str, add_bos: bool = True, special: bool = True) -> List[int]: + return self.tokenize( + text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special ) def decode(self, tokens: List[int]) -> str: - return self.llama.detokenize(tokens).decode("utf-8", errors="ignore") + return 
self.detokenize(tokens).decode("utf-8", errors="ignore") @classmethod def from_ggml_file(cls, path: str) -> "LlamaTokenizer": return cls(Llama(model_path=path, vocab_only=True)) +class LlamaHFTokenizer(BaseLlamaTokenizer): + def __init__(self, hf_tokenizer: Any): + self.hf_tokenizer = hf_tokenizer + + def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: + return self.hf_tokenizer.encode(text.decode("utf-8", errors="ignore"), add_special_tokens=special) + + def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: + if prev_tokens is not None: + text = self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") + prev_text = self.hf_tokenizer.decode(prev_tokens).encode("utf-8", errors="ignore") + return text[len(prev_text):] + else: + return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer": + try: + from transformers import AutoTokenizer + except ImportError: + raise ImportError( + "The `transformers` library is required to use the `HFTokenizer`." + "You can install it with `pip install transformers`." + ) + hf_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path) + return cls(hf_tokenizer) + + class LlamaState: def __init__( self, diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 08f991b..2e42041 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4,7 +4,9 @@ import os import json import ctypes import dataclasses -from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, Protocol +import random +import string +from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol import jinja2 @@ -1332,6 +1334,435 @@ def functionary_chat_handler( ) +@register_chat_completion_handler("functionary-v1") +@register_chat_completion_handler("functionary-v2") +def functionary_v1_v2_chat_handler( + llama: llama.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + min_p: float = 0.05, + typical_p: float = 1.0, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, + max_tokens: Optional[int] = None, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[llama.LogitsProcessorList] = None, + grammar: Optional[llama.LlamaGrammar] = None, + **kwargs, # type: ignore +) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: + SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" + + tokenizer = llama.tokenizer_ + assert hasattr(tokenizer, "hf_tokenizer"), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" + from transformers import AutoTokenizer + + if "<|START_OF_FUNCTION_CALL|>" in tokenizer.hf_tokenizer.additional_special_tokens: + version = "v1" + END_SYSTEM_TOKEN = "<|END_OF_SYSTEM|>" + END_USER_TOKEN = "<|END_OF_USER|>" + END_ASSISTANT_TOKEN = "<|END_OF_ASSISTANT|>" + END_FUNCTION_RESULT_TOKEN = "<|END_OF_FUNCTION_RESULT|>" + START_FUNCTION_CALL_TOKEN = "<|START_OF_FUNCTION_CALL|>" + END_FUNCTION_CALL_TOKEN = "<|END_OF_FUNCTION_CALL|>" + else: + version = "v2" + RECIPIENT_TOKEN = "<|recipient|>" + FROM_TOKEN = "<|from|>" + STOP_TOKEN = "<|stop|>" + CONTENT_TOKEN = "<|content|>" + + def generate_type_definition( + param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs + ) -> str: + indent = " " * indent_level + if "$ref" in param: + # Reference to a shared definition + ref_name = param["$ref"].split("/")[ + -1 + ] # Extract the type name from the reference + return ref_name + elif param.get("type") == "array": + items = param.get("items", {}) + item_type = generate_type_definition(items, indent_level + 1, shared_defs) + return f"Array<{item_type}>" + elif param.get("type") == "object": + properties = param.get("properties", {}) + nested_schema = "{\n" + for nested_param_name, nested_param in properties.items(): + nested_param_type = generate_type_definition( + nested_param, indent_level + 1, shared_defs + ) + nested_schema += ( + f"{indent} {nested_param_name}: {nested_param_type},\n" + ) + nested_schema += indent + "}" + return nested_schema + elif "enum" in param: + # Enum type + return " | ".join([f'"{enum_value}"' for enum_value in param["enum"]]) + else: + # Simple type + return param.get("type", "any") + + def generate_shared_definitions(shared_defs, indent_level: int) -> str: + indent = " " * indent_level + shared_definitions = "" + for def_name, def_properties in shared_defs.items(): + shared_definitions += f"{indent}type {def_name} = " + if def_properties.get("type") == "object": + shared_definitions += generate_type_definition( + def_properties, indent_level, shared_defs + ) + elif "enum" in def_properties: + # Enum type + shared_definitions += " | ".join( + [f'"{enum_value}"' for enum_value in def_properties["enum"]] + ) + shared_definitions += ";\n" + return shared_definitions + + def generate_schema_from_functions(functions, namespace="functions") -> str: + schema = ( + "// Supported function definitions that should be called when necessary.\n" + ) + schema += f"namespace {namespace} {{\n\n" + + # Generate shared definitions + shared_definitions = {} + for function in functions: + parameters = function.get("parameters", {}) + shared_definitions.update(parameters.get("$defs", {})) + + schema += generate_shared_definitions(shared_definitions, 1) + + for function in functions: + function_name = function["name"] + description = function.get("description", "") + parameters = function.get("parameters", {}) + required_params = parameters.get("required", []) + + schema += f"// {description}\n" + schema += f"type {function_name} = (_: {{\n" + + for param_name, param in parameters.get("properties", {}).items(): + param_description = param.get("description", "") + param_type = generate_type_definition(param, 2, shared_definitions) + optional_indicator = "" if param_name in required_params else "?" 
+ schema += f"// {param_description}\n" + schema += f"{param_name}{optional_indicator}: {param_type},\n" + schema += "}) => any;\n\n" + + schema += "}} // namespace {}".format(namespace) + return schema + + def prepare_messages_for_inference( + messages: List[llama_types.ChatCompletionRequestMessage], + tokenizer: AutoTokenizer, + version: Literal["v1", "v2"], + functions: Optional[List[llama_types.ChatCompletionFunctions]] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + ): + all_messages: List[llama_types.ChatCompletionRequestMessage] = [] + if functions is not None: + all_messages.append( + llama_types.ChatCompletionRequestSystemMessage( + role="system", content=generate_schema_from_functions(functions) + ) + ) + elif tools is not None: + all_messages.append( + llama_types.ChatCompletionRequestSystemMessage( + role="system", + content=generate_schema_from_functions( + [ + tool["function"] + for tool in tools + if tool["type"] == "function" + ] + ), + ) + ) + + all_messages.append( + llama_types.ChatCompletionRequestSystemMessage( + role="system", content=SYSTEM_MESSAGE + ) + ) + + for message in messages: + # Function call responses + if message["role"] == "function" and "name" in message: + message["name"] = f"functions.{message['name']}" + # Function call requests by assistant + if "function_call" in message: + message["function_call"][ + "name" + ] = f"functions.{message['function_call']['name']}" + all_messages.append(message) + + if version == "v1": + suffix = "assistant:\n" + else: + suffix = "<|from|>assistant\n<|recipient|>" + + return tokenizer.hf_tokenizer.apply_chat_template(all_messages, tokenize=False) + suffix + + if tools is not None: + functions = [tool["function"] for tool in tools if tool["type"] == "function"] + + if tool_choice is not None: + function_call = ( + tool_choice if isinstance(tool_choice, str) else tool_choice["function"] + ) + + prompt = prepare_messages_for_inference(messages, tokenizer, version, functions, tools) + + # If no tools/functions are provided + if function_call is None and (functions is None or len(functions) == 0): + if version == "v1": + stop = END_ASSISTANT_TOKEN + else: + stop = STOP_TOKEN + prompt += "all\n<|content|>" + + completion_or_completion_chunks = llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=stop, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + ) + return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore + + assert stream is False # TODO: support stream mode + + def get_grammar(function_call): + function_body = None + for function in functions or []: + if function["name"] == function_call: + function_body = function["parameters"] + break + for tool in tools or []: + if tool["type"] == "function" and tool["function"]["name"] == function_call: + function_body = tool["function"]["parameters"] + break + + try: + with suppress_stdout_stderr(disable=llama.verbose): + grammar_text = llama_grammar.json_schema_to_gbnf( + json.dumps(function_body) + ) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.json_schema_to_gbnf(json.dumps(function_body)) + ) + print(grammar_text) + except 
Exception as e: + if llama.verbose: + print( + "Failed to parse function body as JSON schema, falling back to default grammar" + ) + print(e) + with suppress_stdout_stderr(disable=llama.verbose): + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF + ) + + return grammar + + def create_completion(stop): + completion: llama_types.Completion = llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=stop, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + ) + + return completion + + function_calls, function_bodies = [], [] + + if version == "v1": + # If no or "auto" tool_choice/function_call + if function_call is None or ( + isinstance(function_call, str) and function_call == "auto" + ): + stops = ["\n", END_ASSISTANT_TOKEN] + # If tool_choice/function_call is "none" + elif isinstance(function_call, str) and function_call == "none": + prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + stops = END_ASSISTANT_TOKEN + # If tool_choice/function_call is provided + elif isinstance(function_call, dict): + prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n" + stops = END_FUNCTION_CALL_TOKEN + function_call = function_call["name"] + function_calls.append(function_call) + grammar = get_grammar(function_call) + else: + prompt = prompt + stops = ["\n", END_ASSISTANT_TOKEN] + + completion = create_completion(stop=stops) + completion_text = completion["choices"][0]["text"] + + # If the generation does not involve a function call + if START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN not in completion_text: + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + # If the generation involves a function call in completion, generate the parameters + elif START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN in completion_text: + prompt += completion_text.replace(f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN) + "\n" + function_calls.append(completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()) + grammar = get_grammar(function_calls[-1]) + completion = create_completion(stop=END_FUNCTION_CALL_TOKEN) + function_bodies.append(completion["choices"][0]["text"].strip()) + # If the prompt involves a function call, just append generated parameters to function_bodies + else: + function_bodies.append(completion_text.strip()) + else: + # Loop until all parallel function calls are generated + while True: + # If no or "auto" tool_choice/function_call + if function_call is None or ( + isinstance(function_call, str) and function_call == "auto" + ): + grammar = None + stops = CONTENT_TOKEN + # If tool_choice/function_call is "none" + elif isinstance(function_call, str) and function_call == "none": + prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + "all\n<|content|>" + stops = STOP_TOKEN + # If tool_choice/function_call is provided + elif isinstance(function_call, dict): + prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" + stops = STOP_TOKEN + function_call = function_call["name"] + function_calls.append(function_call) + grammar = get_grammar(function_call) + else: + prompt = prompt 
+ stops = STOP_TOKEN + + completion = create_completion(stop=stops) + completion_text = completion["choices"][0]["text"] + + # If the generation does not involve a function call + if prompt.endswith("all\n<|content|>") and not completion_text.startswith("all"): + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + # Generate model response if the model decides not to call any function + elif (prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all")): + prompt += completion_text + CONTENT_TOKEN + completion = create_completion(stop=STOP_TOKEN) + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + # Generate parameters if model decides to call a function + elif prompt.endswith(RECIPIENT_TOKEN): + function_calls.append(completion_text[:-1]) + grammar = get_grammar(function_calls[-1]) + completion = create_completion(stop=[STOP_TOKEN, "\n"]) + function_bodies.append(completion["choices"][0]["text"].strip()) + prompt += f"{function_calls[-1]}\n{CONTENT_TOKEN}{function_bodies[-1]}" + grammar = None + + # Try to generate the beginning of next turn + # If empty completion, break from loop + next_turn_completion_text = create_completion( + stop=[STOP_TOKEN, RECIPIENT_TOKEN] + )["choices"][0]["text"] + if len(next_turn_completion_text) > 0: + prompt += f"\n{FROM_TOKEN}assistant\n{RECIPIENT_TOKEN}" + else: + break + # Break from loop if tool_choice/function_call is provided as a dict + else: + function_bodies.append(completion_text.strip()) + break + + assert "usage" in completion + assert len(function_calls) > 0 + assert len(function_calls) == len(function_bodies) + + tool_calls = [] + for function_call, function_body in zip(function_calls, function_bodies): + tool_calls.append( + { + "id": "call_" + "".join( + [random.choice(string.ascii_letters + string.digits) for _ in range(24)] + ), + "type": "function", + "function": { + "name": function_call, + "arguments": function_body, + }, + } + ) + + # TODO: support stream mode + return llama_types.CreateChatCompletionResponse( + id="chat" + completion["id"], + object="chat.completion", + created=completion["created"], + model=completion["model"], + choices=[ + { + "index": 0, + "message": { + "role": "assistant", + "content": None, + "function_call": { + "name": tool_calls[0]["function"]["name"], + "arguments": tool_calls[0]["function"]["arguments"], + }, + "tool_calls": tool_calls, + }, + "finish_reason": "tool_calls", + } + ], + usage=completion["usage"], + ) + + class Llava15ChatHandler: _clip_free = None diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 925ab99..6d8ec24 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -93,6 +93,10 @@ class LlamaProxy: ) ) + tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None + if settings.hf_pretrained_model_name_or_path is not None: + tokenizer = llama_cpp.LlamaHFTokenizer.from_pretrained(settings.hf_pretrained_model_name_or_path) + draft_model = None if settings.draft_model is not None: draft_model = llama_speculative.LlamaPromptLookupDecoding( @@ -156,6 +160,8 @@ class LlamaProxy: chat_handler=chat_handler, # Speculative Decoding draft_model=draft_model, + # Tokenizer + tokenizer=tokenizer, # Misc verbose=settings.verbose, ) From 2ef7ba3aed572609fbf7292adb125e41e5279a15 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Feb 2024 01:07:44 -0500 Subject: [PATCH 05/38] misc: rename grammar test --- tests/{test_grammar.py => test_llama_grammar.py} | 0 1 file changed, 0 insertions(+), 0 
deletions(-) rename tests/{test_grammar.py => test_llama_grammar.py} (100%) diff --git a/tests/test_grammar.py b/tests/test_llama_grammar.py similarity index 100% rename from tests/test_grammar.py rename to tests/test_llama_grammar.py From b5fca911b57a23565c55c31802fb9603a0c6497c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Feb 2024 01:08:18 -0500 Subject: [PATCH 06/38] feat: Move tokenizer to own module --- llama_cpp/llama.py | 69 ++------------------------ llama_cpp/llama_tokenizer.py | 96 ++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 65 deletions(-) create mode 100644 llama_cpp/llama_tokenizer.py diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index bad75df..30ae3b5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2,7 +2,6 @@ from __future__ import annotations import os import sys -import abc import uuid import time import multiprocessing @@ -15,7 +14,6 @@ from typing import ( Iterator, Deque, Callable, - Any, ) from collections import deque @@ -31,6 +29,10 @@ from .llama_cache import ( LlamaDiskCache, # type: ignore LlamaRAMCache, # type: ignore ) +from .llama_tokenizer import ( + BaseLlamaTokenizer, + LlamaTokenizer +) import llama_cpp.llama_cpp as llama_cpp import llama_cpp.llama_chat_format as llama_chat_format @@ -1747,69 +1749,6 @@ class Llama: return longest_prefix -class BaseLlamaTokenizer(abc.ABC): - @abc.abstractmethod - def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: - raise NotImplementedError - - @abc.abstractmethod - def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: - raise NotImplementedError - - -class LlamaTokenizer(BaseLlamaTokenizer): - def __init__(self, llama: Llama): - self.llama = llama - self._model = llama._model # type: ignore - - def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: - return self._model.tokenize(text, add_bos=add_bos, special=special) - - def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: - if prev_tokens is not None: - return self._model.detokenize(tokens[len(prev_tokens):]) - else: - return self._model.detokenize(tokens) - - def encode(self, text: str, add_bos: bool = True, special: bool = True) -> List[int]: - return self.tokenize( - text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special - ) - - def decode(self, tokens: List[int]) -> str: - return self.detokenize(tokens).decode("utf-8", errors="ignore") - - @classmethod - def from_ggml_file(cls, path: str) -> "LlamaTokenizer": - return cls(Llama(model_path=path, vocab_only=True)) - - -class LlamaHFTokenizer(BaseLlamaTokenizer): - def __init__(self, hf_tokenizer: Any): - self.hf_tokenizer = hf_tokenizer - - def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: - return self.hf_tokenizer.encode(text.decode("utf-8", errors="ignore"), add_special_tokens=special) - - def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: - if prev_tokens is not None: - text = self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") - prev_text = self.hf_tokenizer.decode(prev_tokens).encode("utf-8", errors="ignore") - return text[len(prev_text):] - else: - return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer": - try: - from transformers import AutoTokenizer - except ImportError: - raise 
ImportError( - "The `transformers` library is required to use the `HFTokenizer`." - "You can install it with `pip install transformers`." - ) - hf_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path) - return cls(hf_tokenizer) class LlamaState: diff --git a/llama_cpp/llama_tokenizer.py b/llama_cpp/llama_tokenizer.py new file mode 100644 index 0000000..0ad3c3a --- /dev/null +++ b/llama_cpp/llama_tokenizer.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +import abc +from typing import ( + List, + Optional, + Any, +) + +import llama_cpp +from llama_cpp.llama_types import List + + +class BaseLlamaTokenizer(abc.ABC): + @abc.abstractmethod + def tokenize( + self, text: bytes, add_bos: bool = True, special: bool = True + ) -> List[int]: + raise NotImplementedError + + @abc.abstractmethod + def detokenize( + self, tokens: List[int], prev_tokens: Optional[List[int]] = None + ) -> bytes: + raise NotImplementedError + + +class LlamaTokenizer(BaseLlamaTokenizer): + def __init__(self, llama: llama_cpp.Llama): + self.llama = llama + self._model = llama._model # type: ignore + + def tokenize( + self, text: bytes, add_bos: bool = True, special: bool = True + ) -> List[int]: + return self._model.tokenize(text, add_bos=add_bos, special=special) + + def detokenize( + self, tokens: List[int], prev_tokens: Optional[List[int]] = None + ) -> bytes: + if prev_tokens is not None: + return self._model.detokenize(tokens[len(prev_tokens) :]) + else: + return self._model.detokenize(tokens) + + def encode( + self, text: str, add_bos: bool = True, special: bool = True + ) -> List[int]: + return self.tokenize( + text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special + ) + + def decode(self, tokens: List[int]) -> str: + return self.detokenize(tokens).decode("utf-8", errors="ignore") + + @classmethod + def from_ggml_file(cls, path: str) -> "LlamaTokenizer": + return cls(llama_cpp.Llama(model_path=path, vocab_only=True)) + + +class LlamaHFTokenizer(BaseLlamaTokenizer): + def __init__(self, hf_tokenizer: Any): + self.hf_tokenizer = hf_tokenizer + + def tokenize( + self, text: bytes, add_bos: bool = True, special: bool = True + ) -> List[int]: + return self.hf_tokenizer.encode( + text.decode("utf-8", errors="ignore"), add_special_tokens=special + ) + + def detokenize( + self, tokens: List[int], prev_tokens: Optional[List[int]] = None + ) -> bytes: + if prev_tokens is not None: + text = self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") + prev_text = self.hf_tokenizer.decode(prev_tokens).encode( + "utf-8", errors="ignore" + ) + return text[len(prev_text) :] + else: + return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer": + try: + from transformers import AutoTokenizer + except ImportError: + raise ImportError( + "The `transformers` library is required to use the `HFTokenizer`." + "You can install it with `pip install transformers`." 
+ ) + hf_tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path + ) + return cls(hf_tokenizer) From 85d3374b4d5892e51e27b9973f9ce3623e076e2a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Feb 2024 01:13:28 -0500 Subject: [PATCH 07/38] fix: broken import --- llama_cpp/server/model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 6d8ec24..5308dc2 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -6,6 +6,7 @@ from typing import Dict, Optional, Union, List import llama_cpp import llama_cpp.llama_speculative as llama_speculative +import llama_cpp.llama_tokenizer as llama_tokenizer from llama_cpp.server.settings import ModelSettings @@ -95,7 +96,7 @@ class LlamaProxy: tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None if settings.hf_pretrained_model_name_or_path is not None: - tokenizer = llama_cpp.LlamaHFTokenizer.from_pretrained(settings.hf_pretrained_model_name_or_path) + tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained(settings.hf_pretrained_model_name_or_path) draft_model = None if settings.draft_model is not None: From dfc1b173414b550f8f5be1b94430af16b53a63cb Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Feb 2024 23:38:12 -0500 Subject: [PATCH 08/38] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 213d143..8e6a9d2 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 213d1439fadefe182f69c5f7e8dd3b4b6572ebcb +Subproject commit 8e6a9d2de0096af7120606c74ee2f26684e87b41 From e16f06e6eb555947f4404c20732921c8ea76c4f7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 9 Feb 2024 02:02:13 -0500 Subject: [PATCH 09/38] fix: revert _create_completions. 
--- llama_cpp/llama.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index bad75df..f445fb0 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -948,8 +948,7 @@ class Llama: if stream: remaining_tokens = completion_tokens[returned_tokens:] - prev_tokens = completion_tokens[:returned_tokens] - remaining_text = self.detokenize(completion_tokens, prev_tokens) + remaining_text = self.detokenize(remaining_tokens) remaining_length = len(remaining_text) # We want to avoid yielding any characters from @@ -971,13 +970,13 @@ class Llama: for token in remaining_tokens: if token == self.token_bos(): continue - token_end_position += len(remaining_text) + token_end_position += len(self.detokenize([token])) # Check if stop sequence is in the token if token_end_position > ( remaining_length - first_stop_position ): break - token_str = remaining_text.decode( + token_str = self.detokenize([token]).decode( "utf-8", errors="ignore" ) text_offset = len(prompt) + len( @@ -1002,7 +1001,11 @@ class Llama: } top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { - "tokens": [token_str], + "tokens": [ + self.detokenize([token]).decode( + "utf-8", errors="ignore" + ) + ], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], "top_logprobs": [top_logprob], @@ -1015,7 +1018,9 @@ class Llama: "model": model_name, "choices": [ { - "text": token_str, + "text": self.detokenize([token]).decode( + "utf-8", errors="ignore" + ), "index": 0, "logprobs": logprobs_or_none, "finish_reason": None, @@ -1027,7 +1032,7 @@ class Llama: decode_success = False for i in range(1, len(remaining_tokens) + 1): try: - bs = remaining_text + bs = self.detokenize(remaining_tokens[:i]) ts = bs.decode("utf-8") decode_success = True break @@ -1063,7 +1068,6 @@ class Llama: if len(completion_tokens) >= max_tokens: text = self.detokenize(completion_tokens) - finish_reason = "length" break From 63b0c37836169baa71c04484e5344294928bd359 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 9 Feb 2024 13:36:58 -0500 Subject: [PATCH 10/38] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8e6a9d2..4b7b38b 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8e6a9d2de0096af7120606c74ee2f26684e87b41 +Subproject commit 4b7b38bef5addbd31f453871d79647fbae6bec8a From 19b55ad3e55cc707938b191ab7779f5fd69cd0c6 Mon Sep 17 00:00:00 2001 From: Douglas Hanley Date: Sun, 11 Feb 2024 12:53:59 -0600 Subject: [PATCH 11/38] feat: use gpu backend for clip if available (#1175) --- CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 795dad7..b4df8ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,14 @@ if (LLAMA_BUILD) ) if (LLAVA_BUILD) + if (LLAMA_CUBLAS) + add_compile_definitions(GGML_USE_CUBLAS) + endif() + + if (LLAMA_METAL) + add_compile_definitions(GGML_USE_METAL) + endif() + # Building llava add_subdirectory(vendor/llama.cpp/examples/llava) set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava") From 918ff27e501f621ab7d511a9c71c0783d870082c Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 12 Feb 2024 00:25:15 +0530 Subject: [PATCH 12/38] docs: Set the correct command for compiling with syscl support (#1172) --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md 
index bddef64..59a7cd4 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,8 @@ CMAKE_ARGS="-DLLAMA_KOMPUTE=on" pip install llama-cpp-python To install with SYCL support, set the `LLAMA_SYCL=on` environment variable before installing: ```bash -CMAKE_ARGS="-DLLAMA_SYCL=on" pip install llama-cpp-python +source /opt/intel/oneapi/setvars.sh +CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pip install llama-cpp-python ``` ### Windows Notes From a05d90446fea83426fe8dc0d6c78afe6c3dd0894 Mon Sep 17 00:00:00 2001 From: Connor Date: Sun, 11 Feb 2024 10:57:57 -0800 Subject: [PATCH 13/38] fix: Circular dependancy preventing early Llama object free (#1176) commit 901827013b732d74f1f67033062d13a6204a62bd introduced a cyclic dependency within Llama objects. That change causes old models to linger in memory longer than necessary, thereby creating memory bloat in most applications attempting to switch between models at runtime. This patch simply removes the problematic line, allowing models to deallocate without relying on GC. One might also consider combining `weakref.ref` with a `@property` if the `llama` attribute is absolutely necessary to expose in the tokenizer class. --- llama_cpp/llama_tokenizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_cpp/llama_tokenizer.py b/llama_cpp/llama_tokenizer.py index 0ad3c3a..c2aad47 100644 --- a/llama_cpp/llama_tokenizer.py +++ b/llama_cpp/llama_tokenizer.py @@ -27,7 +27,6 @@ class BaseLlamaTokenizer(abc.ABC): class LlamaTokenizer(BaseLlamaTokenizer): def __init__(self, llama: llama_cpp.Llama): - self.llama = llama self._model = llama._model # type: ignore def tokenize( From 936867063984dc695be71eab21115f183dc4d33b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 11 Feb 2024 14:02:46 -0500 Subject: [PATCH 14/38] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4b7b38b..97a3365 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4b7b38bef5addbd31f453871d79647fbae6bec8a +Subproject commit 97a336507ed9b971d72262bec7e2b8b7016a054a From 69413ce08e7119b0b10ddb1a52eb8eaa2a865f7f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 11 Feb 2024 19:00:17 -0500 Subject: [PATCH 15/38] Update llama.cpp --- llama_cpp/llama_cpp.py | 2 ++ vendor/llama.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index da2a7f3..2724edd 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -139,9 +139,11 @@ llama_seq_id = c_int32 # enum llama_vocab_type { # LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece # LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding +# LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece # }; LLAMA_VOCAB_TYPE_SPM = 0 LLAMA_VOCAB_TYPE_BPE = 1 +LLAMA_VOCAB_TYPE_WPM = 2 # enum llama_token_type { diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 97a3365..3bdc4cd 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 97a336507ed9b971d72262bec7e2b8b7016a054a +Subproject commit 3bdc4cd0f595a6096cca4a64aa75ffa8a3503465 From 153a0049d90329dec2fc44628fdc6fc8c5f31ae4 Mon Sep 17 00:00:00 2001 From: Andrei Date: Mon, 12 Feb 2024 15:56:07 -0500 Subject: [PATCH 16/38] feat: Generic chatml Function Calling (#957) * Add demo notebook * Add initial chat handler * Update OpenAI types * Add generic chatml function calling (wip) * Update chatml generic function calling. 
* Progress on auto-tool calls * fix streaming functions * Remove print statements * fix: Suppress output from llama.cpp init and grammar creation * Add OpenAI v1 python api compatible chat completion function * Support non-streaming multi-tool calls * Format * Include function_call in response. --- .../notebooks/OpenHermesFunctionCalling.ipynb | 910 ++++++++++++++++++ llama_cpp/llama.py | 38 +- llama_cpp/llama_chat_format.py | 761 +++++++++++++-- llama_cpp/llama_types.py | 11 +- 4 files changed, 1660 insertions(+), 60 deletions(-) create mode 100644 examples/notebooks/OpenHermesFunctionCalling.ipynb diff --git a/examples/notebooks/OpenHermesFunctionCalling.ipynb b/examples/notebooks/OpenHermesFunctionCalling.ipynb new file mode 100644 index 0000000..c0de3fd --- /dev/null +++ b/examples/notebooks/OpenHermesFunctionCalling.ipynb @@ -0,0 +1,910 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"name\": \"get_article_details\",\n", + " \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"title\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"authors\": {\n", + " \"type\": \"list[str]\"\n", + " },\n", + " \"short_summary\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"date_published\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"tags\": {\n", + " \"type\": \"list[str]\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"Article\"\n", + "}\n" + ] + } + ], + "source": [ + "import json\n", + "import inspect\n", + "from typing import get_type_hints\n", + "\n", + "class Article:\n", + " pass\n", + "\n", + "class Weather:\n", + " pass\n", + "\n", + "class Directions:\n", + " pass\n", + "\n", + "def calculate_mortgage_payment(loan_amount: int, interest_rate: float, loan_term: int) -> float:\n", + " \"\"\"Get the monthly mortgage payment given an interest rate percentage.\"\"\"\n", + " \n", + " # TODO: you must implement this to actually call it later\n", + " pass\n", + "\n", + "def get_article_details(title: str, authors: list[str], short_summary: str, date_published: str, tags: list[str]) -> Article:\n", + " '''Get article details from unstructured article text.\n", + "date_published: formatted as \"MM/DD/YYYY\"'''\n", + " \n", + " # TODO: you must implement this to actually call it later\n", + " pass\n", + "\n", + "def get_weather(zip_code: str) -> Weather:\n", + " \"\"\"Get the current weather given a zip code.\"\"\"\n", + " \n", + " # TODO: you must implement this to actually call it later\n", + " pass\n", + "\n", + "def get_directions(start: str, destination: str) -> Directions:\n", + " \"\"\"Get directions from Google Directions API.\n", + "start: start address as a string including zipcode (if any)\n", + "destination: end address as a string including zipcode (if any)\"\"\"\n", + " \n", + " # TODO: you must implement this to actually call it later\n", + " pass\n", + "\n", + "def get_type_name(t):\n", + " name = str(t)\n", + " if \"list\" in name or \"dict\" in name:\n", + " return name\n", + " else:\n", + " return t.__name__\n", + "\n", + "def serialize_function_to_json(func):\n", + " signature = inspect.signature(func)\n", + " type_hints = get_type_hints(func)\n", + "\n", + " function_info = {\n", + " \"name\": func.__name__,\n", + " \"description\": func.__doc__,\n", + " \"parameters\": 
{\n", + " \"type\": \"object\",\n", + " \"properties\": {}\n", + " },\n", + " \"returns\": type_hints.get('return', 'void').__name__\n", + " }\n", + "\n", + " for name, _ in signature.parameters.items():\n", + " param_type = get_type_name(type_hints.get(name, type(None)))\n", + " function_info[\"parameters\"][\"properties\"][name] = {\"type\": param_type}\n", + "\n", + " return json.dumps(function_info, indent=2)\n", + "\n", + "print(serialize_function_to_json(get_article_details))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import xml.etree.ElementTree as ET\n", + "import re\n", + "\n", + "def extract_function_calls(completion):\n", + " completion = completion.strip()\n", + " pattern = r\"((.*?))\"\n", + " match = re.search(pattern, completion, re.DOTALL)\n", + " if not match:\n", + " return None\n", + " \n", + " multiplefn = match.group(1)\n", + " root = ET.fromstring(multiplefn)\n", + " functions = root.findall(\"functioncall\")\n", + " return [json.loads(fn.text) for fn in functions]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_hermes_prompt(prompt, functions):\n", + " functions = \"\\n\\n\".join([serialize_function_to_json(fn) for fn in functions])\n", + " prompt = f\"\"\"<|im_start|>system\n", + "You are a helpful assistant with access to the following functions:\n", + "\n", + "{functions}\n", + "\n", + "To use these functions respond with:\n", + "\n", + " {{\"name\": \"function_name\", \"arguments\": {{\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}}}} \n", + " {{\"name\": \"function_name\", \"arguments\": {{\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}}}} \n", + " ...\n", + "\n", + "\n", + "Edge cases you must handle:\n", + "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n", + "<|im_start|>user\n", + "{prompt}<|im_end|>\n", + "<|im_start|>assistant\"\"\"\n", + " return prompt" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<|im_start|>system\n", + "You are a helpful assistant with access to the following functions:\n", + "\n", + "{\n", + " \"name\": \"get_weather\",\n", + " \"description\": \"Get the current weather given a zip code.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"zip_code\": {\n", + " \"type\": \"str\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"Weather\"\n", + "}\n", + "\n", + "{\n", + " \"name\": \"calculate_mortgage_payment\",\n", + " \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"loan_amount\": {\n", + " \"type\": \"int\"\n", + " },\n", + " \"interest_rate\": {\n", + " \"type\": \"float\"\n", + " },\n", + " \"loan_term\": {\n", + " \"type\": \"int\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"float\"\n", + "}\n", + "\n", + "{\n", + " \"name\": \"get_article_details\",\n", + " \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"title\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"authors\": {\n", + " \"type\": \"list[str]\"\n", + " },\n", + " \"short_summary\": {\n", 
+ " \"type\": \"str\"\n", + " },\n", + " \"date_published\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"tags\": {\n", + " \"type\": \"list[str]\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"Article\"\n", + "}\n", + "\n", + "To use these functions respond with:\n", + "\n", + " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n", + " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n", + " ...\n", + "\n", + "\n", + "Edge cases you must handle:\n", + "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n", + "<|im_start|>user\n", + "What's the weather in 10001?<|im_end|>\n", + "<|im_start|>assistant\n", + "<|im_start|>system\n", + "You are a helpful assistant with access to the following functions:\n", + "\n", + "{\n", + " \"name\": \"get_weather\",\n", + " \"description\": \"Get the current weather given a zip code.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"zip_code\": {\n", + " \"type\": \"str\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"Weather\"\n", + "}\n", + "\n", + "{\n", + " \"name\": \"calculate_mortgage_payment\",\n", + " \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"loan_amount\": {\n", + " \"type\": \"int\"\n", + " },\n", + " \"interest_rate\": {\n", + " \"type\": \"float\"\n", + " },\n", + " \"loan_term\": {\n", + " \"type\": \"int\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"float\"\n", + "}\n", + "\n", + "{\n", + " \"name\": \"get_article_details\",\n", + " \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"title\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"authors\": {\n", + " \"type\": \"list[str]\"\n", + " },\n", + " \"short_summary\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"date_published\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"tags\": {\n", + " \"type\": \"list[str]\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"Article\"\n", + "}\n", + "\n", + "To use these functions respond with:\n", + "\n", + " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n", + " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n", + " ...\n", + "\n", + "\n", + "Edge cases you must handle:\n", + "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n", + "<|im_start|>user\n", + "Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.<|im_end|>\n", + "<|im_start|>assistant\n", + "<|im_start|>system\n", + "You are a helpful assistant with access to the following functions:\n", + "\n", + "{\n", + " \"name\": \"get_weather\",\n", + " \"description\": \"Get the current weather given a zip code.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"zip_code\": {\n", + " \"type\": \"str\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"Weather\"\n", + "}\n", + "\n", + "{\n", + " \"name\": \"calculate_mortgage_payment\",\n", + " \"description\": \"Get the monthly 
mortgage payment given an interest rate percentage.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"loan_amount\": {\n", + " \"type\": \"int\"\n", + " },\n", + " \"interest_rate\": {\n", + " \"type\": \"float\"\n", + " },\n", + " \"loan_term\": {\n", + " \"type\": \"int\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"float\"\n", + "}\n", + "\n", + "{\n", + " \"name\": \"get_article_details\",\n", + " \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"title\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"authors\": {\n", + " \"type\": \"list[str]\"\n", + " },\n", + " \"short_summary\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"date_published\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"tags\": {\n", + " \"type\": \"list[str]\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"Article\"\n", + "}\n", + "\n", + "To use these functions respond with:\n", + "\n", + " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n", + " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n", + " ...\n", + "\n", + "\n", + "Edge cases you must handle:\n", + "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n", + "<|im_start|>user\n", + "What's the current exchange rate for USD to EUR?<|im_end|>\n", + "<|im_start|>assistant\n" + ] + } + ], + "source": [ + "prompts = [\n", + " \"What's the weather in 10001?\",\n", + " \"Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.\",\n", + " \"What's the current exchange rate for USD to EUR?\"\n", + "]\n", + "functions = [get_weather, calculate_mortgage_payment, get_article_details]\n", + "\n", + "for prompt in prompts:\n", + " print(generate_hermes_prompt(prompt, functions))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ggml_init_cublas: GGML_CUDA_FORCE_MMQ: no\n", + "ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes\n", + "ggml_init_cublas: found 1 CUDA devices:\n", + " Device 0: NVIDIA GeForce RTX 2060, compute capability 7.5\n", + "llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from ../../models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf (version GGUF V3 (latest))\n", + "llama_model_loader: - tensor 0: token_embd.weight q4_K [ 4096, 32002, 1, 1 ]\n", + "llama_model_loader: - tensor 1: blk.0.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 2: blk.0.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 3: blk.0.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 4: blk.0.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 5: blk.0.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 6: blk.0.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 7: blk.0.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 8: blk.0.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 9: blk.0.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 10: 
blk.1.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 11: blk.1.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 12: blk.1.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 13: blk.1.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 14: blk.1.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 15: blk.1.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 16: blk.1.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 17: blk.1.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 18: blk.1.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 19: blk.2.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 20: blk.2.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 21: blk.2.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 22: blk.2.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 23: blk.2.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 24: blk.2.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 25: blk.2.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 26: blk.2.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 27: blk.2.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 28: blk.3.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 29: blk.3.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 30: blk.3.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 31: blk.3.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 32: blk.3.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 33: blk.3.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 34: blk.3.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 35: blk.3.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 36: blk.3.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 37: blk.4.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 38: blk.4.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 39: blk.4.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 40: blk.4.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 41: blk.4.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 42: blk.4.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 43: blk.4.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 44: blk.4.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 45: blk.4.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 46: blk.5.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 47: blk.5.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 48: blk.5.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 49: blk.5.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 50: blk.5.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + 
"llama_model_loader: - tensor 51: blk.5.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 52: blk.5.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 53: blk.5.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 54: blk.5.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 55: blk.6.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 56: blk.6.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 57: blk.6.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 58: blk.6.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 59: blk.6.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 60: blk.6.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 61: blk.6.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 62: blk.6.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 63: blk.6.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 64: blk.7.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 65: blk.7.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 66: blk.7.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 67: blk.7.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 68: blk.7.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 69: blk.7.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 70: blk.7.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 71: blk.7.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 72: blk.7.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 73: blk.8.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 74: blk.8.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 75: blk.8.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 76: blk.8.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 77: blk.8.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 78: blk.8.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 79: blk.8.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 80: blk.8.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 81: blk.8.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 82: blk.9.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 83: blk.9.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 84: blk.9.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 85: blk.9.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 86: blk.9.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 87: blk.9.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 88: blk.9.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 89: blk.9.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 90: blk.9.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 91: blk.10.attn_q.weight q4_K [ 4096, 4096, 1, 
1 ]\n", + "llama_model_loader: - tensor 92: blk.10.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 93: blk.10.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 94: blk.10.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 95: blk.10.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 96: blk.10.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 97: blk.10.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 98: blk.10.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 99: blk.10.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 100: blk.11.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 101: blk.11.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 102: blk.11.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 103: blk.11.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 104: blk.11.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 105: blk.11.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 106: blk.11.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 107: blk.11.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 108: blk.11.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 109: blk.12.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 110: blk.12.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 111: blk.12.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 112: blk.12.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 113: blk.12.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 114: blk.12.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 115: blk.12.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 116: blk.12.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 117: blk.12.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 118: blk.13.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 119: blk.13.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 120: blk.13.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 121: blk.13.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 122: blk.13.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 123: blk.13.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 124: blk.13.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 125: blk.13.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 126: blk.13.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 127: blk.14.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 128: blk.14.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 129: blk.14.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 130: blk.14.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 131: blk.14.ffn_gate.weight q4_K [ 4096, 14336, 
1, 1 ]\n", + "llama_model_loader: - tensor 132: blk.14.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 133: blk.14.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 134: blk.14.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 135: blk.14.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 136: blk.15.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 137: blk.15.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 138: blk.15.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 139: blk.15.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 140: blk.15.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 141: blk.15.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 142: blk.15.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 143: blk.15.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 144: blk.15.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 145: blk.16.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 146: blk.16.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 147: blk.16.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 148: blk.16.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 149: blk.16.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 150: blk.16.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 151: blk.16.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 152: blk.16.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 153: blk.16.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 154: blk.17.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 155: blk.17.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 156: blk.17.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 157: blk.17.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 158: blk.17.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 159: blk.17.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 160: blk.17.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 161: blk.17.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 162: blk.17.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 163: blk.18.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 164: blk.18.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 165: blk.18.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 166: blk.18.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 167: blk.18.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 168: blk.18.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 169: blk.18.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 170: blk.18.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 171: blk.18.ffn_norm.weight f32 [ 4096, 
1, 1, 1 ]\n", + "llama_model_loader: - tensor 172: blk.19.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 173: blk.19.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 174: blk.19.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 175: blk.19.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 176: blk.19.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 177: blk.19.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 178: blk.19.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 179: blk.19.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 180: blk.19.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 181: blk.20.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 182: blk.20.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 183: blk.20.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 184: blk.20.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 185: blk.20.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 186: blk.20.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 187: blk.20.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 188: blk.20.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 189: blk.20.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 190: blk.21.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 191: blk.21.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 192: blk.21.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 193: blk.21.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 194: blk.21.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 195: blk.21.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 196: blk.21.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 197: blk.21.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 198: blk.21.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 199: blk.22.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 200: blk.22.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 201: blk.22.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 202: blk.22.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 203: blk.22.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 204: blk.22.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 205: blk.22.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 206: blk.22.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 207: blk.22.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 208: blk.23.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 209: blk.23.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 210: blk.23.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 211: blk.23.attn_output.weight q4_K [ 
4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 212: blk.23.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 213: blk.23.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 214: blk.23.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 215: blk.23.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 216: blk.23.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 217: blk.24.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 218: blk.24.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 219: blk.24.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 220: blk.24.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 221: blk.24.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 222: blk.24.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 223: blk.24.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 224: blk.24.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 225: blk.24.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 226: blk.25.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 227: blk.25.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 228: blk.25.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 229: blk.25.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 230: blk.25.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 231: blk.25.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 232: blk.25.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 233: blk.25.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 234: blk.25.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 235: blk.26.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 236: blk.26.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 237: blk.26.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 238: blk.26.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 239: blk.26.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 240: blk.26.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 241: blk.26.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 242: blk.26.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 243: blk.26.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 244: blk.27.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 245: blk.27.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 246: blk.27.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 247: blk.27.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 248: blk.27.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 249: blk.27.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 250: blk.27.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 251: 
blk.27.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 252: blk.27.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 253: blk.28.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 254: blk.28.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 255: blk.28.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 256: blk.28.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 257: blk.28.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 258: blk.28.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 259: blk.28.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 260: blk.28.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 261: blk.28.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 262: blk.29.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 263: blk.29.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 264: blk.29.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 265: blk.29.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 266: blk.29.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 267: blk.29.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 268: blk.29.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 269: blk.29.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 270: blk.29.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 271: blk.30.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 272: blk.30.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 273: blk.30.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 274: blk.30.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 275: blk.30.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 276: blk.30.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 277: blk.30.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 278: blk.30.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 279: blk.30.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 280: blk.31.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 281: blk.31.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 282: blk.31.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 283: blk.31.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 284: blk.31.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 285: blk.31.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 286: blk.31.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 287: blk.31.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 288: blk.31.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 289: output_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 290: output.weight q6_K [ 4096, 32002, 1, 1 ]\n", + "llama_model_loader: - kv 0: 
general.architecture str = llama\n", + "llama_model_loader: - kv 1: general.name str = teknium_openhermes-2.5-mistral-7b\n", + "llama_model_loader: - kv 2: llama.context_length u32 = 32768\n", + "llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n", + "llama_model_loader: - kv 4: llama.block_count u32 = 32\n", + "llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336\n", + "llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n", + "llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n", + "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8\n", + "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n", + "llama_model_loader: - kv 10: llama.rope.freq_base f32 = 10000.000000\n", + "llama_model_loader: - kv 11: general.file_type u32 = 15\n", + "llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n", + "llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32002] = [\"\", \"\", \"\", \"<0x00>\", \"<...\n", + "llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32002] = [0.000000, 0.000000, 0.000000, 0.0000...\n", + "llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32002] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n", + "llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n", + "llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 32000\n", + "llama_model_loader: - kv 18: tokenizer.ggml.padding_token_id u32 = 0\n", + "llama_model_loader: - kv 19: general.quantization_version u32 = 2\n", + "llama_model_loader: - type f32: 65 tensors\n", + "llama_model_loader: - type q4_K: 193 tensors\n", + "llama_model_loader: - type q6_K: 33 tensors\n", + "llm_load_vocab: special tokens definition check successful ( 261/32002 ).\n", + "llm_load_print_meta: format = GGUF V3 (latest)\n", + "llm_load_print_meta: arch = llama\n", + "llm_load_print_meta: vocab type = SPM\n", + "llm_load_print_meta: n_vocab = 32002\n", + "llm_load_print_meta: n_merges = 0\n", + "llm_load_print_meta: n_ctx_train = 32768\n", + "llm_load_print_meta: n_embd = 4096\n", + "llm_load_print_meta: n_head = 32\n", + "llm_load_print_meta: n_head_kv = 8\n", + "llm_load_print_meta: n_layer = 32\n", + "llm_load_print_meta: n_rot = 128\n", + "llm_load_print_meta: n_gqa = 4\n", + "llm_load_print_meta: f_norm_eps = 0.0e+00\n", + "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n", + "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n", + "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n", + "llm_load_print_meta: n_ff = 14336\n", + "llm_load_print_meta: rope scaling = linear\n", + "llm_load_print_meta: freq_base_train = 10000.0\n", + "llm_load_print_meta: freq_scale_train = 1\n", + "llm_load_print_meta: n_yarn_orig_ctx = 32768\n", + "llm_load_print_meta: rope_finetuned = unknown\n", + "llm_load_print_meta: model type = 7B\n", + "llm_load_print_meta: model ftype = mostly Q4_K - Medium\n", + "llm_load_print_meta: model params = 7.24 B\n", + "llm_load_print_meta: model size = 4.07 GiB (4.83 BPW) \n", + "llm_load_print_meta: general.name = teknium_openhermes-2.5-mistral-7b\n", + "llm_load_print_meta: BOS token = 1 ''\n", + "llm_load_print_meta: EOS token = 32000 '<|im_end|>'\n", + "llm_load_print_meta: UNK token = 0 ''\n", + "llm_load_print_meta: PAD token = 0 ''\n", + "llm_load_print_meta: LF token = 13 '<0x0A>'\n", + "llm_load_tensors: ggml ctx size = 0.11 MiB\n", + "llm_load_tensors: using CUDA for GPU acceleration\n", + "llm_load_tensors: mem required = 70.42 MiB\n", + "llm_load_tensors: offloading 32 repeating 
layers to GPU\n", + "llm_load_tensors: offloading non-repeating layers to GPU\n", + "llm_load_tensors: offloaded 35/35 layers to GPU\n", + "llm_load_tensors: VRAM used: 4095.06 MiB\n", + "...............................................................................................\n", + "llama_new_context_with_model: n_ctx = 2048\n", + "llama_new_context_with_model: freq_base = 10000.0\n", + "llama_new_context_with_model: freq_scale = 1\n", + "llama_kv_cache_init: offloading v cache to GPU\n", + "llama_kv_cache_init: offloading k cache to GPU\n", + "llama_kv_cache_init: VRAM kv self = 256.00 MiB\n", + "llama_new_context_with_model: kv self size = 256.00 MiB\n", + "llama_build_graph: non-view tensors processed: 740/740\n", + "llama_new_context_with_model: compute buffer total size = 159.07 MiB\n", + "llama_new_context_with_model: VRAM scratch buffer: 156.00 MiB\n", + "llama_new_context_with_model: total VRAM used: 4507.07 MiB (model: 4095.06 MiB, context: 412.00 MiB)\n" + ] + } + ], + "source": [ + "import llama_cpp\n", + "\n", + "llama = llama_cpp.Llama(model_path=\"../../models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf\", n_gpu_layers=-1, n_ctx=2048, verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'name': 'get_weather', 'arguments': {'zip_code': '10001'}}]\n", + "====================================================================================================\n", + "[{'name': 'calculate_mortgage_payment', 'arguments': {'loan_amount': 200000, 'interest_rate': 0.04, 'loan_term': 30}}]\n", + "====================================================================================================\n", + "Unfortunately, I do not have a built-in function to check currency exchange rates. 
However, you can use third-party APIs or websites like Google Finance or XE to get this information.\n", + "====================================================================================================\n" + ] + } + ], + "source": [ + "prompts = [\n", + " \"What's the weather in 10001?\",\n", + " \"Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.\",\n", + " \"What's the current exchange rate for USD to EUR?\"\n", + "]\n", + "functions = [get_weather, calculate_mortgage_payment, get_article_details]\n", + "\n", + "for prompt in prompts:\n", + " prompt = generate_hermes_prompt(prompt, functions)\n", + " completion = llama.create_completion(prompt, max_tokens=-1)[\"choices\"][0][\"text\"]\n", + " function_calls = extract_function_calls(completion)\n", + " if function_calls:\n", + " print(function_calls)\n", + " else:\n", + " print(completion.strip())\n", + " print(\"=\"*100)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "get_weather\n", + "{'zip_code': '05751'}\n", + "====================================================================================================\n", + "get_weather\n", + "{'zip_code': '05751'}\n", + "get_weather\n", + "{'zip_code': '07030'}\n", + "calculate_mortgage_payment\n", + "{'loan_amount': 250000, 'interest_rate': 4.18, 'loan_term': 30}\n", + "====================================================================================================\n", + "I don't have a function to get exchange rates, but I can provide some resources where you can find this information. You can check websites like Google Finance, XE.com, or Yahoo Finance for up-to-date currency exchange rates.\n", + "====================================================================================================\n" + ] + } + ], + "source": [ + "prompts = [\n", + " \"What's the weather in 05751?\",\n", + " \"I'm planning a trip to Killington, Vermont (05751) from Hoboken, NJ (07030). 
Can you get me weather for both locations and directions?\",\n", + " \"What's the current exchange rate for USD to EUR?\"\n", + "]\n", + "\n", + "for prompt in prompts:\n", + " completion = llama.create_completion(generate_hermes_prompt(prompt, functions), max_tokens=-1)[\"choices\"][0][\"text\"]\n", + " function_calls = extract_function_calls(completion)\n", + "\n", + " if function_calls:\n", + " for function in function_calls:\n", + " print(function[\"name\"])\n", + " print(function[\"arguments\"])\n", + " else:\n", + " print(completion.strip())\n", + "\n", + " print(\"=\"*100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5+" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 3dcb4b5..3efd95d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -50,6 +50,9 @@ from ._internals import ( _LlamaSamplingContext, # type: ignore ) from ._logger import set_verbose +from ._utils import ( + suppress_stdout_stderr +) class Llama: @@ -182,7 +185,8 @@ class Llama: self.numa = numa if not Llama.__backend_initialized: - llama_cpp.llama_backend_init(self.numa) + with suppress_stdout_stderr(disable=verbose): + llama_cpp.llama_backend_init(self.numa) Llama.__backend_initialized = True self.model_path = model_path @@ -1567,6 +1571,38 @@ class Llama: logit_bias=logit_bias, ) + def create_chat_completion_openai_v1( + self, + *args: Any, + **kwargs: Any, + ): + """Generate a chat completion with return type based on the the OpenAI v1 API. + + OpenAI python package is required to use this method. + + You can install it with `pip install openai`. + + Args: + *args: Positional arguments to pass to create_chat_completion. + **kwargs: Keyword arguments to pass to create_chat_completion. + + Returns: + Generated chat completion or a stream of chat completion chunks. + """ + try: + from openai.types.chat import ChatCompletion, ChatCompletionChunk + stream = kwargs.get("stream", False) # type: ignore + assert isinstance(stream, bool) + if stream: + return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore + else: + return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore + except ImportError: + raise ImportError( + "To use create_chat_completion_openai_v1, you must install the openai package." + "You can install it with `pip install openai`." + ) + def __getstate__(self): return dict( model_path=self.model_path, diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 2e42041..af60d5f 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -31,6 +31,7 @@ MISTRAL_INSTRUCT_EOS_TOKEN = "" ### Chat Completion Handler ### + class LlamaChatCompletionHandler(Protocol): """Base Protocol for a llama chat completion handler. @@ -77,8 +78,7 @@ class LlamaChatCompletionHandler(Protocol): ) -> Union[ llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse], - ]: - ... + ]: ... 
class LlamaChatCompletionHandlerNotFoundException(Exception): @@ -134,6 +134,7 @@ def register_chat_completion_handler(name: str): ### Chat Formatter ### + @dataclasses.dataclass class ChatFormatterResponse: """Dataclass that stores completion parameters for a given chat format and @@ -157,8 +158,7 @@ class ChatFormatter(Protocol): *, messages: List[llama_types.ChatCompletionRequestMessage], **kwargs: Any, - ) -> ChatFormatterResponse: - ... + ) -> ChatFormatterResponse: ... class Jinja2ChatFormatter(ChatFormatter): @@ -195,7 +195,7 @@ class Jinja2ChatFormatter(ChatFormatter): eos_token=self.eos_token, bos_token=self.bos_token, raise_exception=raise_exception, - add_generation_prompt=self.add_generation_prompt + add_generation_prompt=self.add_generation_prompt, ) return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token]) @@ -255,11 +255,13 @@ def _convert_text_completion_chunks_to_chat( "choices": [ { "index": 0, - "delta": { - "content": chunk["choices"][0]["text"], - } - if chunk["choices"][0]["finish_reason"] is None - else {}, + "delta": ( + { + "content": chunk["choices"][0]["text"], + } + if chunk["choices"][0]["finish_reason"] is None + else {} + ), "finish_reason": chunk["choices"][0]["finish_reason"], } ], @@ -338,10 +340,12 @@ def chat_formatter_to_chat_completion_handler( # create grammar from json schema if "schema" in response_format: grammar = llama_grammar.LlamaGrammar.from_json_schema( - json.dumps(response_format["schema"]) + json.dumps(response_format["schema"]), verbose=llama.verbose ) except Exception as e: - grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) completion_or_chunks = llama.create_completion( prompt=prompt, @@ -452,7 +456,9 @@ def hf_tokenizer_config_to_chat_completion_handler( tokenizer_config: Dict[str, Any], add_generation_prompt: bool = True, ) -> LlamaChatCompletionHandler: - chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config, add_generation_prompt=add_generation_prompt) + chat_formatter = hf_tokenizer_config_to_chat_formatter( + tokenizer_config, add_generation_prompt=add_generation_prompt + ) return chat_formatter_to_chat_completion_handler(chat_formatter) @@ -463,11 +469,12 @@ def guess_chat_format_from_gguf_metadata(metadata: Dict[str, str]) -> Optional[s if metadata["tokenizer.chat_template"] == CHATML_CHAT_TEMPLATE: return "chatml" - if metadata["tokenizer.chat_template"] == MISTRAL_INSTRUCT_CHAT_TEMPLATE: + if metadata["tokenizer.chat_template"] == MISTRAL_INSTRUCT_CHAT_TEMPLATE: return "mistral-instruct" return None + ### Utility functions for formatting chat prompts ### # TODO: Replace these with jinja2 templates @@ -916,9 +923,17 @@ def format_mistral_instruct( stop = eos prompt = bos for message in messages: - if message["role"] == "user" and message["content"] is not None and isinstance(message["content"], str): + if ( + message["role"] == "user" + and message["content"] is not None + and isinstance(message["content"], str) + ): prompt += "[INST] " + message["content"] - elif message["role"] == "assistant" and message["content"] is not None and isinstance(message["content"], str): + elif ( + message["role"] == "assistant" + and message["content"] is not None + and isinstance(message["content"], str) + ): prompt += " [/INST]" + message["content"] + eos prompt += " [/INST]" return ChatFormatterResponse(prompt=prompt, stop=stop) @@ -958,6 +973,7 @@ def format_openchat( _prompt = 
_format_chatml(system_message, _messages, _sep) return ChatFormatterResponse(prompt=_prompt, stop=_sep) + # Chat format for Saiga models, see more details and available models: # https://huggingface.co/collections/IlyaGusev/saiga2-saigamistral-6505d4ccc3d1e53166b636cd @register_chat_format("saiga") @@ -979,8 +995,10 @@ def format_saiga( _prompt += "bot" return ChatFormatterResponse(prompt=_prompt.strip()) + # Tricky chat formats that require custom chat handlers + @register_chat_completion_handler("functionary") def functionary_chat_handler( llama: llama.Llama, @@ -1253,7 +1271,8 @@ def functionary_chat_handler( json.dumps(function_body) ) grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.json_schema_to_gbnf(json.dumps(function_body)) + llama_grammar.json_schema_to_gbnf(json.dumps(function_body)), + verbose=llama.verbose, ) print(grammar_text) except Exception as e: @@ -1264,11 +1283,14 @@ def functionary_chat_handler( print(e) with suppress_stdout_stderr(disable=llama.verbose): grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF + llama_grammar.JSON_GBNF, + verbose=llama.verbose, ) else: with suppress_stdout_stderr(disable=llama.verbose): - grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) completion: llama_types.Completion = llama.create_completion( prompt=new_prompt, @@ -1365,11 +1387,13 @@ def functionary_v1_v2_chat_handler( **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" - + tokenizer = llama.tokenizer_ - assert hasattr(tokenizer, "hf_tokenizer"), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" + assert hasattr( + tokenizer, "hf_tokenizer" + ), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" from transformers import AutoTokenizer - + if "<|START_OF_FUNCTION_CALL|>" in tokenizer.hf_tokenizer.additional_special_tokens: version = "v1" END_SYSTEM_TOKEN = "<|END_OF_SYSTEM|>" @@ -1513,13 +1537,16 @@ def functionary_v1_v2_chat_handler( "name" ] = f"functions.{message['function_call']['name']}" all_messages.append(message) - + if version == "v1": suffix = "assistant:\n" else: suffix = "<|from|>assistant\n<|recipient|>" - - return tokenizer.hf_tokenizer.apply_chat_template(all_messages, tokenize=False) + suffix + + return ( + tokenizer.hf_tokenizer.apply_chat_template(all_messages, tokenize=False) + + suffix + ) if tools is not None: functions = [tool["function"] for tool in tools if tool["type"] == "function"] @@ -1529,8 +1556,10 @@ def functionary_v1_v2_chat_handler( tool_choice if isinstance(tool_choice, str) else tool_choice["function"] ) - prompt = prepare_messages_for_inference(messages, tokenizer, version, functions, tools) - + prompt = prepare_messages_for_inference( + messages, tokenizer, version, functions, tools + ) + # If no tools/functions are provided if function_call is None and (functions is None or len(functions) == 0): if version == "v1": @@ -1538,7 +1567,7 @@ def functionary_v1_v2_chat_handler( else: stop = STOP_TOKEN prompt += "all\n<|content|>" - + completion_or_completion_chunks = llama.create_completion( prompt=prompt, temperature=temperature, @@ -1561,9 +1590,9 @@ def functionary_v1_v2_chat_handler( grammar=grammar, ) return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore - + assert stream is False # TODO: support stream mode - + def get_grammar(function_call): function_body = None for function in functions or []: @@ -1574,7 +1603,7 @@ def functionary_v1_v2_chat_handler( if tool["type"] == "function" and tool["function"]["name"] == function_call: function_body = tool["function"]["parameters"] break - + try: with suppress_stdout_stderr(disable=llama.verbose): grammar_text = llama_grammar.json_schema_to_gbnf( @@ -1592,11 +1621,11 @@ def functionary_v1_v2_chat_handler( print(e) with suppress_stdout_stderr(disable=llama.verbose): grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF + llama_grammar.JSON_GBNF, verbose=llama.verbose ) - + return grammar - + def create_completion(stop): completion: llama_types.Completion = llama.create_completion( prompt=prompt, @@ -1619,11 +1648,11 @@ def functionary_v1_v2_chat_handler( logits_processor=logits_processor, grammar=grammar, ) - + return completion - + function_calls, function_bodies = [], [] - + if version == "v1": # If no or "auto" tool_choice/function_call if function_call is None or ( @@ -1632,7 +1661,9 @@ def functionary_v1_v2_chat_handler( stops = ["\n", END_ASSISTANT_TOKEN] # If tool_choice/function_call is "none" elif isinstance(function_call, str) and function_call == "none": - prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + prompt = prepare_messages_for_inference( + messages, tokenizer, version, [], [] + ) stops = END_ASSISTANT_TOKEN # If tool_choice/function_call is provided elif 
isinstance(function_call, dict): @@ -1647,14 +1678,27 @@ def functionary_v1_v2_chat_handler( completion = create_completion(stop=stops) completion_text = completion["choices"][0]["text"] - + # If the generation does not involve a function call - if START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN not in completion_text: + if ( + START_FUNCTION_CALL_TOKEN not in prompt + and START_FUNCTION_CALL_TOKEN not in completion_text + ): return _convert_completion_to_chat(completion, stream=stream) # type: ignore # If the generation involves a function call in completion, generate the parameters - elif START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN in completion_text: - prompt += completion_text.replace(f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN) + "\n" - function_calls.append(completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()) + elif ( + START_FUNCTION_CALL_TOKEN not in prompt + and START_FUNCTION_CALL_TOKEN in completion_text + ): + prompt += ( + completion_text.replace( + f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN + ) + + "\n" + ) + function_calls.append( + completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip() + ) grammar = get_grammar(function_calls[-1]) completion = create_completion(stop=END_FUNCTION_CALL_TOKEN) function_bodies.append(completion["choices"][0]["text"].strip()) @@ -1672,7 +1716,10 @@ def functionary_v1_v2_chat_handler( stops = CONTENT_TOKEN # If tool_choice/function_call is "none" elif isinstance(function_call, str) and function_call == "none": - prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + "all\n<|content|>" + prompt = ( + prepare_messages_for_inference(messages, tokenizer, version, [], []) + + "all\n<|content|>" + ) stops = STOP_TOKEN # If tool_choice/function_call is provided elif isinstance(function_call, dict): @@ -1684,15 +1731,17 @@ def functionary_v1_v2_chat_handler( else: prompt = prompt stops = STOP_TOKEN - + completion = create_completion(stop=stops) completion_text = completion["choices"][0]["text"] - + # If the generation does not involve a function call - if prompt.endswith("all\n<|content|>") and not completion_text.startswith("all"): + if prompt.endswith("all\n<|content|>") and not completion_text.startswith( + "all" + ): return _convert_completion_to_chat(completion, stream=stream) # type: ignore # Generate model response if the model decides not to call any function - elif (prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all")): + elif prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all"): prompt += completion_text + CONTENT_TOKEN completion = create_completion(stop=STOP_TOKEN) return _convert_completion_to_chat(completion, stream=stream) # type: ignore @@ -1704,7 +1753,7 @@ def functionary_v1_v2_chat_handler( function_bodies.append(completion["choices"][0]["text"].strip()) prompt += f"{function_calls[-1]}\n{CONTENT_TOKEN}{function_bodies[-1]}" grammar = None - + # Try to generate the beginning of next turn # If empty completion, break from loop next_turn_completion_text = create_completion( @@ -1718,17 +1767,21 @@ def functionary_v1_v2_chat_handler( else: function_bodies.append(completion_text.strip()) break - + assert "usage" in completion assert len(function_calls) > 0 assert len(function_calls) == len(function_bodies) - + tool_calls = [] for function_call, function_body in zip(function_calls, function_bodies): tool_calls.append( { - "id": "call_" + "".join( - 
[random.choice(string.ascii_letters + string.digits) for _ in range(24)] + "id": "call_" + + "".join( + [ + random.choice(string.ascii_letters + string.digits) + for _ in range(24) + ] ), "type": "function", "function": { @@ -1924,7 +1977,9 @@ class Llava15ChatHandler: json.dumps(response_format["schema"]) ) except Exception as e: - grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF + ) return _convert_completion_to_chat( llama.create_completion( @@ -1950,3 +2005,601 @@ class Llava15ChatHandler: ), stream=stream, ) + + +@register_chat_completion_handler("chatml-function-calling") +def chatml_function_calling( + llama: llama.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + min_p: float = 0.05, + typical_p: float = 1.0, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, + max_tokens: Optional[int] = None, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[llama.LogitsProcessorList] = None, + grammar: Optional[llama.LlamaGrammar] = None, + **kwargs, # type: ignore +) -> Union[ + llama_types.CreateChatCompletionResponse, + Iterator[llama_types.CreateChatCompletionStreamResponse], +]: + function_calling_template = ( + "{% for message in messages %}" + "<|im_start|>{{ message.role }}\n" + # System message + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% if tool_calls %}" + "\n\nYou have access to the following functions:\n" + "{% for tool in tools %}" + "\nfunctions.{{ tool.function.name }}:\n" + "{{ tool.function.parameters | tojson }}" + "\n{% endfor %}" + "\n\nYou can respond to users messages with either a single message or one or more function calls." 
+ "\n\nTo respond with a message begin the message with 'message:', use the following format:" + "\n\nmessage:" + "\n" + "\n\nTo respond with one or more function calls begin the message with 'functions.:', use the following format:" + "\n\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "{% endif %}" + "\n<|im_end|>\n" + "{% endif %}" + # User message + "{% if message.role == 'user' %}" + "{{ message.content }}" + "\n<|im_end|>\n" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' %}" + ## Reglar message + "{% if message.content and message.content | length > 0 %}" + "message:\n" + "{{ message.content }}" + "\n<|im_end|>\n" + "{% endif %}" + ## Function calls + "{% if message.tool_calls %}" + "{% for tool_call in message.tool_calls %}" + "functions.{{ tool_call.function.name }}:\n" + "{{ tool_call.function.arguments }}" + "{% endfor %}" + "\n<|im_end|>\n" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + ) + template_renderer = jinja2.Environment( + loader=jinja2.BaseLoader(), + autoescape=jinja2.select_autoescape(["html", "xml"]), + undefined=jinja2.StrictUndefined, + ).from_string(function_calling_template) + + # Convert legacy functions to tools + if functions is not None: + tools = [ + { + "type": "function", + "function": function, + } + for function in functions + ] + + # Convert legacy function_call to tool_choice + if function_call is not None: + if isinstance(function_call, str) and ( + function_call == "none" or function_call == "auto" + ): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = { + "type": "function", + "function": { + "name": function_call["name"], + }, + } + + # Case 1: No tool choice by user + if ( + tool_choice is None + or (isinstance(tool_choice, str) and tool_choice == "none") + or tools is None + or len(tools) == 0 + ): + prompt = template_renderer.render( + messages=messages, + tools=[], + tool_calls=None, + ) + if response_format is not None and response_format["type"] == "json_object": + try: + grammar = ( + llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(response_format["schema"]) + ) + if "schema" in response_format + else None + ) + except Exception as e: + if llama.verbose: + print( + "Failed to parse response format as JSON schema, falling back to default grammar" + ) + print(e) + grammar = ( + llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF) + if grammar is None + else grammar + ) + return _convert_completion_to_chat( + llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=stop, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + ), + stream=stream, + ) + + def _convert_completion_to_chat_function( + tool_name: str, + completion_or_chunks: Union[ + llama_types.CreateCompletionResponse, + Iterator[llama_types.CreateCompletionStreamResponse], + ], + stream: bool, + ): + if not stream: + completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore + assert "usage" in completion + tool_id = "call_" + "_0_" + tool_name + "_" + completion["id"] + # TODO: Fix for legacy function calls + 
chat_completion: llama_types.CreateChatCompletionResponse = { + "id": "chat" + completion["id"], + "object": "chat.completion", + "created": completion["created"], + "model": completion["model"], + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": None, + "function_call": { + "name": tool_name, + "arguments": completion["choices"][0]["text"], + }, + "tool_calls": [ + { + "id": tool_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": completion["choices"][0]["text"], + }, + } + ], + }, + "finish_reason": "tool_calls", + } + ], + "usage": completion["usage"], + } + return chat_completion + else: + chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore + + def _stream_response_to_function_stream( + chunks: Iterator[llama_types.CreateCompletionStreamResponse], + ) -> Iterator[llama_types.CreateChatCompletionStreamResponse]: + # blank first message + first = True + id_ = None + created = None + model = None + tool_id = None + for chunk in chunks: + if first: + id_ = "chat" + chunk["id"] + created = chunk["created"] + model = chunk["model"] + tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"] + yield { + "id": id_, + "object": "chat.completion.chunk", + "created": created, + "model": model, + "choices": [ + { + "index": 0, + "finish_reason": None, + "logprobs": None, + "delta": { + "role": "assistant", + "content": None, + "function_call": None, + "tool_calls": None, + }, + } + ], + } + yield { + "id": "chat" + chunk["id"], + "object": "chat.completion.chunk", + "created": chunk["created"], + "model": chunk["model"], + "choices": [ + { + "index": 0, + "finish_reason": None, + "logprobs": None, + "delta": { + "role": None, + "content": None, + "function_call": { + "name": tool_name, + "arguments": chunk["choices"][0]["text"], + }, + "tool_calls": [ + { + "index": 0, + "id": tool_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": "", + }, + } + ], + }, + } + ], + } + first = False + continue + assert tool_id is not None + yield { + "id": "chat" + chunk["id"], + "object": "chat.completion.chunk", + "created": chunk["created"], + "model": chunk["model"], + "choices": [ + { + "index": 0, + "finish_reason": None, + "logprobs": None, + "delta": { + "role": None, + "content": None, + "function_call": { + "name": tool_name, + "arguments": chunk["choices"][0]["text"], + }, + "tool_calls": [ + { + "index": 0, + "id": tool_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": chunk["choices"][0][ + "text" + ], + }, + } + ], + }, + } + ], + } + + if id_ is not None and created is not None and model is not None: + yield { + "id": id_, + "object": "chat.completion.chunk", + "created": created, + "model": model, + "choices": [ + { + "index": 0, + "finish_reason": "tool_calls", + "logprobs": None, + "delta": { + "role": None, + "content": None, + "function_call": None, + "tool_calls": None, + }, + } + ], + } + + return _stream_response_to_function_stream(chunks) + + # Case 2: Tool choice by user + if isinstance(tool_choice, dict): + tool_name = tool_choice["function"]["name"] + tool = next( + (tool for tool in tools if tool["function"]["name"] == tool_name), None + ) + if tool is None: + raise ValueError(f"Tool with name '{tool_name}' not found in tools") + prompt = template_renderer.render( + messages=messages, + tools=tools, + tool_calls=True, + ) + prompt += f"functions.{tool_name}:\n" + try: + grammar = llama_grammar.LlamaGrammar.from_json_schema( + 
json.dumps(tool["function"]["parameters"]), verbose=llama.verbose + ) + except Exception as e: + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) + if llama.verbose: + print( + "Failed to parse function body as JSON schema, falling back to default grammar" + ) + print(e) + completion_or_chunks = llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=stop, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + ) + return _convert_completion_to_chat_function( + tool_name, completion_or_chunks, stream + ) + + # Case 3: Automatic tool choice + assert isinstance(tool_choice, str) and tool_choice == "auto" + function_names = " | ".join( + [f'''"functions.{tool['function']['name']}:"''' for tool in tools] + ) + initial_gbnf_tool_grammar = ( + """root ::= functions | "message:"\n""" + f"""functions ::= {function_names}\n""" + ) + follow_up_gbnf_tool_grammar = ( + """root ::= functions | "<|im_end|>"\n""" + f"""functions ::= {function_names}\n""" + ) + prompt = template_renderer.render( + messages=messages, + tools=tools, + tool_calls=True, + ) + completion_or_chunks = llama.create_completion( + prompt=prompt, + temperature=0, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=False, + stop=[":"], + max_tokens=None, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=llama_grammar.LlamaGrammar.from_string( + initial_gbnf_tool_grammar, verbose=llama.verbose + ), + ) + completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore + text = completion["choices"][0]["text"] + if "message" in text: + return _convert_completion_to_chat( + llama.create_completion( + prompt=prompt + "message:\n", + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=["<|im_end|>"], + max_tokens=None, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=llama_grammar.LlamaGrammar.from_string( + follow_up_gbnf_tool_grammar, verbose=llama.verbose + ), + ), + stream=stream, + ) + + # One or more function calls + tool_name = text[len("functions.") :] + tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None) + if not stream: + completions = [] + completions_tool_name = [] + while tool is not None: + prompt += f"functions.{tool_name}:\n" + try: + grammar = llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(tool["function"]["parameters"]), verbose=llama.verbose + ) + except Exception as e: + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) + if llama.verbose: + print( + "Failed to parse function body as JSON schema, falling back to default grammar" + ) + print(e) + completion_or_chunks 
= llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=False, + stop=stop, + max_tokens=None, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + ) + completions.append(completion_or_chunks) + completions_tool_name.append(tool_name) + prompt += completion_or_chunks["choices"][0]["text"] + prompt += "\n" + + response = llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=False, + stop=stop, + max_tokens=None, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=llama_grammar.LlamaGrammar.from_string( + follow_up_gbnf_tool_grammar, verbose=llama.verbose + ), + ) + + tool_name = response["choices"][0]["text"][len("functions.") :] + tool = next( + (tool for tool in tools if tool["function"]["name"] == tool_name), None + ) + + # Merge completions + function_call = { + "function_call": { + "name": tool_name, + "arguments": completions[0]["choices"][0]["text"], + } + } if len(completions) == 1 else {} + return { + "id": "chat" + completion["id"], + "object": "chat.completion", + "created": completion["created"], + "model": completion["model"], + "choices": [ + { + "finish_reason": "tool_calls", + "index": 0, + "message": { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_" + + f"_{i}_" + + tool_name + + "_" + + completion["id"], + "type": "function", + "function": { + "name": tool_name, + "arguments": completion["choices"][0]["text"], + }, + } + for i, (tool_name, completion) in enumerate( + zip(completions_tool_name, completions) + ) + ], + **function_call + }, + } + ], + "usage": { + "completion_tokens": sum( + completion["usage"]["completion_tokens"] + for completion in completions + ), + "prompt_tokens": sum( + completion["usage"]["prompt_tokens"] for completion in completions + ), + "total_tokens": sum( + completion["usage"]["total_tokens"] for completion in completions + ), + }, + } + + raise ValueError("Automatic streaming tool choice is not supported") diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index c3deba8..1b1befe 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -97,7 +97,7 @@ class CreateChatCompletionResponse(TypedDict): class ChatCompletionMessageToolCallChunkFunction(TypedDict): - name: str + name: Optional[str] arguments: str @@ -118,12 +118,12 @@ class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict): class ChatCompletionStreamResponseDelta(TypedDict): - content: NotRequired[str] + content: NotRequired[Optional[str]] function_call: NotRequired[ - ChatCompletionStreamResponseDeltaFunctionCall + Optional[ChatCompletionStreamResponseDeltaFunctionCall] ] # DEPRECATED - tool_calls: NotRequired[List[ChatCompletionMessageToolCallChunk]] - role: NotRequired[Literal["system", "user", "assistant", "tool"]] + tool_calls: NotRequired[Optional[List[ChatCompletionMessageToolCallChunk]]] + role: NotRequired[Optional[Literal["system", "user", "assistant", "tool"]]] class 
ChatCompletionStreamResponseChoice(TypedDict): @@ -132,6 +132,7 @@ class ChatCompletionStreamResponseChoice(TypedDict): ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty ] finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]] + logprobs: NotRequired[Optional[CompletionLogprobs]] class CreateChatCompletionStreamResponse(TypedDict): From cb791716b42eb897acf66b8b78c4a67b6e026a74 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 12 Feb 2024 16:19:05 -0500 Subject: [PATCH 17/38] fix: Always set logits_all = True when using speculative decoding --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 3efd95d..4869a9d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -281,7 +281,7 @@ class Llama: ) self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 self.context_params.mul_mat_q = mul_mat_q - self.context_params.logits_all = logits_all + self.context_params.logits_all = logits_all if draft_model is None else True # Must be set to True for speculative decoding self.context_params.embedding = embedding self.context_params.offload_kqv = offload_kqv From b82b0e10148659bceb5e79ae221a4bce2c54da79 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 12 Feb 2024 16:27:43 -0500 Subject: [PATCH 18/38] docs: Temporarily revert function calling docs --- README.md | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 59a7cd4..679c977 100644 --- a/README.md +++ b/README.md @@ -292,18 +292,22 @@ To constrain the response to a specific JSON Schema, you can use the `schema` pr ### Function Calling -The high-level API also provides a simple interface for function calling. +The high-level API also provides a simple interface for function calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat forma. -The only set of models that supports full function calling at this time is [functionary](https://github.com/MeetKai/functionary). The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). Functionary is able to intelligently call functions and also analyze any provided function outputs to generate coherent responses. All v2 models of functionary supports **parallel function calling**. You can provide either `functionary-v1` or `functionary-v2` for the `chat_format` when initializing the Llama class. - -Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide HF Tokenizer for functionary. The `LlamaHFTokenizer` class can be initialized and passed into the Llama class. This will override the default llama.cpp tokenizer used in Llama class. The tokenizer files are already included in the respective HF repositories hosting the gguf files. 
+The gguf-converted files for functionary can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF) ```python ->>> from llama_cpp import Llama, LlamaHFTokenizer ->>> tokenizer = LlamaHFTokenizer.from_pretrained("path/to/functionary/") ->>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", tokenizer=tokenizer, chat_format="functionary-v2") +>>> from llama_cpp import Llama +>>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", chat_format="functionary") +>>> # or +>>> llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling") >>> llm.create_chat_completion( messages = [ + { + "role": "system", + "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary" + + }, { "role": "user", "content": "Extract Jason is 25 years old" @@ -330,12 +334,12 @@ Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, i } } }], - tool_choice={ + tool_choice=[{ "type": "function", "function": { "name": "UserDetail" } - }, + }] ) ``` From d605875772a381d863b3960d14e8eeb52908b561 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 12 Feb 2024 16:28:30 -0500 Subject: [PATCH 19/38] Bump version --- CHANGELOG.md | 8 ++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ce0b43..d2bb710 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.40] + +- feat: Update llama.cpp to ggerganov/llama.cpp@3bdc4cd0f595a6096cca4a64aa75ffa8a3503465 +- feat: Generic chatml Function Calling using chat_format="chatml-function-calling"` by @abetlen in #957 +- fix: Circular dependancy preventing early Llama object free by @notwa in #1176 +- docs: Set the correct command for compiling with syscl support by @akarshanbiswas in #1172 +- feat: use gpu backend for clip if available by @iamlemec in #1175 + ## [0.2.39] - feat: Update llama.cpp to ggerganov/llama.cpp@b08f22c882a1443e6b97081f3ce718a4d1a741f8 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 837e3c9..ccafd02 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.39" \ No newline at end of file +__version__ = "0.2.40" \ No newline at end of file From 4348a6cdf057f5746db213867f93ed1359091fa3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 02:04:54 -0500 Subject: [PATCH 20/38] docs: Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 679c977..3d8d4d4 100644 --- a/README.md +++ b/README.md @@ -292,7 +292,7 @@ To constrain the response to a specific JSON Schema, you can use the `schema` pr ### Function Calling -The high-level API also provides a simple interface for function calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat forma. +The high-level API also provides a simple interface for function calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format. 
The gguf-converted files for functionary can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF) From 5efc45bdfde9c37db27dabecc2955ab9863506c9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 02:43:07 -0500 Subject: [PATCH 21/38] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 3bdc4cd..895407f 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 3bdc4cd0f595a6096cca4a64aa75ffa8a3503465 +Subproject commit 895407f31b358e3d9335e847d13f033491ec8a5b From d1822fed6b706f38bd1ff0de4dec5baaa3cf84fa Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 02:44:00 -0500 Subject: [PATCH 22/38] fix: Don't change order of json schema object properties unless prop_order is passed, Closes #1180 --- llama_cpp/llama_grammar.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py index d8ef563..3eb3b96 100644 --- a/llama_cpp/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -1471,12 +1471,15 @@ class SchemaConverter: if schema_type == "object" and "properties" in schema: # TODO: `required` keyword - prop_order = self._prop_order - prop_pairs = sorted( - schema["properties"].items(), - # sort by position in prop_order (if specified) then by key - key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]), - ) + if self._prop_order: + prop_order = self._prop_order + prop_pairs = sorted( + schema["properties"].items(), + # sort by position in prop_order (if specified) then by key + key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]), + ) + else: + prop_pairs = schema["properties"].items() rule = '"{" space' for i, (prop_name, prop_schema) in enumerate(prop_pairs): From 6fe8b427e1608782ad29b313130ba2fa3e4220b8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 02:46:52 -0500 Subject: [PATCH 23/38] Bump version --- CHANGELOG.md | 5 +++++ llama_cpp/__init__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2bb710..e8fcb80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.41] + +- feat: Update llama.cpp to ggerganov/llama.cpp@895407f31b358e3d9335e847d13f033491ec8a5b +- fix: Don't change order of json schema object properties in generated grammar unless prop_order is passed by @abetlen in d1822fed6b706f38bd1ff0de4dec5baaa3cf84fa + ## [0.2.40] - feat: Update llama.cpp to ggerganov/llama.cpp@3bdc4cd0f595a6096cca4a64aa75ffa8a3503465 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index ccafd02..6bc5e8a 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.40" \ No newline at end of file +__version__ = "0.2.41" \ No newline at end of file From 4b0e3320bd8c2c209e29978d0b21e2e471cc9ee3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 03:11:35 -0500 Subject: [PATCH 24/38] fix: minor formatting bugs for chatml-function-calling --- llama_cpp/llama_chat_format.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index af60d5f..66e40ae 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2072,12 +2072,14 @@ def 
chatml_function_calling( "{% if message.role == 'assistant' %}" ## Reglar message "{% if message.content and message.content | length > 0 %}" + "{% if tool_calls %}" "message:\n" + "{% endif %}" "{{ message.content }}" "\n<|im_end|>\n" "{% endif %}" ## Function calls - "{% if message.tool_calls %}" + "{% if 'tool_calls' in message %}" "{% for tool_call in message.tool_calls %}" "functions.{{ tool_call.function.name }}:\n" "{{ tool_call.function.arguments }}" From 68fb71b6a26a1e57331868f959b47ab4b87851e1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 03:24:41 -0500 Subject: [PATCH 25/38] fix: missing generation_prompt in chatml-function-calling --- llama_cpp/llama_chat_format.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 66e40ae..809a827 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2088,6 +2088,7 @@ def chatml_function_calling( "{% endif %}" "{% endif %}" "{% endfor %}" + "{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" ) template_renderer = jinja2.Environment( loader=jinja2.BaseLoader(), @@ -2130,6 +2131,7 @@ def chatml_function_calling( messages=messages, tools=[], tool_calls=None, + add_generation_prompt=True, ) if response_format is not None and response_format["type"] == "json_object": try: @@ -2363,6 +2365,7 @@ def chatml_function_calling( messages=messages, tools=tools, tool_calls=True, + add_generation_prompt=True, ) prompt += f"functions.{tool_name}:\n" try: @@ -2420,6 +2423,7 @@ def chatml_function_calling( messages=messages, tools=tools, tool_calls=True, + add_generation_prompt=True, ) completion_or_chunks = llama.create_completion( prompt=prompt, From f7cdf78788da3ef33e3d3a482998d756ee47e8e3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 12:24:00 -0500 Subject: [PATCH 26/38] Update llama.cpp --- llama_cpp/llama_cpp.py | 18 ++++++++++++++++++ vendor/llama.cpp | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 2724edd..9979a67 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -470,6 +470,7 @@ class llama_model_params(Structure): # bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) # bool embedding; // embedding mode only # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU +# bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) # }; class llama_context_params(Structure): """Parameters for llama_context @@ -496,6 +497,7 @@ class llama_context_params(Structure): logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) embedding (bool): embedding mode only offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU + do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) """ _fields_ = [ @@ -520,6 +522,7 @@ class llama_context_params(Structure): ("logits_all", c_bool), ("embedding", c_bool), ("offload_kqv", c_bool), + ("do_pooling", c_bool), ] @@ -1699,6 +1702,21 @@ _lib.llama_get_embeddings.argtypes = [llama_context_p] _lib.llama_get_embeddings.restype = c_float_p +# // Get the embeddings for the ith sequence +# // llama_get_embeddings(ctx) + i*n_embd +# LLAMA_API float * llama_get_embeddings_ith(struct 
llama_context * ctx, int32_t i); +def llama_get_embeddings_ith( + ctx: llama_context_p, i: Union[c_int32, int] +): # type: (...) -> Array[float] # type: ignore + """Get the embeddings for the ith sequence + llama_get_embeddings(ctx) + i*n_embd""" + return _lib.llama_get_embeddings_ith(ctx, i) + + +_lib.llama_get_embeddings_ith.argtypes = [llama_context_p, c_int32] +_lib.llama_get_embeddings_ith.restype = c_float_p + + # // # // Vocab # // diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 895407f..ea9c8e1 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 895407f31b358e3d9335e847d13f033491ec8a5b +Subproject commit ea9c8e11436ad50719987fa23a289c74b7b40d40 From d6be5333e1e28dd07cfec5babd6332c7d1f50788 Mon Sep 17 00:00:00 2001 From: Andrew Lapp Date: Tue, 13 Feb 2024 17:26:07 +0000 Subject: [PATCH 27/38] fix: sample idx off-by-one error for logit_processors (#1179) * fix sample_idx off-by-one error * self._scores is indexed differently, only modify the index within self._input_ids --------- Co-authored-by: Andrew Lapp Co-authored-by: Andrei --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4869a9d..8d726d3 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -557,7 +557,7 @@ class Llama: logits[:] = ( logits_processor(self._input_ids, logits) if idx is None - else logits_processor(self._input_ids[:idx], logits) + else logits_processor(self._input_ids[:idx + 1], logits) ) sampling_params = _LlamaSamplingParams( From b1637c2319936df0ecf1b3eb18ca971b346a147e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 12:35:04 -0500 Subject: [PATCH 28/38] Bump version --- CHANGELOG.md | 6 ++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e8fcb80..dbc4dca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.42] + +- feat: Update llama.cpp to ggerganov/llama.cpp@ea9c8e11436ad50719987fa23a289c74b7b40d40 +- fix: sample idx off-by-one error for logit_processors by @lapp0 in #1179 +- fix: chat formatting bugs in `chatml-function-calling` by @abetlen in 4b0e3320bd8c2c209e29978d0b21e2e471cc9ee3 and 68fb71b6a26a1e57331868f959b47ab4b87851e1 + ## [0.2.41] - feat: Update llama.cpp to ggerganov/llama.cpp@895407f31b358e3d9335e847d13f033491ec8a5b diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 6bc5e8a..6e71792 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.41" \ No newline at end of file +__version__ = "0.2.42" \ No newline at end of file From 345215a76cf57b769474ea5dc1aefc5ccfb06d5c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 23:02:50 -0500 Subject: [PATCH 29/38] fix: more chatml-function-calling fixes --- llama_cpp/llama_chat_format.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 809a827..7f365e3 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2061,12 +2061,12 @@ def chatml_function_calling( "\nfunctions.:" '\n{ "arg1": "value1", "arg2": "value2" }' "{% endif %}" - "\n<|im_end|>\n" + "<|im_end|>\n" "{% endif %}" # User message "{% if message.role == 'user' %}" "{{ message.content }}" - "\n<|im_end|>\n" + 
"<|im_end|>\n" "{% endif %}" # Assistant message "{% if message.role == 'assistant' %}" @@ -2076,7 +2076,7 @@ def chatml_function_calling( "message:\n" "{% endif %}" "{{ message.content }}" - "\n<|im_end|>\n" + "<|im_end|>\n" "{% endif %}" ## Function calls "{% if 'tool_calls' in message %}" @@ -2084,11 +2084,11 @@ def chatml_function_calling( "functions.{{ tool_call.function.name }}:\n" "{{ tool_call.function.arguments }}" "{% endfor %}" - "\n<|im_end|>\n" + "<|im_end|>\n" "{% endif %}" "{% endif %}" "{% endfor %}" - "{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" ) template_renderer = jinja2.Environment( loader=jinja2.BaseLoader(), @@ -2120,6 +2120,8 @@ def chatml_function_calling( }, } + stop = [stop, "<|im_end|>"] if isinstance(stop, str) else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] + # Case 1: No tool choice by user if ( tool_choice is None From 7dbbfdecadebe7750be650d9409959640ff9a460 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 23:53:56 -0500 Subject: [PATCH 30/38] fix: submodule kompute is not included in sdist. Closes #1165 --- .github/workflows/build-and-release.yaml | 4 ++-- .github/workflows/build-docker.yaml | 2 +- .github/workflows/publish-to-test.yaml | 2 +- .github/workflows/publish.yaml | 2 +- .github/workflows/test.yaml | 10 +++++----- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 61027ef..63c81f1 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -16,7 +16,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - submodules: "true" + submodules: "recursive" # Used to host cibuildwheel - uses: actions/setup-python@v3 @@ -48,7 +48,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - submodules: "true" + submodules: "recursive" - uses: actions/setup-python@v3 with: python-version: "3.8" diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 27a6b1e..750b91e 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -14,7 +14,7 @@ jobs: - name: Checkout uses: actions/checkout@v3 with: - submodules: "true" + submodules: "recursive" - name: Set up QEMU uses: docker/setup-qemu-action@v2 diff --git a/.github/workflows/publish-to-test.yaml b/.github/workflows/publish-to-test.yaml index 9932d61..47e7c40 100644 --- a/.github/workflows/publish-to-test.yaml +++ b/.github/workflows/publish-to-test.yaml @@ -18,7 +18,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - submodules: "true" + submodules: "recursive" - name: Set up Python uses: actions/setup-python@v4 with: diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 7d6c970..1afdd66 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -12,7 +12,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - submodules: "true" + submodules: "recursive" - name: Set up Python uses: actions/setup-python@v4 with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 2cc6fb0..77df546 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -19,7 +19,7 @@ jobs: steps: - uses: actions/checkout@v4 with: - submodules: "true" + submodules: "recursive" - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: @@ -42,7 +42,7 @@ jobs: steps: - uses: actions/checkout@v3 
with: - submodules: "true" + submodules: "recursive" - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: @@ -65,7 +65,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - submodules: "true" + submodules: "recursive" - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: @@ -85,7 +85,7 @@ jobs: # steps: # - uses: actions/checkout@v3 # with: - # submodules: "true" + # submodules: "recursive" # - name: Set up Python 3.8 # uses: actions/setup-python@v4 # with: @@ -112,7 +112,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - submodules: "true" + submodules: "recursive" - name: Set up Python 3.8 uses: actions/setup-python@v4 with: From 7a79e5ac493a3e25a38861828d3e0be3b3c71771 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 23:54:05 -0500 Subject: [PATCH 31/38] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ea9c8e1..f5ca054 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit ea9c8e11436ad50719987fa23a289c74b7b40d40 +Subproject commit f5ca054855dea83f424003162f26de376e5643f6 From 07a783779a62a4aac0b11161c7e0eb983ff215f8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 23:57:10 -0500 Subject: [PATCH 32/38] fix: Update openbuddy prompt format. Closes #1155 --- llama_cpp/llama_chat_format.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 7f365e3..8dd0ddf 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -734,17 +734,14 @@ def format_openbuddy( messages: List[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: - _system_message = """Consider a conversation between User (a human) and Assistant (named Buddy). -Buddy is an INTP-T, a friendly, intelligent and multilingual AI assistant, by OpenBuddy team. GitHub: https://github.com/OpenBuddy/OpenBuddy -Buddy cannot access the Internet. -Buddy can fluently speak the user's language (e.g. English, Chinese). -Buddy can generate poems, stories, code, essays, songs, parodies, and more. -Buddy possesses vast knowledge about the world, history, and culture. -Buddy's responses are always safe, creative, high-quality, human-like, and interesting. -Buddy strictly refuses to discuss political, NSFW, or other unsafe topics. + _system_message = """You are a helpful, respectful and honest INTP-T AI Assistant named Buddy. You are talking to a human User. +Always answer as helpfully and logically as possible, while being safe. Your answers should not include any harmful, political, religious, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. +You can speak fluently in many languages, for example: English, Chinese. +You cannot access the internet, but you have vast knowledge, cutoff: 2021-09. +You are trained by OpenBuddy team, (https://openbuddy.ai, https://github.com/OpenBuddy/OpenBuddy), you are based on LLaMA and Falcon transformers model, not related to GPT or OpenAI. -User: Hi. -Assistant: Hi, I'm Buddy, your AI assistant. 
How can I help you today?""" +""" _roles = dict(user="User", assistant="Assistant") _sep = "\n" system_message = _system_message From 6943bab6d817bf71927642ab29e25b94a01fd22c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Feb 2024 03:38:41 -0500 Subject: [PATCH 33/38] fix: destructor exception where internal classes are missing some uninitialized attributes --- llama_cpp/_internals.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 3a71ef0..9473d35 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -42,6 +42,8 @@ class _LlamaModel: self._llama_free_model = llama_cpp._lib.llama_free_model # type: ignore + self.model = None + if not os.path.exists(path_model): raise ValueError(f"Model path does not exist: {path_model}") @@ -248,6 +250,7 @@ class _LlamaContext: self.verbose = verbose self._llama_free = llama_cpp._lib.llama_free # type: ignore + self.ctx = None assert self.model.model is not None @@ -497,6 +500,7 @@ class _LlamaBatch: self._llama_batch_free = llama_cpp._lib.llama_batch_free # type: ignore + self.batch = None self.batch = llama_cpp.llama_batch_init( self.n_tokens, self.embd, self.n_seq_max ) From 7b9960d1cbeeca2df6cc3ada6614bc12b2b309fc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Feb 2024 03:47:21 -0500 Subject: [PATCH 34/38] Update llama.cpp --- llama_cpp/llava_cpp.py | 71 +----------------------------------------- vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 71 deletions(-) diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index b1f90b9..8195bd4 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -146,30 +146,8 @@ _libllava.llava_eval_image_embed.restype = c_bool ################################################ -# struct clip_vision_hparams { -# int32_t image_size; -# int32_t patch_size; -# int32_t hidden_size; -# int32_t n_intermediate; -# int32_t projection_dim; -# int32_t n_head; -# int32_t n_layer; -# float eps; -# }; -class clip_vision_hparams(Structure): - _fields_ = [ - ("image_size", c_int32), - ("patch_size", c_int32), - ("hidden_size", c_int32), - ("n_intermediate", c_int32), - ("projection_dim", c_int32), - ("n_head", c_int32), - ("n_layer", c_int32), - ("eps", c_float), - ] - # /** load mmproj model */ -# CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity); +# CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); def clip_model_load(fname: bytes, verbosity: Union[c_int, int]) -> clip_ctx_p: return _libllava.clip_model_load(fname, verbosity) @@ -183,50 +161,3 @@ def clip_free(ctx: clip_ctx_p): _libllava.clip_free.argtypes = [clip_ctx_p] _libllava.clip_free.restype = None - -# size_t clip_embd_nbytes(const struct clip_ctx * ctx); -# int clip_n_patches(const struct clip_ctx * ctx); -# int clip_n_mmproj_embd(const struct clip_ctx * ctx); - -# // RGB uint8 image -# struct clip_image_u8 { -# int nx; -# int ny; -# uint8_t * data = NULL; -# size_t size; -# }; - -# // RGB float32 image (NHWC) -# // Memory layout: RGBRGBRGB... 
-# struct clip_image_f32 { -# int nx; -# int ny; -# float * data = NULL; -# size_t size; -# }; - -# struct clip_image_u8_batch { -# struct clip_image_u8 * data; -# size_t size; -# }; - -# struct clip_image_f32_batch { -# struct clip_image_f32 * data; -# size_t size; -# }; - -# struct clip_image_u8 * make_clip_image_u8(); -# struct clip_image_f32 * make_clip_image_f32(); -# CLIP_API void clip_image_u8_free(clip_image_u8 * img); -# CLIP_API void clip_image_f32_free(clip_image_f32 * img); -# CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); -# /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ -# CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); - -# bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square); -# bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec); - -# bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs, -# float * vec); - -# bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype); \ No newline at end of file diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f5ca054..aa23412 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f5ca054855dea83f424003162f26de376e5643f6 +Subproject commit aa2341298924ac89778252015efcb792f2df1e20 From 36b843228f04bd09b642d1500bdf2910f6196c8f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Feb 2024 03:47:40 -0500 Subject: [PATCH 35/38] misc: fix makefile build commands --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ff1484c..e2ce4d0 100644 --- a/Makefile +++ b/Makefile @@ -19,10 +19,10 @@ build.opencl: CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e . build.openblas: - CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e . + CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" python3 -m pip install --verbose -e . build.blis: - CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" python3 -m pip install --verbose -e . + CMAKE_ARGS="-DLLAMA_BLAS=on -DLLAMA_BLAS_VENDOR=FLAME" python3 -m pip install --verbose -e . build.metal: CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install --verbose -e . 
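
Note: the corrected `build.openblas` and `build.blis` targets above only change which `CMAKE_ARGS` get forwarded to `pip install`; the same flags can be exported manually when installing `llama-cpp-python` outside the Makefile. A quick way to confirm that a rebuilt wheel actually picked up a BLAS backend is to inspect llama.cpp's system-info string through the low-level binding. This is an illustrative sketch rather than anything the patches add, and the exact set of flags in the string varies with the vendored llama.cpp revision.

```python
# Sanity check: inspect the compile-time feature flags of an installed wheel.
import llama_cpp

# llama_print_system_info() returns llama.cpp's feature string as bytes,
# e.g. "AVX = 1 | AVX2 = 1 | ... | BLAS = 1 | ...".
info = llama_cpp.llama_print_system_info().decode("utf-8")
print(info)

if "BLAS = 1" not in info:
    print("warning: this build does not appear to include a BLAS backend")
```
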
From d7a67917ba5b601e146377c6d877893dc49bba83 Mon Sep 17 00:00:00 2001 From: Douglas Hanley Date: Wed, 14 Feb 2024 03:26:09 -0600 Subject: [PATCH 36/38] feat: Support batch embeddings (#1186) * handle batched embeddings * fix normalization issue * fix type hints, ensure no breaking changes to embed * Clear kv cache / reset internal state after embedding complete --------- Co-authored-by: Andrei --- llama_cpp/_internals.py | 22 +++++++ llama_cpp/llama.py | 135 ++++++++++++++++++++++++++++++---------- 2 files changed, 123 insertions(+), 34 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 9473d35..c60fdff 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -510,6 +510,14 @@ class _LlamaBatch: self._llama_batch_free(self.batch) self.batch = None + def n_tokens(self) -> int: + assert self.batch is not None + return self.batch.n_tokens + + def reset(self): + assert self.batch is not None + self.batch.n_tokens = 0 + def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool): assert self.batch is not None n_tokens = len(batch) @@ -522,6 +530,20 @@ class _LlamaBatch: self.batch.logits[i] = logits_all self.batch.logits[n_tokens - 1] = True + def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool): + assert self.batch is not None + n_tokens = len(batch) + n_tokens0 = self.batch.n_tokens + self.batch.n_tokens += n_tokens + for i in range(n_tokens): + j = n_tokens0 + i + self.batch.token[j] = batch[i] + self.batch.pos[j] = i + self.batch.seq_id[j][0] = seq_id + self.batch.n_seq_id[j] = 1 + self.batch.logits[j] = logits_all + self.batch.logits[n_tokens - 1] = True + class _LlamaTokenDataArray: def __init__(self, *, n_vocab: int): diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 8d726d3..3e09a20 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -717,10 +717,53 @@ class Llama: Returns: An embedding object. """ - assert self._ctx.ctx is not None assert self._model.model is not None model_name: str = model if model is not None else self.model_path + # get numeric embeddings + embeds: List[List[float]] + total_tokens: int + embeds, total_tokens = self.embed(input, return_count=True) # type: ignore + + # convert to CreateEmbeddingResponse + data: List[Embedding] = [ + { + "object": "embedding", + "embedding": emb, + "index": idx, + } + for idx, emb in enumerate(embeds) + ] + + return { + "object": "list", + "data": data, + "model": model_name, + "usage": { + "prompt_tokens": total_tokens, + "total_tokens": total_tokens, + }, + } + + def embed( + self, + input: Union[str, List[str]], + normalize: bool = True, + truncate: bool = True, + return_count: bool = False, + ): + """Embed a string. + + Args: + input: The utf-8 encoded string to embed. 
+ + Returns: + A list of embeddings + """ + assert self._ctx.ctx is not None + n_embd = self.n_embd() + n_ctx = self.n_ctx() + if self.context_params.embedding == False: raise RuntimeError( "Llama model must be created with embedding=True to call this method" @@ -734,48 +777,72 @@ class Llama: else: inputs = input - data: List[Embedding] = [] + # reset batch + self._batch.reset() + + # decode and fetch embeddings + data: List[List[float]] = [] + def decode_batch(sizes: List[int]): + assert self._ctx.ctx is not None + llama_cpp.llama_kv_cache_clear(self._ctx.ctx) + self._ctx.decode(self._batch) + self._batch.reset() + + # store embeddings + for i, s in enumerate(sizes): + embedding = llama_cpp.llama_get_embeddings_ith(self._ctx.ctx, i)[ + :n_embd + ] + norm = np.linalg.norm(embedding) if normalize else s + embedding: List[float] = [v / float(norm) for v in embedding] + data.append(embedding) + + # init state total_tokens = 0 - for index, input in enumerate(inputs): - tokens = self.tokenize(input.encode("utf-8"), special=True) - self.reset() - self.eval(tokens) + t_batch = 0 + s_sizes: List[int] = [] + + # accumulate batches and encode + for text in inputs: + tokens = self.tokenize(text.encode("utf-8")) + if truncate: + tokens = tokens[:n_ctx] + n_tokens = len(tokens) total_tokens += n_tokens - embedding = llama_cpp.llama_get_embeddings(self._ctx.ctx)[ - : llama_cpp.llama_n_embd(self._model.model) - ] - data.append( - { - "object": "embedding", - "embedding": embedding, - "index": index, - } - ) + # check for overrun + if n_tokens > n_ctx: + raise ValueError( + f"Requested tokens ({n_tokens}) exceed context window of {n_ctx}" + ) + + # time to eval batch + if t_batch + n_tokens > self._n_ctx: + decode_batch(s_sizes) + t_batch = 0 + s_sizes = [] + + # add to batch + self._batch.add_sequence(tokens, len(s_sizes), False) + t_batch += n_tokens + s_sizes.append(n_tokens) + + # hanlde last batch + decode_batch(s_sizes) + if self.verbose: llama_cpp.llama_print_timings(self._ctx.ctx) - return { - "object": "list", - "data": data, - "model": model_name, - "usage": { - "prompt_tokens": total_tokens, - "total_tokens": total_tokens, - }, - } + output = data[0] if isinstance(input, str) else data - def embed(self, input: str) -> List[float]: - """Embed a string. + llama_cpp.llama_kv_cache_clear(self._ctx.ctx) + self.reset() - Args: - input: The utf-8 encoded string to embed. 
- - Returns: - A list of embeddings - """ - return list(map(float, self.create_embedding(input)["data"][0]["embedding"])) + if return_count: + return output, total_tokens + else: + return output def _create_completion( self, From c336f782693c447a13da250ee12facb535708981 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Feb 2024 04:27:30 -0500 Subject: [PATCH 37/38] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index aa23412..8084d55 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit aa2341298924ac89778252015efcb792f2df1e20 +Subproject commit 8084d554406b767d36b3250b3b787462d5dd626f From ae71ad1a147b10c2c3ba99eb086521cddcc4fad4 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Feb 2024 04:31:42 -0500 Subject: [PATCH 38/38] Bump version --- CHANGELOG.md | 7 +++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dbc4dca..39b553f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.43] + +- feat: Update llama.cpp to ggerganov/llama.cpp@8084d554406b767d36b3250b3b787462d5dd626f +- feat: Support batch embeddings by @iamlemec in #1186 +- fix: submodule kompute is not included in sdist by @abetlen in 7dbbfdecadebe7750be650d9409959640ff9a460 +- fix: fix: Update openbuddy prompt format by @abetlen in 07a783779a62a4aac0b11161c7e0eb983ff215f8 + ## [0.2.42] - feat: Update llama.cpp to ggerganov/llama.cpp@ea9c8e11436ad50719987fa23a289c74b7b40d40 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 6e71792..e0bd254 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.42" \ No newline at end of file +__version__ = "0.2.43" \ No newline at end of file
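
Taken together, [PATCH 36] through [PATCH 38] ship batch embeddings in 0.2.43: `Llama.embed` now accepts either a single string or a list of strings, packs the tokenized inputs into shared batches, and returns one (by default normalized) vector per input, while `create_embedding` wraps the same path in an OpenAI-style response. A minimal usage sketch follows; the model path is a placeholder, and any GGUF model loaded with `embedding=True` behaves the same way.

```python
from llama_cpp import Llama

# Placeholder path; the model must be loaded with embedding=True,
# otherwise the embedding methods raise a RuntimeError.
llm = Llama(model_path="path/to/model.gguf", embedding=True, verbose=False)

# A single string returns one embedding vector (a list of floats).
vec = llm.embed("llama-cpp-python now decodes embeddings in batches")

# A list of strings returns one vector per input, decoded batch by batch.
vecs = llm.embed(["first sentence", "second sentence"])

# The OpenAI-compatible wrapper reports token usage across the whole request.
resp = llm.create_embedding(["first sentence", "second sentence"])
print(len(vec), len(vecs), resp["usage"]["total_tokens"])
```
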