From 5e3e67af47908919968f06b738321c32e646a97b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 6 Feb 2024 12:44:07 -0500 Subject: [PATCH 01/38] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 098f6d7..b08f22c 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 098f6d737b65134cf220d12b9b706e8cfc5e4610 +Subproject commit b08f22c882a1443e6b97081f3ce718a4d1a741f8 From 34f31040f610925552a66b3a033e31320b6f6ad8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 6 Feb 2024 12:47:59 -0500 Subject: [PATCH 02/38] Bump version --- CHANGELOG.md | 7 ++++++- llama_cpp/__init__.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9632210..5ce0b43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,11 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.39] + +- feat: Update llama.cpp to ggerganov/llama.cpp@b08f22c882a1443e6b97081f3ce718a4d1a741f8 +- fix: Fix destructor logging bugs by using llama_log_callback to avoid suppress_stdout_stderr by @abetlen in 59760c85eddc72dfcc1839f43760ef72c23d6874 + ## [0.2.38] - feat: Update llama.cpp to ggerganov/llama.cpp@1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915 - feat: Add speculative decoding by @abetlen in #1120 -- fix: Pass raise_exception and add_generation_prompt to jinja2 chat template 078cca0361bf5a94d2cf52ed04980d20e32d6f95 +- fix: Pass raise_exception and add_generation_prompt to jinja2 chat template by @abetlen in 078cca0361bf5a94d2cf52ed04980d20e32d6f95 ## [0.2.37] diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 94cd401..837e3c9 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.38" \ No newline at end of file +__version__ = "0.2.39" \ No newline at end of file From ce1277549012a33e5c2360f42bf53aaf1b95e528 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 6 Feb 2024 18:50:56 -0500 Subject: [PATCH 03/38] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b08f22c..213d143 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b08f22c882a1443e6b97081f3ce718a4d1a741f8 +Subproject commit 213d1439fadefe182f69c5f7e8dd3b4b6572ebcb From 901827013b732d74f1f67033062d13a6204a62bd Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Thu, 8 Feb 2024 09:07:03 +0800 Subject: [PATCH 04/38] feat: Integrate functionary v1.4 and v2 models + add custom tokenizer support to Llama class (#1078) * convert functionary-v1 chat handler to use hf autotokenizer * add hf_tokenizer + inteegrate functionary-v1.4 prompt template * integrate functionary v2 prompt template * update readme * set up parallel function calling wip * set up parallel function calling * Update README.md * Update README.md * refactor tokenizers * include old functionary handler for backward compatibility * add hf_tokenizer_path in server ModelSettings * convert functionary-v1 chat handler to use hf autotokenizer * add hf_tokenizer + inteegrate functionary-v1.4 prompt template * integrate functionary v2 prompt template * update readme * set up parallel function calling wip * resolve merge conflict * Update README.md * Update README.md * refactor tokenizers * include old functionary handler for backward compatibility * add 
hf_tokenizer_path in server ModelSettings * Cleanup PR, fix breaking changes * Use hf_pretrained_model_name_or_path for tokenizer * fix hf tokenizer in streaming * update README * refactor offset mapping --------- Co-authored-by: Andrei --- README.md | 19 +- llama_cpp/llama.py | 101 ++++++-- llama_cpp/llama_chat_format.py | 433 ++++++++++++++++++++++++++++++++- llama_cpp/server/model.py | 6 + 4 files changed, 525 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 4131bb3..bddef64 100644 --- a/README.md +++ b/README.md @@ -293,19 +293,16 @@ To constrain the response to a specific JSON Schema, you can use the `schema` pr The high-level API also provides a simple interface for function calling. -Note that the only model that supports full function calling at this time is "functionary". -The gguf-converted files for this model can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF) +The only set of models that supports full function calling at this time is [functionary](https://github.com/MeetKai/functionary). The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). Functionary is able to intelligently call functions and also analyze any provided function outputs to generate coherent responses. All v2 models of functionary supports **parallel function calling**. You can provide either `functionary-v1` or `functionary-v2` for the `chat_format` when initializing the Llama class. + +Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide HF Tokenizer for functionary. The `LlamaHFTokenizer` class can be initialized and passed into the Llama class. This will override the default llama.cpp tokenizer used in Llama class. The tokenizer files are already included in the respective HF repositories hosting the gguf files. ```python ->>> from llama_cpp import Llama ->>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", chat_format="functionary") +>>> from llama_cpp import Llama, LlamaHFTokenizer +>>> tokenizer = LlamaHFTokenizer.from_pretrained("path/to/functionary/") +>>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", tokenizer=tokenizer, chat_format="functionary-v2") >>> llm.create_chat_completion( messages = [ - { - "role": "system", - "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary" - - }, { "role": "user", "content": "Extract Jason is 25 years old" @@ -332,12 +329,12 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h } } }], - tool_choice=[{ + tool_choice={ "type": "function", "function": { "name": "UserDetail" } - }] + }, ) ``` diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 85943db..bad75df 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2,6 +2,7 @@ from __future__ import annotations import os import sys +import abc import uuid import time import multiprocessing @@ -14,11 +15,14 @@ from typing import ( Iterator, Deque, Callable, + Any, ) from collections import deque import ctypes +from llama_cpp.llama_types import List + from .llama_types import * from .llama_grammar import LlamaGrammar from .llama_cache import ( @@ -95,6 +99,8 @@ class Llama: chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None, # Speculative Decoding draft_model: Optional[LlamaDraftModel] = None, + # Tokenizer Override + tokenizer: Optional[BaseLlamaTokenizer] = None, # Misc verbose: bool = True, # Extra Params @@ -159,6 +165,7 @@ class Llama: chat_format: String specifying the chat format to use when calling create_chat_completion. chat_handler: Optional chat handler to use when calling create_chat_completion. draft_model: Optional draft model to use for speculative decoding. + tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp. verbose: Print verbose output to stderr. Raises: @@ -235,6 +242,7 @@ class Llama: self.n_threads_batch = n_threads_batch or max( multiprocessing.cpu_count() // 2, 1 ) + # Context Params self.context_params = llama_cpp.llama_context_default_params() self.context_params.seed = seed @@ -286,6 +294,10 @@ class Llama: self._model = _LlamaModel( path_model=self.model_path, params=self.model_params, verbose=self.verbose ) + + # Override tokenizer + self.tokenizer_ = tokenizer or LlamaTokenizer(self) + # Set the default value for the context and correct the batch if n_ctx == 0: n_ctx = self._model.n_ctx_train() @@ -431,18 +443,19 @@ class Llama: Returns: A list of tokens. """ - return self._model.tokenize(text, add_bos, special) + return self.tokenizer_.tokenize(text, add_bos, special) - def detokenize(self, tokens: List[int]) -> bytes: + def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: """Detokenize a list of tokens. Args: tokens: The list of tokens to detokenize. + prev_tokens: The list of previous tokens. Offset mapping will be performed if provided Returns: The detokenized string. """ - return self._model.detokenize(tokens) + return self.tokenizer_.detokenize(tokens, prev_tokens) def set_cache(self, cache: Optional[BaseLlamaCache]): """Set the cache. 
@@ -935,7 +948,8 @@ class Llama: if stream: remaining_tokens = completion_tokens[returned_tokens:] - remaining_text = self.detokenize(remaining_tokens) + prev_tokens = completion_tokens[:returned_tokens] + remaining_text = self.detokenize(completion_tokens, prev_tokens) remaining_length = len(remaining_text) # We want to avoid yielding any characters from @@ -957,13 +971,13 @@ class Llama: for token in remaining_tokens: if token == self.token_bos(): continue - token_end_position += len(self.detokenize([token])) + token_end_position += len(remaining_text) # Check if stop sequence is in the token if token_end_position > ( remaining_length - first_stop_position ): break - token_str = self.detokenize([token]).decode( + token_str = remaining_text.decode( "utf-8", errors="ignore" ) text_offset = len(prompt) + len( @@ -988,11 +1002,7 @@ class Llama: } top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { - "tokens": [ - self.detokenize([token]).decode( - "utf-8", errors="ignore" - ) - ], + "tokens": [token_str], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], "top_logprobs": [top_logprob], @@ -1005,9 +1015,7 @@ class Llama: "model": model_name, "choices": [ { - "text": self.detokenize([token]).decode( - "utf-8", errors="ignore" - ), + "text": token_str, "index": 0, "logprobs": logprobs_or_none, "finish_reason": None, @@ -1019,7 +1027,7 @@ class Llama: decode_success = False for i in range(1, len(remaining_tokens) + 1): try: - bs = self.detokenize(remaining_tokens[:i]) + bs = remaining_text ts = bs.decode("utf-8") decode_success = True break @@ -1055,6 +1063,7 @@ class Llama: if len(completion_tokens) >= max_tokens: text = self.detokenize(completion_tokens) + finish_reason = "length" break @@ -1693,8 +1702,8 @@ class Llama: """Return the vocabulary size.""" return self._model.n_vocab() - def tokenizer(self) -> "LlamaTokenizer": - """Return the tokenizer for this model.""" + def tokenizer(self) -> LlamaTokenizer: + """Return the llama tokenizer for this model.""" return LlamaTokenizer(self) def token_eos(self) -> int: @@ -1738,23 +1747,71 @@ class Llama: return longest_prefix -class LlamaTokenizer: +class BaseLlamaTokenizer(abc.ABC): + @abc.abstractmethod + def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: + raise NotImplementedError + + @abc.abstractmethod + def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: + raise NotImplementedError + + +class LlamaTokenizer(BaseLlamaTokenizer): def __init__(self, llama: Llama): self.llama = llama + self._model = llama._model # type: ignore - def encode(self, text: str, add_bos: bool = True) -> List[int]: - return self.llama.tokenize( - text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=True + def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: + return self._model.tokenize(text, add_bos=add_bos, special=special) + + def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: + if prev_tokens is not None: + return self._model.detokenize(tokens[len(prev_tokens):]) + else: + return self._model.detokenize(tokens) + + def encode(self, text: str, add_bos: bool = True, special: bool = True) -> List[int]: + return self.tokenize( + text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special ) def decode(self, tokens: List[int]) -> str: - return self.llama.detokenize(tokens).decode("utf-8", errors="ignore") + return 
self.detokenize(tokens).decode("utf-8", errors="ignore") @classmethod def from_ggml_file(cls, path: str) -> "LlamaTokenizer": return cls(Llama(model_path=path, vocab_only=True)) +class LlamaHFTokenizer(BaseLlamaTokenizer): + def __init__(self, hf_tokenizer: Any): + self.hf_tokenizer = hf_tokenizer + + def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: + return self.hf_tokenizer.encode(text.decode("utf-8", errors="ignore"), add_special_tokens=special) + + def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: + if prev_tokens is not None: + text = self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") + prev_text = self.hf_tokenizer.decode(prev_tokens).encode("utf-8", errors="ignore") + return text[len(prev_text):] + else: + return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer": + try: + from transformers import AutoTokenizer + except ImportError: + raise ImportError( + "The `transformers` library is required to use the `HFTokenizer`." + "You can install it with `pip install transformers`." + ) + hf_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path) + return cls(hf_tokenizer) + + class LlamaState: def __init__( self, diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 08f991b..2e42041 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -4,7 +4,9 @@ import os import json import ctypes import dataclasses -from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, Protocol +import random +import string +from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol import jinja2 @@ -1332,6 +1334,435 @@ def functionary_chat_handler( ) +@register_chat_completion_handler("functionary-v1") +@register_chat_completion_handler("functionary-v2") +def functionary_v1_v2_chat_handler( + llama: llama.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + min_p: float = 0.05, + typical_p: float = 1.0, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, + max_tokens: Optional[int] = None, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[llama.LogitsProcessorList] = None, + grammar: Optional[llama.LlamaGrammar] = None, + **kwargs, # type: ignore +) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: + SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" + + tokenizer = llama.tokenizer_ + assert hasattr(tokenizer, "hf_tokenizer"), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" + from transformers import AutoTokenizer + + if "<|START_OF_FUNCTION_CALL|>" in tokenizer.hf_tokenizer.additional_special_tokens: + version = "v1" + END_SYSTEM_TOKEN = "<|END_OF_SYSTEM|>" + END_USER_TOKEN = "<|END_OF_USER|>" + END_ASSISTANT_TOKEN = "<|END_OF_ASSISTANT|>" + END_FUNCTION_RESULT_TOKEN = "<|END_OF_FUNCTION_RESULT|>" + START_FUNCTION_CALL_TOKEN = "<|START_OF_FUNCTION_CALL|>" + END_FUNCTION_CALL_TOKEN = "<|END_OF_FUNCTION_CALL|>" + else: + version = "v2" + RECIPIENT_TOKEN = "<|recipient|>" + FROM_TOKEN = "<|from|>" + STOP_TOKEN = "<|stop|>" + CONTENT_TOKEN = "<|content|>" + + def generate_type_definition( + param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs + ) -> str: + indent = " " * indent_level + if "$ref" in param: + # Reference to a shared definition + ref_name = param["$ref"].split("/")[ + -1 + ] # Extract the type name from the reference + return ref_name + elif param.get("type") == "array": + items = param.get("items", {}) + item_type = generate_type_definition(items, indent_level + 1, shared_defs) + return f"Array<{item_type}>" + elif param.get("type") == "object": + properties = param.get("properties", {}) + nested_schema = "{\n" + for nested_param_name, nested_param in properties.items(): + nested_param_type = generate_type_definition( + nested_param, indent_level + 1, shared_defs + ) + nested_schema += ( + f"{indent} {nested_param_name}: {nested_param_type},\n" + ) + nested_schema += indent + "}" + return nested_schema + elif "enum" in param: + # Enum type + return " | ".join([f'"{enum_value}"' for enum_value in param["enum"]]) + else: + # Simple type + return param.get("type", "any") + + def generate_shared_definitions(shared_defs, indent_level: int) -> str: + indent = " " * indent_level + shared_definitions = "" + for def_name, def_properties in shared_defs.items(): + shared_definitions += f"{indent}type {def_name} = " + if def_properties.get("type") == "object": + shared_definitions += generate_type_definition( + def_properties, indent_level, shared_defs + ) + elif "enum" in def_properties: + # Enum type + shared_definitions += " | ".join( + [f'"{enum_value}"' for enum_value in def_properties["enum"]] + ) + shared_definitions += ";\n" + return shared_definitions + + def generate_schema_from_functions(functions, namespace="functions") -> str: + schema = ( + "// Supported function definitions that should be called when necessary.\n" + ) + schema += f"namespace {namespace} {{\n\n" + + # Generate shared definitions + shared_definitions = {} + for function in functions: + parameters = function.get("parameters", {}) + shared_definitions.update(parameters.get("$defs", {})) + + schema += generate_shared_definitions(shared_definitions, 1) + + for function in functions: + function_name = function["name"] + description = function.get("description", "") + parameters = function.get("parameters", {}) + required_params = parameters.get("required", []) + + schema += f"// {description}\n" + schema += f"type {function_name} = (_: {{\n" + + for param_name, param in parameters.get("properties", {}).items(): + param_description = param.get("description", "") + param_type = generate_type_definition(param, 2, shared_definitions) + optional_indicator = "" if param_name in required_params else "?" 
+ schema += f"// {param_description}\n" + schema += f"{param_name}{optional_indicator}: {param_type},\n" + schema += "}) => any;\n\n" + + schema += "}} // namespace {}".format(namespace) + return schema + + def prepare_messages_for_inference( + messages: List[llama_types.ChatCompletionRequestMessage], + tokenizer: AutoTokenizer, + version: Literal["v1", "v2"], + functions: Optional[List[llama_types.ChatCompletionFunctions]] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + ): + all_messages: List[llama_types.ChatCompletionRequestMessage] = [] + if functions is not None: + all_messages.append( + llama_types.ChatCompletionRequestSystemMessage( + role="system", content=generate_schema_from_functions(functions) + ) + ) + elif tools is not None: + all_messages.append( + llama_types.ChatCompletionRequestSystemMessage( + role="system", + content=generate_schema_from_functions( + [ + tool["function"] + for tool in tools + if tool["type"] == "function" + ] + ), + ) + ) + + all_messages.append( + llama_types.ChatCompletionRequestSystemMessage( + role="system", content=SYSTEM_MESSAGE + ) + ) + + for message in messages: + # Function call responses + if message["role"] == "function" and "name" in message: + message["name"] = f"functions.{message['name']}" + # Function call requests by assistant + if "function_call" in message: + message["function_call"][ + "name" + ] = f"functions.{message['function_call']['name']}" + all_messages.append(message) + + if version == "v1": + suffix = "assistant:\n" + else: + suffix = "<|from|>assistant\n<|recipient|>" + + return tokenizer.hf_tokenizer.apply_chat_template(all_messages, tokenize=False) + suffix + + if tools is not None: + functions = [tool["function"] for tool in tools if tool["type"] == "function"] + + if tool_choice is not None: + function_call = ( + tool_choice if isinstance(tool_choice, str) else tool_choice["function"] + ) + + prompt = prepare_messages_for_inference(messages, tokenizer, version, functions, tools) + + # If no tools/functions are provided + if function_call is None and (functions is None or len(functions) == 0): + if version == "v1": + stop = END_ASSISTANT_TOKEN + else: + stop = STOP_TOKEN + prompt += "all\n<|content|>" + + completion_or_completion_chunks = llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=stop, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + ) + return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore + + assert stream is False # TODO: support stream mode + + def get_grammar(function_call): + function_body = None + for function in functions or []: + if function["name"] == function_call: + function_body = function["parameters"] + break + for tool in tools or []: + if tool["type"] == "function" and tool["function"]["name"] == function_call: + function_body = tool["function"]["parameters"] + break + + try: + with suppress_stdout_stderr(disable=llama.verbose): + grammar_text = llama_grammar.json_schema_to_gbnf( + json.dumps(function_body) + ) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.json_schema_to_gbnf(json.dumps(function_body)) + ) + print(grammar_text) + except 
Exception as e: + if llama.verbose: + print( + "Failed to parse function body as JSON schema, falling back to default grammar" + ) + print(e) + with suppress_stdout_stderr(disable=llama.verbose): + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF + ) + + return grammar + + def create_completion(stop): + completion: llama_types.Completion = llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=stop, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + ) + + return completion + + function_calls, function_bodies = [], [] + + if version == "v1": + # If no or "auto" tool_choice/function_call + if function_call is None or ( + isinstance(function_call, str) and function_call == "auto" + ): + stops = ["\n", END_ASSISTANT_TOKEN] + # If tool_choice/function_call is "none" + elif isinstance(function_call, str) and function_call == "none": + prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + stops = END_ASSISTANT_TOKEN + # If tool_choice/function_call is provided + elif isinstance(function_call, dict): + prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n" + stops = END_FUNCTION_CALL_TOKEN + function_call = function_call["name"] + function_calls.append(function_call) + grammar = get_grammar(function_call) + else: + prompt = prompt + stops = ["\n", END_ASSISTANT_TOKEN] + + completion = create_completion(stop=stops) + completion_text = completion["choices"][0]["text"] + + # If the generation does not involve a function call + if START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN not in completion_text: + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + # If the generation involves a function call in completion, generate the parameters + elif START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN in completion_text: + prompt += completion_text.replace(f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN) + "\n" + function_calls.append(completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()) + grammar = get_grammar(function_calls[-1]) + completion = create_completion(stop=END_FUNCTION_CALL_TOKEN) + function_bodies.append(completion["choices"][0]["text"].strip()) + # If the prompt involves a function call, just append generated parameters to function_bodies + else: + function_bodies.append(completion_text.strip()) + else: + # Loop until all parallel function calls are generated + while True: + # If no or "auto" tool_choice/function_call + if function_call is None or ( + isinstance(function_call, str) and function_call == "auto" + ): + grammar = None + stops = CONTENT_TOKEN + # If tool_choice/function_call is "none" + elif isinstance(function_call, str) and function_call == "none": + prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + "all\n<|content|>" + stops = STOP_TOKEN + # If tool_choice/function_call is provided + elif isinstance(function_call, dict): + prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" + stops = STOP_TOKEN + function_call = function_call["name"] + function_calls.append(function_call) + grammar = get_grammar(function_call) + else: + prompt = prompt 
+ stops = STOP_TOKEN + + completion = create_completion(stop=stops) + completion_text = completion["choices"][0]["text"] + + # If the generation does not involve a function call + if prompt.endswith("all\n<|content|>") and not completion_text.startswith("all"): + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + # Generate model response if the model decides not to call any function + elif (prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all")): + prompt += completion_text + CONTENT_TOKEN + completion = create_completion(stop=STOP_TOKEN) + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + # Generate parameters if model decides to call a function + elif prompt.endswith(RECIPIENT_TOKEN): + function_calls.append(completion_text[:-1]) + grammar = get_grammar(function_calls[-1]) + completion = create_completion(stop=[STOP_TOKEN, "\n"]) + function_bodies.append(completion["choices"][0]["text"].strip()) + prompt += f"{function_calls[-1]}\n{CONTENT_TOKEN}{function_bodies[-1]}" + grammar = None + + # Try to generate the beginning of next turn + # If empty completion, break from loop + next_turn_completion_text = create_completion( + stop=[STOP_TOKEN, RECIPIENT_TOKEN] + )["choices"][0]["text"] + if len(next_turn_completion_text) > 0: + prompt += f"\n{FROM_TOKEN}assistant\n{RECIPIENT_TOKEN}" + else: + break + # Break from loop if tool_choice/function_call is provided as a dict + else: + function_bodies.append(completion_text.strip()) + break + + assert "usage" in completion + assert len(function_calls) > 0 + assert len(function_calls) == len(function_bodies) + + tool_calls = [] + for function_call, function_body in zip(function_calls, function_bodies): + tool_calls.append( + { + "id": "call_" + "".join( + [random.choice(string.ascii_letters + string.digits) for _ in range(24)] + ), + "type": "function", + "function": { + "name": function_call, + "arguments": function_body, + }, + } + ) + + # TODO: support stream mode + return llama_types.CreateChatCompletionResponse( + id="chat" + completion["id"], + object="chat.completion", + created=completion["created"], + model=completion["model"], + choices=[ + { + "index": 0, + "message": { + "role": "assistant", + "content": None, + "function_call": { + "name": tool_calls[0]["function"]["name"], + "arguments": tool_calls[0]["function"]["arguments"], + }, + "tool_calls": tool_calls, + }, + "finish_reason": "tool_calls", + } + ], + usage=completion["usage"], + ) + + class Llava15ChatHandler: _clip_free = None diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 925ab99..6d8ec24 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -93,6 +93,10 @@ class LlamaProxy: ) ) + tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None + if settings.hf_pretrained_model_name_or_path is not None: + tokenizer = llama_cpp.LlamaHFTokenizer.from_pretrained(settings.hf_pretrained_model_name_or_path) + draft_model = None if settings.draft_model is not None: draft_model = llama_speculative.LlamaPromptLookupDecoding( @@ -156,6 +160,8 @@ class LlamaProxy: chat_handler=chat_handler, # Speculative Decoding draft_model=draft_model, + # Tokenizer + tokenizer=tokenizer, # Misc verbose=settings.verbose, ) From 2ef7ba3aed572609fbf7292adb125e41e5279a15 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Feb 2024 01:07:44 -0500 Subject: [PATCH 05/38] misc: rename grammar test --- tests/{test_grammar.py => test_llama_grammar.py} | 0 1 file changed, 0 insertions(+), 0 
deletions(-) rename tests/{test_grammar.py => test_llama_grammar.py} (100%) diff --git a/tests/test_grammar.py b/tests/test_llama_grammar.py similarity index 100% rename from tests/test_grammar.py rename to tests/test_llama_grammar.py From b5fca911b57a23565c55c31802fb9603a0c6497c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Feb 2024 01:08:18 -0500 Subject: [PATCH 06/38] feat: Move tokenizer to own module --- llama_cpp/llama.py | 69 ++------------------------ llama_cpp/llama_tokenizer.py | 96 ++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 65 deletions(-) create mode 100644 llama_cpp/llama_tokenizer.py diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index bad75df..30ae3b5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2,7 +2,6 @@ from __future__ import annotations import os import sys -import abc import uuid import time import multiprocessing @@ -15,7 +14,6 @@ from typing import ( Iterator, Deque, Callable, - Any, ) from collections import deque @@ -31,6 +29,10 @@ from .llama_cache import ( LlamaDiskCache, # type: ignore LlamaRAMCache, # type: ignore ) +from .llama_tokenizer import ( + BaseLlamaTokenizer, + LlamaTokenizer +) import llama_cpp.llama_cpp as llama_cpp import llama_cpp.llama_chat_format as llama_chat_format @@ -1747,69 +1749,6 @@ class Llama: return longest_prefix -class BaseLlamaTokenizer(abc.ABC): - @abc.abstractmethod - def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: - raise NotImplementedError - - @abc.abstractmethod - def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: - raise NotImplementedError - - -class LlamaTokenizer(BaseLlamaTokenizer): - def __init__(self, llama: Llama): - self.llama = llama - self._model = llama._model # type: ignore - - def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: - return self._model.tokenize(text, add_bos=add_bos, special=special) - - def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: - if prev_tokens is not None: - return self._model.detokenize(tokens[len(prev_tokens):]) - else: - return self._model.detokenize(tokens) - - def encode(self, text: str, add_bos: bool = True, special: bool = True) -> List[int]: - return self.tokenize( - text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special - ) - - def decode(self, tokens: List[int]) -> str: - return self.detokenize(tokens).decode("utf-8", errors="ignore") - - @classmethod - def from_ggml_file(cls, path: str) -> "LlamaTokenizer": - return cls(Llama(model_path=path, vocab_only=True)) - - -class LlamaHFTokenizer(BaseLlamaTokenizer): - def __init__(self, hf_tokenizer: Any): - self.hf_tokenizer = hf_tokenizer - - def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]: - return self.hf_tokenizer.encode(text.decode("utf-8", errors="ignore"), add_special_tokens=special) - - def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes: - if prev_tokens is not None: - text = self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") - prev_text = self.hf_tokenizer.decode(prev_tokens).encode("utf-8", errors="ignore") - return text[len(prev_text):] - else: - return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer": - try: - from transformers import AutoTokenizer - except ImportError: - raise 
ImportError( - "The `transformers` library is required to use the `HFTokenizer`." - "You can install it with `pip install transformers`." - ) - hf_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path) - return cls(hf_tokenizer) class LlamaState: diff --git a/llama_cpp/llama_tokenizer.py b/llama_cpp/llama_tokenizer.py new file mode 100644 index 0000000..0ad3c3a --- /dev/null +++ b/llama_cpp/llama_tokenizer.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +import abc +from typing import ( + List, + Optional, + Any, +) + +import llama_cpp +from llama_cpp.llama_types import List + + +class BaseLlamaTokenizer(abc.ABC): + @abc.abstractmethod + def tokenize( + self, text: bytes, add_bos: bool = True, special: bool = True + ) -> List[int]: + raise NotImplementedError + + @abc.abstractmethod + def detokenize( + self, tokens: List[int], prev_tokens: Optional[List[int]] = None + ) -> bytes: + raise NotImplementedError + + +class LlamaTokenizer(BaseLlamaTokenizer): + def __init__(self, llama: llama_cpp.Llama): + self.llama = llama + self._model = llama._model # type: ignore + + def tokenize( + self, text: bytes, add_bos: bool = True, special: bool = True + ) -> List[int]: + return self._model.tokenize(text, add_bos=add_bos, special=special) + + def detokenize( + self, tokens: List[int], prev_tokens: Optional[List[int]] = None + ) -> bytes: + if prev_tokens is not None: + return self._model.detokenize(tokens[len(prev_tokens) :]) + else: + return self._model.detokenize(tokens) + + def encode( + self, text: str, add_bos: bool = True, special: bool = True + ) -> List[int]: + return self.tokenize( + text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special + ) + + def decode(self, tokens: List[int]) -> str: + return self.detokenize(tokens).decode("utf-8", errors="ignore") + + @classmethod + def from_ggml_file(cls, path: str) -> "LlamaTokenizer": + return cls(llama_cpp.Llama(model_path=path, vocab_only=True)) + + +class LlamaHFTokenizer(BaseLlamaTokenizer): + def __init__(self, hf_tokenizer: Any): + self.hf_tokenizer = hf_tokenizer + + def tokenize( + self, text: bytes, add_bos: bool = True, special: bool = True + ) -> List[int]: + return self.hf_tokenizer.encode( + text.decode("utf-8", errors="ignore"), add_special_tokens=special + ) + + def detokenize( + self, tokens: List[int], prev_tokens: Optional[List[int]] = None + ) -> bytes: + if prev_tokens is not None: + text = self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") + prev_text = self.hf_tokenizer.decode(prev_tokens).encode( + "utf-8", errors="ignore" + ) + return text[len(prev_text) :] + else: + return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer": + try: + from transformers import AutoTokenizer + except ImportError: + raise ImportError( + "The `transformers` library is required to use the `HFTokenizer`." + "You can install it with `pip install transformers`." 
+ ) + hf_tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path + ) + return cls(hf_tokenizer) From 85d3374b4d5892e51e27b9973f9ce3623e076e2a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Feb 2024 01:13:28 -0500 Subject: [PATCH 07/38] fix: broken import --- llama_cpp/server/model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 6d8ec24..5308dc2 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -6,6 +6,7 @@ from typing import Dict, Optional, Union, List import llama_cpp import llama_cpp.llama_speculative as llama_speculative +import llama_cpp.llama_tokenizer as llama_tokenizer from llama_cpp.server.settings import ModelSettings @@ -95,7 +96,7 @@ class LlamaProxy: tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None if settings.hf_pretrained_model_name_or_path is not None: - tokenizer = llama_cpp.LlamaHFTokenizer.from_pretrained(settings.hf_pretrained_model_name_or_path) + tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained(settings.hf_pretrained_model_name_or_path) draft_model = None if settings.draft_model is not None: From dfc1b173414b550f8f5be1b94430af16b53a63cb Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 8 Feb 2024 23:38:12 -0500 Subject: [PATCH 08/38] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 213d143..8e6a9d2 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 213d1439fadefe182f69c5f7e8dd3b4b6572ebcb +Subproject commit 8e6a9d2de0096af7120606c74ee2f26684e87b41 From e16f06e6eb555947f4404c20732921c8ea76c4f7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 9 Feb 2024 02:02:13 -0500 Subject: [PATCH 09/38] fix: revert _create_completions. 
--- llama_cpp/llama.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index bad75df..f445fb0 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -948,8 +948,7 @@ class Llama: if stream: remaining_tokens = completion_tokens[returned_tokens:] - prev_tokens = completion_tokens[:returned_tokens] - remaining_text = self.detokenize(completion_tokens, prev_tokens) + remaining_text = self.detokenize(remaining_tokens) remaining_length = len(remaining_text) # We want to avoid yielding any characters from @@ -971,13 +970,13 @@ class Llama: for token in remaining_tokens: if token == self.token_bos(): continue - token_end_position += len(remaining_text) + token_end_position += len(self.detokenize([token])) # Check if stop sequence is in the token if token_end_position > ( remaining_length - first_stop_position ): break - token_str = remaining_text.decode( + token_str = self.detokenize([token]).decode( "utf-8", errors="ignore" ) text_offset = len(prompt) + len( @@ -1002,7 +1001,11 @@ class Llama: } top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { - "tokens": [token_str], + "tokens": [ + self.detokenize([token]).decode( + "utf-8", errors="ignore" + ) + ], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], "top_logprobs": [top_logprob], @@ -1015,7 +1018,9 @@ class Llama: "model": model_name, "choices": [ { - "text": token_str, + "text": self.detokenize([token]).decode( + "utf-8", errors="ignore" + ), "index": 0, "logprobs": logprobs_or_none, "finish_reason": None, @@ -1027,7 +1032,7 @@ class Llama: decode_success = False for i in range(1, len(remaining_tokens) + 1): try: - bs = remaining_text + bs = self.detokenize(remaining_tokens[:i]) ts = bs.decode("utf-8") decode_success = True break @@ -1063,7 +1068,6 @@ class Llama: if len(completion_tokens) >= max_tokens: text = self.detokenize(completion_tokens) - finish_reason = "length" break From 63b0c37836169baa71c04484e5344294928bd359 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 9 Feb 2024 13:36:58 -0500 Subject: [PATCH 10/38] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8e6a9d2..4b7b38b 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8e6a9d2de0096af7120606c74ee2f26684e87b41 +Subproject commit 4b7b38bef5addbd31f453871d79647fbae6bec8a From 19b55ad3e55cc707938b191ab7779f5fd69cd0c6 Mon Sep 17 00:00:00 2001 From: Douglas Hanley Date: Sun, 11 Feb 2024 12:53:59 -0600 Subject: [PATCH 11/38] feat: use gpu backend for clip if available (#1175) --- CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 795dad7..b4df8ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,14 @@ if (LLAMA_BUILD) ) if (LLAVA_BUILD) + if (LLAMA_CUBLAS) + add_compile_definitions(GGML_USE_CUBLAS) + endif() + + if (LLAMA_METAL) + add_compile_definitions(GGML_USE_METAL) + endif() + # Building llava add_subdirectory(vendor/llama.cpp/examples/llava) set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava") From 918ff27e501f621ab7d511a9c71c0783d870082c Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 12 Feb 2024 00:25:15 +0530 Subject: [PATCH 12/38] docs: Set the correct command for compiling with syscl support (#1172) --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md 
index bddef64..59a7cd4 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,8 @@ CMAKE_ARGS="-DLLAMA_KOMPUTE=on" pip install llama-cpp-python To install with SYCL support, set the `LLAMA_SYCL=on` environment variable before installing: ```bash -CMAKE_ARGS="-DLLAMA_SYCL=on" pip install llama-cpp-python +source /opt/intel/oneapi/setvars.sh +CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pip install llama-cpp-python ``` ### Windows Notes From a05d90446fea83426fe8dc0d6c78afe6c3dd0894 Mon Sep 17 00:00:00 2001 From: Connor Date: Sun, 11 Feb 2024 10:57:57 -0800 Subject: [PATCH 13/38] fix: Circular dependancy preventing early Llama object free (#1176) commit 901827013b732d74f1f67033062d13a6204a62bd introduced a cyclic dependency within Llama objects. That change causes old models to linger in memory longer than necessary, thereby creating memory bloat in most applications attempting to switch between models at runtime. This patch simply removes the problematic line, allowing models to deallocate without relying on GC. One might also consider combining `weakref.ref` with a `@property` if the `llama` attribute is absolutely necessary to expose in the tokenizer class. --- llama_cpp/llama_tokenizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_cpp/llama_tokenizer.py b/llama_cpp/llama_tokenizer.py index 0ad3c3a..c2aad47 100644 --- a/llama_cpp/llama_tokenizer.py +++ b/llama_cpp/llama_tokenizer.py @@ -27,7 +27,6 @@ class BaseLlamaTokenizer(abc.ABC): class LlamaTokenizer(BaseLlamaTokenizer): def __init__(self, llama: llama_cpp.Llama): - self.llama = llama self._model = llama._model # type: ignore def tokenize( From 936867063984dc695be71eab21115f183dc4d33b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 11 Feb 2024 14:02:46 -0500 Subject: [PATCH 14/38] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4b7b38b..97a3365 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4b7b38bef5addbd31f453871d79647fbae6bec8a +Subproject commit 97a336507ed9b971d72262bec7e2b8b7016a054a From 69413ce08e7119b0b10ddb1a52eb8eaa2a865f7f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 11 Feb 2024 19:00:17 -0500 Subject: [PATCH 15/38] Update llama.cpp --- llama_cpp/llama_cpp.py | 2 ++ vendor/llama.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index da2a7f3..2724edd 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -139,9 +139,11 @@ llama_seq_id = c_int32 # enum llama_vocab_type { # LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece # LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding +# LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece # }; LLAMA_VOCAB_TYPE_SPM = 0 LLAMA_VOCAB_TYPE_BPE = 1 +LLAMA_VOCAB_TYPE_WPM = 2 # enum llama_token_type { diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 97a3365..3bdc4cd 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 97a336507ed9b971d72262bec7e2b8b7016a054a +Subproject commit 3bdc4cd0f595a6096cca4a64aa75ffa8a3503465 From 153a0049d90329dec2fc44628fdc6fc8c5f31ae4 Mon Sep 17 00:00:00 2001 From: Andrei Date: Mon, 12 Feb 2024 15:56:07 -0500 Subject: [PATCH 16/38] feat: Generic chatml Function Calling (#957) * Add demo notebook * Add initial chat handler * Update OpenAI types * Add generic chatml function calling (wip) * Update chatml generic function calling. 
* Progress on auto-tool calls * fix streaming functions * Remove print statements * fix: Suppress output from llama.cpp init and grammar creation * Add OpenAI v1 python api compatible chat completion function * Support non-streaming multi-tool calls * Format * Include function_call in response. --- .../notebooks/OpenHermesFunctionCalling.ipynb | 910 ++++++++++++++++++ llama_cpp/llama.py | 38 +- llama_cpp/llama_chat_format.py | 761 +++++++++++++-- llama_cpp/llama_types.py | 11 +- 4 files changed, 1660 insertions(+), 60 deletions(-) create mode 100644 examples/notebooks/OpenHermesFunctionCalling.ipynb diff --git a/examples/notebooks/OpenHermesFunctionCalling.ipynb b/examples/notebooks/OpenHermesFunctionCalling.ipynb new file mode 100644 index 0000000..c0de3fd --- /dev/null +++ b/examples/notebooks/OpenHermesFunctionCalling.ipynb @@ -0,0 +1,910 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"name\": \"get_article_details\",\n", + " \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"title\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"authors\": {\n", + " \"type\": \"list[str]\"\n", + " },\n", + " \"short_summary\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"date_published\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"tags\": {\n", + " \"type\": \"list[str]\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"Article\"\n", + "}\n" + ] + } + ], + "source": [ + "import json\n", + "import inspect\n", + "from typing import get_type_hints\n", + "\n", + "class Article:\n", + " pass\n", + "\n", + "class Weather:\n", + " pass\n", + "\n", + "class Directions:\n", + " pass\n", + "\n", + "def calculate_mortgage_payment(loan_amount: int, interest_rate: float, loan_term: int) -> float:\n", + " \"\"\"Get the monthly mortgage payment given an interest rate percentage.\"\"\"\n", + " \n", + " # TODO: you must implement this to actually call it later\n", + " pass\n", + "\n", + "def get_article_details(title: str, authors: list[str], short_summary: str, date_published: str, tags: list[str]) -> Article:\n", + " '''Get article details from unstructured article text.\n", + "date_published: formatted as \"MM/DD/YYYY\"'''\n", + " \n", + " # TODO: you must implement this to actually call it later\n", + " pass\n", + "\n", + "def get_weather(zip_code: str) -> Weather:\n", + " \"\"\"Get the current weather given a zip code.\"\"\"\n", + " \n", + " # TODO: you must implement this to actually call it later\n", + " pass\n", + "\n", + "def get_directions(start: str, destination: str) -> Directions:\n", + " \"\"\"Get directions from Google Directions API.\n", + "start: start address as a string including zipcode (if any)\n", + "destination: end address as a string including zipcode (if any)\"\"\"\n", + " \n", + " # TODO: you must implement this to actually call it later\n", + " pass\n", + "\n", + "def get_type_name(t):\n", + " name = str(t)\n", + " if \"list\" in name or \"dict\" in name:\n", + " return name\n", + " else:\n", + " return t.__name__\n", + "\n", + "def serialize_function_to_json(func):\n", + " signature = inspect.signature(func)\n", + " type_hints = get_type_hints(func)\n", + "\n", + " function_info = {\n", + " \"name\": func.__name__,\n", + " \"description\": func.__doc__,\n", + " \"parameters\": 
{\n", + " \"type\": \"object\",\n", + " \"properties\": {}\n", + " },\n", + " \"returns\": type_hints.get('return', 'void').__name__\n", + " }\n", + "\n", + " for name, _ in signature.parameters.items():\n", + " param_type = get_type_name(type_hints.get(name, type(None)))\n", + " function_info[\"parameters\"][\"properties\"][name] = {\"type\": param_type}\n", + "\n", + " return json.dumps(function_info, indent=2)\n", + "\n", + "print(serialize_function_to_json(get_article_details))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import xml.etree.ElementTree as ET\n", + "import re\n", + "\n", + "def extract_function_calls(completion):\n", + " completion = completion.strip()\n", + " pattern = r\"((.*?))\"\n", + " match = re.search(pattern, completion, re.DOTALL)\n", + " if not match:\n", + " return None\n", + " \n", + " multiplefn = match.group(1)\n", + " root = ET.fromstring(multiplefn)\n", + " functions = root.findall(\"functioncall\")\n", + " return [json.loads(fn.text) for fn in functions]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_hermes_prompt(prompt, functions):\n", + " functions = \"\\n\\n\".join([serialize_function_to_json(fn) for fn in functions])\n", + " prompt = f\"\"\"<|im_start|>system\n", + "You are a helpful assistant with access to the following functions:\n", + "\n", + "{functions}\n", + "\n", + "To use these functions respond with:\n", + "\n", + " {{\"name\": \"function_name\", \"arguments\": {{\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}}}} \n", + " {{\"name\": \"function_name\", \"arguments\": {{\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}}}} \n", + " ...\n", + "\n", + "\n", + "Edge cases you must handle:\n", + "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n", + "<|im_start|>user\n", + "{prompt}<|im_end|>\n", + "<|im_start|>assistant\"\"\"\n", + " return prompt" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<|im_start|>system\n", + "You are a helpful assistant with access to the following functions:\n", + "\n", + "{\n", + " \"name\": \"get_weather\",\n", + " \"description\": \"Get the current weather given a zip code.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"zip_code\": {\n", + " \"type\": \"str\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"Weather\"\n", + "}\n", + "\n", + "{\n", + " \"name\": \"calculate_mortgage_payment\",\n", + " \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"loan_amount\": {\n", + " \"type\": \"int\"\n", + " },\n", + " \"interest_rate\": {\n", + " \"type\": \"float\"\n", + " },\n", + " \"loan_term\": {\n", + " \"type\": \"int\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"float\"\n", + "}\n", + "\n", + "{\n", + " \"name\": \"get_article_details\",\n", + " \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"title\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"authors\": {\n", + " \"type\": \"list[str]\"\n", + " },\n", + " \"short_summary\": {\n", 
+ " \"type\": \"str\"\n", + " },\n", + " \"date_published\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"tags\": {\n", + " \"type\": \"list[str]\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"Article\"\n", + "}\n", + "\n", + "To use these functions respond with:\n", + "\n", + " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n", + " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n", + " ...\n", + "\n", + "\n", + "Edge cases you must handle:\n", + "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n", + "<|im_start|>user\n", + "What's the weather in 10001?<|im_end|>\n", + "<|im_start|>assistant\n", + "<|im_start|>system\n", + "You are a helpful assistant with access to the following functions:\n", + "\n", + "{\n", + " \"name\": \"get_weather\",\n", + " \"description\": \"Get the current weather given a zip code.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"zip_code\": {\n", + " \"type\": \"str\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"Weather\"\n", + "}\n", + "\n", + "{\n", + " \"name\": \"calculate_mortgage_payment\",\n", + " \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"loan_amount\": {\n", + " \"type\": \"int\"\n", + " },\n", + " \"interest_rate\": {\n", + " \"type\": \"float\"\n", + " },\n", + " \"loan_term\": {\n", + " \"type\": \"int\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"float\"\n", + "}\n", + "\n", + "{\n", + " \"name\": \"get_article_details\",\n", + " \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"title\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"authors\": {\n", + " \"type\": \"list[str]\"\n", + " },\n", + " \"short_summary\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"date_published\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"tags\": {\n", + " \"type\": \"list[str]\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"Article\"\n", + "}\n", + "\n", + "To use these functions respond with:\n", + "\n", + " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n", + " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n", + " ...\n", + "\n", + "\n", + "Edge cases you must handle:\n", + "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n", + "<|im_start|>user\n", + "Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.<|im_end|>\n", + "<|im_start|>assistant\n", + "<|im_start|>system\n", + "You are a helpful assistant with access to the following functions:\n", + "\n", + "{\n", + " \"name\": \"get_weather\",\n", + " \"description\": \"Get the current weather given a zip code.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"zip_code\": {\n", + " \"type\": \"str\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"Weather\"\n", + "}\n", + "\n", + "{\n", + " \"name\": \"calculate_mortgage_payment\",\n", + " \"description\": \"Get the monthly 
mortgage payment given an interest rate percentage.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"loan_amount\": {\n", + " \"type\": \"int\"\n", + " },\n", + " \"interest_rate\": {\n", + " \"type\": \"float\"\n", + " },\n", + " \"loan_term\": {\n", + " \"type\": \"int\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"float\"\n", + "}\n", + "\n", + "{\n", + " \"name\": \"get_article_details\",\n", + " \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"title\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"authors\": {\n", + " \"type\": \"list[str]\"\n", + " },\n", + " \"short_summary\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"date_published\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"tags\": {\n", + " \"type\": \"list[str]\"\n", + " }\n", + " }\n", + " },\n", + " \"returns\": \"Article\"\n", + "}\n", + "\n", + "To use these functions respond with:\n", + "\n", + " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n", + " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n", + " ...\n", + "\n", + "\n", + "Edge cases you must handle:\n", + "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n", + "<|im_start|>user\n", + "What's the current exchange rate for USD to EUR?<|im_end|>\n", + "<|im_start|>assistant\n" + ] + } + ], + "source": [ + "prompts = [\n", + " \"What's the weather in 10001?\",\n", + " \"Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.\",\n", + " \"What's the current exchange rate for USD to EUR?\"\n", + "]\n", + "functions = [get_weather, calculate_mortgage_payment, get_article_details]\n", + "\n", + "for prompt in prompts:\n", + " print(generate_hermes_prompt(prompt, functions))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ggml_init_cublas: GGML_CUDA_FORCE_MMQ: no\n", + "ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes\n", + "ggml_init_cublas: found 1 CUDA devices:\n", + " Device 0: NVIDIA GeForce RTX 2060, compute capability 7.5\n", + "llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from ../../models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf (version GGUF V3 (latest))\n", + "llama_model_loader: - tensor 0: token_embd.weight q4_K [ 4096, 32002, 1, 1 ]\n", + "llama_model_loader: - tensor 1: blk.0.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 2: blk.0.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 3: blk.0.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 4: blk.0.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 5: blk.0.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 6: blk.0.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 7: blk.0.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 8: blk.0.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 9: blk.0.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 10: 
blk.1.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 11: blk.1.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 12: blk.1.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 13: blk.1.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 14: blk.1.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 15: blk.1.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 16: blk.1.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 17: blk.1.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 18: blk.1.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 19: blk.2.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 20: blk.2.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 21: blk.2.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 22: blk.2.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 23: blk.2.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 24: blk.2.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 25: blk.2.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 26: blk.2.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 27: blk.2.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 28: blk.3.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 29: blk.3.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 30: blk.3.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 31: blk.3.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 32: blk.3.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 33: blk.3.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 34: blk.3.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 35: blk.3.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 36: blk.3.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 37: blk.4.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 38: blk.4.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 39: blk.4.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 40: blk.4.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 41: blk.4.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 42: blk.4.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 43: blk.4.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 44: blk.4.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 45: blk.4.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 46: blk.5.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 47: blk.5.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 48: blk.5.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 49: blk.5.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 50: blk.5.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + 
"llama_model_loader: - tensor 51: blk.5.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 52: blk.5.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 53: blk.5.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 54: blk.5.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 55: blk.6.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 56: blk.6.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 57: blk.6.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 58: blk.6.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 59: blk.6.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 60: blk.6.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 61: blk.6.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 62: blk.6.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 63: blk.6.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 64: blk.7.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 65: blk.7.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 66: blk.7.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 67: blk.7.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 68: blk.7.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 69: blk.7.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 70: blk.7.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 71: blk.7.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 72: blk.7.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 73: blk.8.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 74: blk.8.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 75: blk.8.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 76: blk.8.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 77: blk.8.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 78: blk.8.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 79: blk.8.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 80: blk.8.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 81: blk.8.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 82: blk.9.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 83: blk.9.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 84: blk.9.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 85: blk.9.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 86: blk.9.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 87: blk.9.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 88: blk.9.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 89: blk.9.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 90: blk.9.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 91: blk.10.attn_q.weight q4_K [ 4096, 4096, 1, 
1 ]\n", + "llama_model_loader: - tensor 92: blk.10.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 93: blk.10.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 94: blk.10.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 95: blk.10.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 96: blk.10.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 97: blk.10.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 98: blk.10.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 99: blk.10.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 100: blk.11.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 101: blk.11.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 102: blk.11.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 103: blk.11.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 104: blk.11.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 105: blk.11.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 106: blk.11.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 107: blk.11.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 108: blk.11.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 109: blk.12.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 110: blk.12.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 111: blk.12.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 112: blk.12.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 113: blk.12.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 114: blk.12.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 115: blk.12.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 116: blk.12.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 117: blk.12.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 118: blk.13.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 119: blk.13.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 120: blk.13.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 121: blk.13.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 122: blk.13.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 123: blk.13.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 124: blk.13.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 125: blk.13.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 126: blk.13.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 127: blk.14.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 128: blk.14.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 129: blk.14.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 130: blk.14.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 131: blk.14.ffn_gate.weight q4_K [ 4096, 14336, 
1, 1 ]\n", + "llama_model_loader: - tensor 132: blk.14.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 133: blk.14.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 134: blk.14.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 135: blk.14.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 136: blk.15.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 137: blk.15.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 138: blk.15.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 139: blk.15.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 140: blk.15.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 141: blk.15.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 142: blk.15.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 143: blk.15.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 144: blk.15.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 145: blk.16.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 146: blk.16.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 147: blk.16.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 148: blk.16.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 149: blk.16.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 150: blk.16.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 151: blk.16.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 152: blk.16.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 153: blk.16.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 154: blk.17.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 155: blk.17.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 156: blk.17.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 157: blk.17.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 158: blk.17.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 159: blk.17.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 160: blk.17.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 161: blk.17.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 162: blk.17.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 163: blk.18.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 164: blk.18.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 165: blk.18.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 166: blk.18.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 167: blk.18.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 168: blk.18.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 169: blk.18.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 170: blk.18.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 171: blk.18.ffn_norm.weight f32 [ 4096, 
1, 1, 1 ]\n", + "llama_model_loader: - tensor 172: blk.19.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 173: blk.19.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 174: blk.19.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 175: blk.19.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 176: blk.19.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 177: blk.19.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 178: blk.19.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 179: blk.19.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 180: blk.19.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 181: blk.20.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 182: blk.20.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 183: blk.20.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 184: blk.20.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 185: blk.20.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 186: blk.20.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 187: blk.20.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 188: blk.20.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 189: blk.20.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 190: blk.21.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 191: blk.21.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 192: blk.21.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 193: blk.21.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 194: blk.21.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 195: blk.21.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 196: blk.21.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 197: blk.21.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 198: blk.21.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 199: blk.22.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 200: blk.22.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 201: blk.22.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 202: blk.22.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 203: blk.22.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 204: blk.22.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 205: blk.22.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 206: blk.22.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 207: blk.22.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 208: blk.23.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 209: blk.23.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 210: blk.23.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 211: blk.23.attn_output.weight q4_K [ 
4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 212: blk.23.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 213: blk.23.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 214: blk.23.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 215: blk.23.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 216: blk.23.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 217: blk.24.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 218: blk.24.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 219: blk.24.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 220: blk.24.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 221: blk.24.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 222: blk.24.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 223: blk.24.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 224: blk.24.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 225: blk.24.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 226: blk.25.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 227: blk.25.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 228: blk.25.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 229: blk.25.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 230: blk.25.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 231: blk.25.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 232: blk.25.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 233: blk.25.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 234: blk.25.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 235: blk.26.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 236: blk.26.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 237: blk.26.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 238: blk.26.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 239: blk.26.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 240: blk.26.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 241: blk.26.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 242: blk.26.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 243: blk.26.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 244: blk.27.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 245: blk.27.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 246: blk.27.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 247: blk.27.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 248: blk.27.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 249: blk.27.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 250: blk.27.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 251: 
blk.27.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 252: blk.27.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 253: blk.28.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 254: blk.28.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 255: blk.28.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 256: blk.28.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 257: blk.28.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 258: blk.28.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 259: blk.28.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 260: blk.28.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 261: blk.28.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 262: blk.29.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 263: blk.29.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 264: blk.29.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 265: blk.29.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 266: blk.29.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 267: blk.29.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 268: blk.29.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 269: blk.29.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 270: blk.29.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 271: blk.30.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 272: blk.30.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 273: blk.30.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 274: blk.30.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 275: blk.30.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 276: blk.30.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 277: blk.30.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 278: blk.30.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 279: blk.30.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 280: blk.31.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 281: blk.31.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 282: blk.31.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n", + "llama_model_loader: - tensor 283: blk.31.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 284: blk.31.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 285: blk.31.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n", + "llama_model_loader: - tensor 286: blk.31.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 287: blk.31.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 288: blk.31.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 289: output_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 290: output.weight q6_K [ 4096, 32002, 1, 1 ]\n", + "llama_model_loader: - kv 0: 
general.architecture str = llama\n", + "llama_model_loader: - kv 1: general.name str = teknium_openhermes-2.5-mistral-7b\n", + "llama_model_loader: - kv 2: llama.context_length u32 = 32768\n", + "llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n", + "llama_model_loader: - kv 4: llama.block_count u32 = 32\n", + "llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336\n", + "llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n", + "llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n", + "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8\n", + "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n", + "llama_model_loader: - kv 10: llama.rope.freq_base f32 = 10000.000000\n", + "llama_model_loader: - kv 11: general.file_type u32 = 15\n", + "llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n", + "llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32002] = [\"\", \"\", \"\", \"<0x00>\", \"<...\n", + "llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32002] = [0.000000, 0.000000, 0.000000, 0.0000...\n", + "llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32002] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n", + "llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n", + "llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 32000\n", + "llama_model_loader: - kv 18: tokenizer.ggml.padding_token_id u32 = 0\n", + "llama_model_loader: - kv 19: general.quantization_version u32 = 2\n", + "llama_model_loader: - type f32: 65 tensors\n", + "llama_model_loader: - type q4_K: 193 tensors\n", + "llama_model_loader: - type q6_K: 33 tensors\n", + "llm_load_vocab: special tokens definition check successful ( 261/32002 ).\n", + "llm_load_print_meta: format = GGUF V3 (latest)\n", + "llm_load_print_meta: arch = llama\n", + "llm_load_print_meta: vocab type = SPM\n", + "llm_load_print_meta: n_vocab = 32002\n", + "llm_load_print_meta: n_merges = 0\n", + "llm_load_print_meta: n_ctx_train = 32768\n", + "llm_load_print_meta: n_embd = 4096\n", + "llm_load_print_meta: n_head = 32\n", + "llm_load_print_meta: n_head_kv = 8\n", + "llm_load_print_meta: n_layer = 32\n", + "llm_load_print_meta: n_rot = 128\n", + "llm_load_print_meta: n_gqa = 4\n", + "llm_load_print_meta: f_norm_eps = 0.0e+00\n", + "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n", + "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n", + "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n", + "llm_load_print_meta: n_ff = 14336\n", + "llm_load_print_meta: rope scaling = linear\n", + "llm_load_print_meta: freq_base_train = 10000.0\n", + "llm_load_print_meta: freq_scale_train = 1\n", + "llm_load_print_meta: n_yarn_orig_ctx = 32768\n", + "llm_load_print_meta: rope_finetuned = unknown\n", + "llm_load_print_meta: model type = 7B\n", + "llm_load_print_meta: model ftype = mostly Q4_K - Medium\n", + "llm_load_print_meta: model params = 7.24 B\n", + "llm_load_print_meta: model size = 4.07 GiB (4.83 BPW) \n", + "llm_load_print_meta: general.name = teknium_openhermes-2.5-mistral-7b\n", + "llm_load_print_meta: BOS token = 1 ''\n", + "llm_load_print_meta: EOS token = 32000 '<|im_end|>'\n", + "llm_load_print_meta: UNK token = 0 ''\n", + "llm_load_print_meta: PAD token = 0 ''\n", + "llm_load_print_meta: LF token = 13 '<0x0A>'\n", + "llm_load_tensors: ggml ctx size = 0.11 MiB\n", + "llm_load_tensors: using CUDA for GPU acceleration\n", + "llm_load_tensors: mem required = 70.42 MiB\n", + "llm_load_tensors: offloading 32 repeating 
layers to GPU\n", + "llm_load_tensors: offloading non-repeating layers to GPU\n", + "llm_load_tensors: offloaded 35/35 layers to GPU\n", + "llm_load_tensors: VRAM used: 4095.06 MiB\n", + "...............................................................................................\n", + "llama_new_context_with_model: n_ctx = 2048\n", + "llama_new_context_with_model: freq_base = 10000.0\n", + "llama_new_context_with_model: freq_scale = 1\n", + "llama_kv_cache_init: offloading v cache to GPU\n", + "llama_kv_cache_init: offloading k cache to GPU\n", + "llama_kv_cache_init: VRAM kv self = 256.00 MiB\n", + "llama_new_context_with_model: kv self size = 256.00 MiB\n", + "llama_build_graph: non-view tensors processed: 740/740\n", + "llama_new_context_with_model: compute buffer total size = 159.07 MiB\n", + "llama_new_context_with_model: VRAM scratch buffer: 156.00 MiB\n", + "llama_new_context_with_model: total VRAM used: 4507.07 MiB (model: 4095.06 MiB, context: 412.00 MiB)\n" + ] + } + ], + "source": [ + "import llama_cpp\n", + "\n", + "llama = llama_cpp.Llama(model_path=\"../../models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf\", n_gpu_layers=-1, n_ctx=2048, verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'name': 'get_weather', 'arguments': {'zip_code': '10001'}}]\n", + "====================================================================================================\n", + "[{'name': 'calculate_mortgage_payment', 'arguments': {'loan_amount': 200000, 'interest_rate': 0.04, 'loan_term': 30}}]\n", + "====================================================================================================\n", + "Unfortunately, I do not have a built-in function to check currency exchange rates. 
However, you can use third-party APIs or websites like Google Finance or XE to get this information.\n", + "====================================================================================================\n" + ] + } + ], + "source": [ + "prompts = [\n", + " \"What's the weather in 10001?\",\n", + " \"Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.\",\n", + " \"What's the current exchange rate for USD to EUR?\"\n", + "]\n", + "functions = [get_weather, calculate_mortgage_payment, get_article_details]\n", + "\n", + "for prompt in prompts:\n", + " prompt = generate_hermes_prompt(prompt, functions)\n", + " completion = llama.create_completion(prompt, max_tokens=-1)[\"choices\"][0][\"text\"]\n", + " function_calls = extract_function_calls(completion)\n", + " if function_calls:\n", + " print(function_calls)\n", + " else:\n", + " print(completion.strip())\n", + " print(\"=\"*100)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "get_weather\n", + "{'zip_code': '05751'}\n", + "====================================================================================================\n", + "get_weather\n", + "{'zip_code': '05751'}\n", + "get_weather\n", + "{'zip_code': '07030'}\n", + "calculate_mortgage_payment\n", + "{'loan_amount': 250000, 'interest_rate': 4.18, 'loan_term': 30}\n", + "====================================================================================================\n", + "I don't have a function to get exchange rates, but I can provide some resources where you can find this information. You can check websites like Google Finance, XE.com, or Yahoo Finance for up-to-date currency exchange rates.\n", + "====================================================================================================\n" + ] + } + ], + "source": [ + "prompts = [\n", + " \"What's the weather in 05751?\",\n", + " \"I'm planning a trip to Killington, Vermont (05751) from Hoboken, NJ (07030). 
Can you get me weather for both locations and directions?\",\n", + " \"What's the current exchange rate for USD to EUR?\"\n", + "]\n", + "\n", + "for prompt in prompts:\n", + " completion = llama.create_completion(generate_hermes_prompt(prompt, functions), max_tokens=-1)[\"choices\"][0][\"text\"]\n", + " function_calls = extract_function_calls(completion)\n", + "\n", + " if function_calls:\n", + " for function in function_calls:\n", + " print(function[\"name\"])\n", + " print(function[\"arguments\"])\n", + " else:\n", + " print(completion.strip())\n", + "\n", + " print(\"=\"*100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5+" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 3dcb4b5..3efd95d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -50,6 +50,9 @@ from ._internals import ( _LlamaSamplingContext, # type: ignore ) from ._logger import set_verbose +from ._utils import ( + suppress_stdout_stderr +) class Llama: @@ -182,7 +185,8 @@ class Llama: self.numa = numa if not Llama.__backend_initialized: - llama_cpp.llama_backend_init(self.numa) + with suppress_stdout_stderr(disable=verbose): + llama_cpp.llama_backend_init(self.numa) Llama.__backend_initialized = True self.model_path = model_path @@ -1567,6 +1571,38 @@ class Llama: logit_bias=logit_bias, ) + def create_chat_completion_openai_v1( + self, + *args: Any, + **kwargs: Any, + ): + """Generate a chat completion with return type based on the the OpenAI v1 API. + + OpenAI python package is required to use this method. + + You can install it with `pip install openai`. + + Args: + *args: Positional arguments to pass to create_chat_completion. + **kwargs: Keyword arguments to pass to create_chat_completion. + + Returns: + Generated chat completion or a stream of chat completion chunks. + """ + try: + from openai.types.chat import ChatCompletion, ChatCompletionChunk + stream = kwargs.get("stream", False) # type: ignore + assert isinstance(stream, bool) + if stream: + return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore + else: + return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore + except ImportError: + raise ImportError( + "To use create_chat_completion_openai_v1, you must install the openai package." + "You can install it with `pip install openai`." + ) + def __getstate__(self): return dict( model_path=self.model_path, diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 2e42041..af60d5f 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -31,6 +31,7 @@ MISTRAL_INSTRUCT_EOS_TOKEN = "" ### Chat Completion Handler ### + class LlamaChatCompletionHandler(Protocol): """Base Protocol for a llama chat completion handler. @@ -77,8 +78,7 @@ class LlamaChatCompletionHandler(Protocol): ) -> Union[ llama_types.CreateChatCompletionResponse, Iterator[llama_types.CreateChatCompletionStreamResponse], - ]: - ... + ]: ... 
class LlamaChatCompletionHandlerNotFoundException(Exception): @@ -134,6 +134,7 @@ def register_chat_completion_handler(name: str): ### Chat Formatter ### + @dataclasses.dataclass class ChatFormatterResponse: """Dataclass that stores completion parameters for a given chat format and @@ -157,8 +158,7 @@ class ChatFormatter(Protocol): *, messages: List[llama_types.ChatCompletionRequestMessage], **kwargs: Any, - ) -> ChatFormatterResponse: - ... + ) -> ChatFormatterResponse: ... class Jinja2ChatFormatter(ChatFormatter): @@ -195,7 +195,7 @@ class Jinja2ChatFormatter(ChatFormatter): eos_token=self.eos_token, bos_token=self.bos_token, raise_exception=raise_exception, - add_generation_prompt=self.add_generation_prompt + add_generation_prompt=self.add_generation_prompt, ) return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token]) @@ -255,11 +255,13 @@ def _convert_text_completion_chunks_to_chat( "choices": [ { "index": 0, - "delta": { - "content": chunk["choices"][0]["text"], - } - if chunk["choices"][0]["finish_reason"] is None - else {}, + "delta": ( + { + "content": chunk["choices"][0]["text"], + } + if chunk["choices"][0]["finish_reason"] is None + else {} + ), "finish_reason": chunk["choices"][0]["finish_reason"], } ], @@ -338,10 +340,12 @@ def chat_formatter_to_chat_completion_handler( # create grammar from json schema if "schema" in response_format: grammar = llama_grammar.LlamaGrammar.from_json_schema( - json.dumps(response_format["schema"]) + json.dumps(response_format["schema"]), verbose=llama.verbose ) except Exception as e: - grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) completion_or_chunks = llama.create_completion( prompt=prompt, @@ -452,7 +456,9 @@ def hf_tokenizer_config_to_chat_completion_handler( tokenizer_config: Dict[str, Any], add_generation_prompt: bool = True, ) -> LlamaChatCompletionHandler: - chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config, add_generation_prompt=add_generation_prompt) + chat_formatter = hf_tokenizer_config_to_chat_formatter( + tokenizer_config, add_generation_prompt=add_generation_prompt + ) return chat_formatter_to_chat_completion_handler(chat_formatter) @@ -463,11 +469,12 @@ def guess_chat_format_from_gguf_metadata(metadata: Dict[str, str]) -> Optional[s if metadata["tokenizer.chat_template"] == CHATML_CHAT_TEMPLATE: return "chatml" - if metadata["tokenizer.chat_template"] == MISTRAL_INSTRUCT_CHAT_TEMPLATE: + if metadata["tokenizer.chat_template"] == MISTRAL_INSTRUCT_CHAT_TEMPLATE: return "mistral-instruct" return None + ### Utility functions for formatting chat prompts ### # TODO: Replace these with jinja2 templates @@ -916,9 +923,17 @@ def format_mistral_instruct( stop = eos prompt = bos for message in messages: - if message["role"] == "user" and message["content"] is not None and isinstance(message["content"], str): + if ( + message["role"] == "user" + and message["content"] is not None + and isinstance(message["content"], str) + ): prompt += "[INST] " + message["content"] - elif message["role"] == "assistant" and message["content"] is not None and isinstance(message["content"], str): + elif ( + message["role"] == "assistant" + and message["content"] is not None + and isinstance(message["content"], str) + ): prompt += " [/INST]" + message["content"] + eos prompt += " [/INST]" return ChatFormatterResponse(prompt=prompt, stop=stop) @@ -958,6 +973,7 @@ def format_openchat( _prompt = 
_format_chatml(system_message, _messages, _sep) return ChatFormatterResponse(prompt=_prompt, stop=_sep) + # Chat format for Saiga models, see more details and available models: # https://huggingface.co/collections/IlyaGusev/saiga2-saigamistral-6505d4ccc3d1e53166b636cd @register_chat_format("saiga") @@ -979,8 +995,10 @@ def format_saiga( _prompt += "bot" return ChatFormatterResponse(prompt=_prompt.strip()) + # Tricky chat formats that require custom chat handlers + @register_chat_completion_handler("functionary") def functionary_chat_handler( llama: llama.Llama, @@ -1253,7 +1271,8 @@ def functionary_chat_handler( json.dumps(function_body) ) grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.json_schema_to_gbnf(json.dumps(function_body)) + llama_grammar.json_schema_to_gbnf(json.dumps(function_body)), + verbose=llama.verbose, ) print(grammar_text) except Exception as e: @@ -1264,11 +1283,14 @@ def functionary_chat_handler( print(e) with suppress_stdout_stderr(disable=llama.verbose): grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF + llama_grammar.JSON_GBNF, + verbose=llama.verbose, ) else: with suppress_stdout_stderr(disable=llama.verbose): - grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) completion: llama_types.Completion = llama.create_completion( prompt=new_prompt, @@ -1365,11 +1387,13 @@ def functionary_v1_v2_chat_handler( **kwargs, # type: ignore ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" - + tokenizer = llama.tokenizer_ - assert hasattr(tokenizer, "hf_tokenizer"), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" + assert hasattr( + tokenizer, "hf_tokenizer" + ), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" from transformers import AutoTokenizer - + if "<|START_OF_FUNCTION_CALL|>" in tokenizer.hf_tokenizer.additional_special_tokens: version = "v1" END_SYSTEM_TOKEN = "<|END_OF_SYSTEM|>" @@ -1513,13 +1537,16 @@ def functionary_v1_v2_chat_handler( "name" ] = f"functions.{message['function_call']['name']}" all_messages.append(message) - + if version == "v1": suffix = "assistant:\n" else: suffix = "<|from|>assistant\n<|recipient|>" - - return tokenizer.hf_tokenizer.apply_chat_template(all_messages, tokenize=False) + suffix + + return ( + tokenizer.hf_tokenizer.apply_chat_template(all_messages, tokenize=False) + + suffix + ) if tools is not None: functions = [tool["function"] for tool in tools if tool["type"] == "function"] @@ -1529,8 +1556,10 @@ def functionary_v1_v2_chat_handler( tool_choice if isinstance(tool_choice, str) else tool_choice["function"] ) - prompt = prepare_messages_for_inference(messages, tokenizer, version, functions, tools) - + prompt = prepare_messages_for_inference( + messages, tokenizer, version, functions, tools + ) + # If no tools/functions are provided if function_call is None and (functions is None or len(functions) == 0): if version == "v1": @@ -1538,7 +1567,7 @@ def functionary_v1_v2_chat_handler( else: stop = STOP_TOKEN prompt += "all\n<|content|>" - + completion_or_completion_chunks = llama.create_completion( prompt=prompt, temperature=temperature, @@ -1561,9 +1590,9 @@ def functionary_v1_v2_chat_handler( grammar=grammar, ) return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore - + assert stream is False # TODO: support stream mode - + def get_grammar(function_call): function_body = None for function in functions or []: @@ -1574,7 +1603,7 @@ def functionary_v1_v2_chat_handler( if tool["type"] == "function" and tool["function"]["name"] == function_call: function_body = tool["function"]["parameters"] break - + try: with suppress_stdout_stderr(disable=llama.verbose): grammar_text = llama_grammar.json_schema_to_gbnf( @@ -1592,11 +1621,11 @@ def functionary_v1_v2_chat_handler( print(e) with suppress_stdout_stderr(disable=llama.verbose): grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF + llama_grammar.JSON_GBNF, verbose=llama.verbose ) - + return grammar - + def create_completion(stop): completion: llama_types.Completion = llama.create_completion( prompt=prompt, @@ -1619,11 +1648,11 @@ def functionary_v1_v2_chat_handler( logits_processor=logits_processor, grammar=grammar, ) - + return completion - + function_calls, function_bodies = [], [] - + if version == "v1": # If no or "auto" tool_choice/function_call if function_call is None or ( @@ -1632,7 +1661,9 @@ def functionary_v1_v2_chat_handler( stops = ["\n", END_ASSISTANT_TOKEN] # If tool_choice/function_call is "none" elif isinstance(function_call, str) and function_call == "none": - prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + prompt = prepare_messages_for_inference( + messages, tokenizer, version, [], [] + ) stops = END_ASSISTANT_TOKEN # If tool_choice/function_call is provided elif 
isinstance(function_call, dict): @@ -1647,14 +1678,27 @@ def functionary_v1_v2_chat_handler( completion = create_completion(stop=stops) completion_text = completion["choices"][0]["text"] - + # If the generation does not involve a function call - if START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN not in completion_text: + if ( + START_FUNCTION_CALL_TOKEN not in prompt + and START_FUNCTION_CALL_TOKEN not in completion_text + ): return _convert_completion_to_chat(completion, stream=stream) # type: ignore # If the generation involves a function call in completion, generate the parameters - elif START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN in completion_text: - prompt += completion_text.replace(f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN) + "\n" - function_calls.append(completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()) + elif ( + START_FUNCTION_CALL_TOKEN not in prompt + and START_FUNCTION_CALL_TOKEN in completion_text + ): + prompt += ( + completion_text.replace( + f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN + ) + + "\n" + ) + function_calls.append( + completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip() + ) grammar = get_grammar(function_calls[-1]) completion = create_completion(stop=END_FUNCTION_CALL_TOKEN) function_bodies.append(completion["choices"][0]["text"].strip()) @@ -1672,7 +1716,10 @@ def functionary_v1_v2_chat_handler( stops = CONTENT_TOKEN # If tool_choice/function_call is "none" elif isinstance(function_call, str) and function_call == "none": - prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + "all\n<|content|>" + prompt = ( + prepare_messages_for_inference(messages, tokenizer, version, [], []) + + "all\n<|content|>" + ) stops = STOP_TOKEN # If tool_choice/function_call is provided elif isinstance(function_call, dict): @@ -1684,15 +1731,17 @@ def functionary_v1_v2_chat_handler( else: prompt = prompt stops = STOP_TOKEN - + completion = create_completion(stop=stops) completion_text = completion["choices"][0]["text"] - + # If the generation does not involve a function call - if prompt.endswith("all\n<|content|>") and not completion_text.startswith("all"): + if prompt.endswith("all\n<|content|>") and not completion_text.startswith( + "all" + ): return _convert_completion_to_chat(completion, stream=stream) # type: ignore # Generate model response if the model decides not to call any function - elif (prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all")): + elif prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all"): prompt += completion_text + CONTENT_TOKEN completion = create_completion(stop=STOP_TOKEN) return _convert_completion_to_chat(completion, stream=stream) # type: ignore @@ -1704,7 +1753,7 @@ def functionary_v1_v2_chat_handler( function_bodies.append(completion["choices"][0]["text"].strip()) prompt += f"{function_calls[-1]}\n{CONTENT_TOKEN}{function_bodies[-1]}" grammar = None - + # Try to generate the beginning of next turn # If empty completion, break from loop next_turn_completion_text = create_completion( @@ -1718,17 +1767,21 @@ def functionary_v1_v2_chat_handler( else: function_bodies.append(completion_text.strip()) break - + assert "usage" in completion assert len(function_calls) > 0 assert len(function_calls) == len(function_bodies) - + tool_calls = [] for function_call, function_body in zip(function_calls, function_bodies): tool_calls.append( { - "id": "call_" + "".join( - 
[random.choice(string.ascii_letters + string.digits) for _ in range(24)] + "id": "call_" + + "".join( + [ + random.choice(string.ascii_letters + string.digits) + for _ in range(24) + ] ), "type": "function", "function": { @@ -1924,7 +1977,9 @@ class Llava15ChatHandler: json.dumps(response_format["schema"]) ) except Exception as e: - grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF) + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF + ) return _convert_completion_to_chat( llama.create_completion( @@ -1950,3 +2005,601 @@ class Llava15ChatHandler: ), stream=stream, ) + + +@register_chat_completion_handler("chatml-function-calling") +def chatml_function_calling( + llama: llama.Llama, + messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + min_p: float = 0.05, + typical_p: float = 1.0, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, + max_tokens: Optional[int] = None, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[llama.LogitsProcessorList] = None, + grammar: Optional[llama.LlamaGrammar] = None, + **kwargs, # type: ignore +) -> Union[ + llama_types.CreateChatCompletionResponse, + Iterator[llama_types.CreateChatCompletionStreamResponse], +]: + function_calling_template = ( + "{% for message in messages %}" + "<|im_start|>{{ message.role }}\n" + # System message + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% if tool_calls %}" + "\n\nYou have access to the following functions:\n" + "{% for tool in tools %}" + "\nfunctions.{{ tool.function.name }}:\n" + "{{ tool.function.parameters | tojson }}" + "\n{% endfor %}" + "\n\nYou can respond to users messages with either a single message or one or more function calls." 
+ "\n\nTo respond with a message begin the message with 'message:', use the following format:" + "\n\nmessage:" + "\n" + "\n\nTo respond with one or more function calls begin the message with 'functions.:', use the following format:" + "\n\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "\nfunctions.:" + '\n{ "arg1": "value1", "arg2": "value2" }' + "{% endif %}" + "\n<|im_end|>\n" + "{% endif %}" + # User message + "{% if message.role == 'user' %}" + "{{ message.content }}" + "\n<|im_end|>\n" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' %}" + ## Reglar message + "{% if message.content and message.content | length > 0 %}" + "message:\n" + "{{ message.content }}" + "\n<|im_end|>\n" + "{% endif %}" + ## Function calls + "{% if message.tool_calls %}" + "{% for tool_call in message.tool_calls %}" + "functions.{{ tool_call.function.name }}:\n" + "{{ tool_call.function.arguments }}" + "{% endfor %}" + "\n<|im_end|>\n" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + ) + template_renderer = jinja2.Environment( + loader=jinja2.BaseLoader(), + autoescape=jinja2.select_autoescape(["html", "xml"]), + undefined=jinja2.StrictUndefined, + ).from_string(function_calling_template) + + # Convert legacy functions to tools + if functions is not None: + tools = [ + { + "type": "function", + "function": function, + } + for function in functions + ] + + # Convert legacy function_call to tool_choice + if function_call is not None: + if isinstance(function_call, str) and ( + function_call == "none" or function_call == "auto" + ): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = { + "type": "function", + "function": { + "name": function_call["name"], + }, + } + + # Case 1: No tool choice by user + if ( + tool_choice is None + or (isinstance(tool_choice, str) and tool_choice == "none") + or tools is None + or len(tools) == 0 + ): + prompt = template_renderer.render( + messages=messages, + tools=[], + tool_calls=None, + ) + if response_format is not None and response_format["type"] == "json_object": + try: + grammar = ( + llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(response_format["schema"]) + ) + if "schema" in response_format + else None + ) + except Exception as e: + if llama.verbose: + print( + "Failed to parse response format as JSON schema, falling back to default grammar" + ) + print(e) + grammar = ( + llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF) + if grammar is None + else grammar + ) + return _convert_completion_to_chat( + llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=stop, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + ), + stream=stream, + ) + + def _convert_completion_to_chat_function( + tool_name: str, + completion_or_chunks: Union[ + llama_types.CreateCompletionResponse, + Iterator[llama_types.CreateCompletionStreamResponse], + ], + stream: bool, + ): + if not stream: + completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore + assert "usage" in completion + tool_id = "call_" + "_0_" + tool_name + "_" + completion["id"] + # TODO: Fix for legacy function calls + 
chat_completion: llama_types.CreateChatCompletionResponse = { + "id": "chat" + completion["id"], + "object": "chat.completion", + "created": completion["created"], + "model": completion["model"], + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": None, + "function_call": { + "name": tool_name, + "arguments": completion["choices"][0]["text"], + }, + "tool_calls": [ + { + "id": tool_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": completion["choices"][0]["text"], + }, + } + ], + }, + "finish_reason": "tool_calls", + } + ], + "usage": completion["usage"], + } + return chat_completion + else: + chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore + + def _stream_response_to_function_stream( + chunks: Iterator[llama_types.CreateCompletionStreamResponse], + ) -> Iterator[llama_types.CreateChatCompletionStreamResponse]: + # blank first message + first = True + id_ = None + created = None + model = None + tool_id = None + for chunk in chunks: + if first: + id_ = "chat" + chunk["id"] + created = chunk["created"] + model = chunk["model"] + tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"] + yield { + "id": id_, + "object": "chat.completion.chunk", + "created": created, + "model": model, + "choices": [ + { + "index": 0, + "finish_reason": None, + "logprobs": None, + "delta": { + "role": "assistant", + "content": None, + "function_call": None, + "tool_calls": None, + }, + } + ], + } + yield { + "id": "chat" + chunk["id"], + "object": "chat.completion.chunk", + "created": chunk["created"], + "model": chunk["model"], + "choices": [ + { + "index": 0, + "finish_reason": None, + "logprobs": None, + "delta": { + "role": None, + "content": None, + "function_call": { + "name": tool_name, + "arguments": chunk["choices"][0]["text"], + }, + "tool_calls": [ + { + "index": 0, + "id": tool_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": "", + }, + } + ], + }, + } + ], + } + first = False + continue + assert tool_id is not None + yield { + "id": "chat" + chunk["id"], + "object": "chat.completion.chunk", + "created": chunk["created"], + "model": chunk["model"], + "choices": [ + { + "index": 0, + "finish_reason": None, + "logprobs": None, + "delta": { + "role": None, + "content": None, + "function_call": { + "name": tool_name, + "arguments": chunk["choices"][0]["text"], + }, + "tool_calls": [ + { + "index": 0, + "id": tool_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": chunk["choices"][0][ + "text" + ], + }, + } + ], + }, + } + ], + } + + if id_ is not None and created is not None and model is not None: + yield { + "id": id_, + "object": "chat.completion.chunk", + "created": created, + "model": model, + "choices": [ + { + "index": 0, + "finish_reason": "tool_calls", + "logprobs": None, + "delta": { + "role": None, + "content": None, + "function_call": None, + "tool_calls": None, + }, + } + ], + } + + return _stream_response_to_function_stream(chunks) + + # Case 2: Tool choice by user + if isinstance(tool_choice, dict): + tool_name = tool_choice["function"]["name"] + tool = next( + (tool for tool in tools if tool["function"]["name"] == tool_name), None + ) + if tool is None: + raise ValueError(f"Tool with name '{tool_name}' not found in tools") + prompt = template_renderer.render( + messages=messages, + tools=tools, + tool_calls=True, + ) + prompt += f"functions.{tool_name}:\n" + try: + grammar = llama_grammar.LlamaGrammar.from_json_schema( + 
json.dumps(tool["function"]["parameters"]), verbose=llama.verbose + ) + except Exception as e: + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) + if llama.verbose: + print( + "Failed to parse function body as JSON schema, falling back to default grammar" + ) + print(e) + completion_or_chunks = llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=stop, + max_tokens=max_tokens, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + ) + return _convert_completion_to_chat_function( + tool_name, completion_or_chunks, stream + ) + + # Case 3: Automatic tool choice + assert isinstance(tool_choice, str) and tool_choice == "auto" + function_names = " | ".join( + [f'''"functions.{tool['function']['name']}:"''' for tool in tools] + ) + initial_gbnf_tool_grammar = ( + """root ::= functions | "message:"\n""" + f"""functions ::= {function_names}\n""" + ) + follow_up_gbnf_tool_grammar = ( + """root ::= functions | "<|im_end|>"\n""" + f"""functions ::= {function_names}\n""" + ) + prompt = template_renderer.render( + messages=messages, + tools=tools, + tool_calls=True, + ) + completion_or_chunks = llama.create_completion( + prompt=prompt, + temperature=0, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=False, + stop=[":"], + max_tokens=None, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=llama_grammar.LlamaGrammar.from_string( + initial_gbnf_tool_grammar, verbose=llama.verbose + ), + ) + completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore + text = completion["choices"][0]["text"] + if "message" in text: + return _convert_completion_to_chat( + llama.create_completion( + prompt=prompt + "message:\n", + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=stream, + stop=["<|im_end|>"], + max_tokens=None, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=llama_grammar.LlamaGrammar.from_string( + follow_up_gbnf_tool_grammar, verbose=llama.verbose + ), + ), + stream=stream, + ) + + # One or more function calls + tool_name = text[len("functions.") :] + tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None) + if not stream: + completions = [] + completions_tool_name = [] + while tool is not None: + prompt += f"functions.{tool_name}:\n" + try: + grammar = llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(tool["function"]["parameters"]), verbose=llama.verbose + ) + except Exception as e: + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) + if llama.verbose: + print( + "Failed to parse function body as JSON schema, falling back to default grammar" + ) + print(e) + completion_or_chunks 
= llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=False, + stop=stop, + max_tokens=None, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=grammar, + ) + completions.append(completion_or_chunks) + completions_tool_name.append(tool_name) + prompt += completion_or_chunks["choices"][0]["text"] + prompt += "\n" + + response = llama.create_completion( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + typical_p=typical_p, + stream=False, + stop=stop, + max_tokens=None, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + repeat_penalty=repeat_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + grammar=llama_grammar.LlamaGrammar.from_string( + follow_up_gbnf_tool_grammar, verbose=llama.verbose + ), + ) + + tool_name = response["choices"][0]["text"][len("functions.") :] + tool = next( + (tool for tool in tools if tool["function"]["name"] == tool_name), None + ) + + # Merge completions + function_call = { + "function_call": { + "name": tool_name, + "arguments": completions[0]["choices"][0]["text"], + } + } if len(completions) == 1 else {} + return { + "id": "chat" + completion["id"], + "object": "chat.completion", + "created": completion["created"], + "model": completion["model"], + "choices": [ + { + "finish_reason": "tool_calls", + "index": 0, + "message": { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_" + + f"_{i}_" + + tool_name + + "_" + + completion["id"], + "type": "function", + "function": { + "name": tool_name, + "arguments": completion["choices"][0]["text"], + }, + } + for i, (tool_name, completion) in enumerate( + zip(completions_tool_name, completions) + ) + ], + **function_call + }, + } + ], + "usage": { + "completion_tokens": sum( + completion["usage"]["completion_tokens"] + for completion in completions + ), + "prompt_tokens": sum( + completion["usage"]["prompt_tokens"] for completion in completions + ), + "total_tokens": sum( + completion["usage"]["total_tokens"] for completion in completions + ), + }, + } + + raise ValueError("Automatic streaming tool choice is not supported") diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index c3deba8..1b1befe 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -97,7 +97,7 @@ class CreateChatCompletionResponse(TypedDict): class ChatCompletionMessageToolCallChunkFunction(TypedDict): - name: str + name: Optional[str] arguments: str @@ -118,12 +118,12 @@ class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict): class ChatCompletionStreamResponseDelta(TypedDict): - content: NotRequired[str] + content: NotRequired[Optional[str]] function_call: NotRequired[ - ChatCompletionStreamResponseDeltaFunctionCall + Optional[ChatCompletionStreamResponseDeltaFunctionCall] ] # DEPRECATED - tool_calls: NotRequired[List[ChatCompletionMessageToolCallChunk]] - role: NotRequired[Literal["system", "user", "assistant", "tool"]] + tool_calls: NotRequired[Optional[List[ChatCompletionMessageToolCallChunk]]] + role: NotRequired[Optional[Literal["system", "user", "assistant", "tool"]]] class 
ChatCompletionStreamResponseChoice(TypedDict): @@ -132,6 +132,7 @@ class ChatCompletionStreamResponseChoice(TypedDict): ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty ] finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]] + logprobs: NotRequired[Optional[CompletionLogprobs]] class CreateChatCompletionStreamResponse(TypedDict): From cb791716b42eb897acf66b8b78c4a67b6e026a74 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 12 Feb 2024 16:19:05 -0500 Subject: [PATCH 17/38] fix: Always set logits_all = True when using speculative decoding --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 3efd95d..4869a9d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -281,7 +281,7 @@ class Llama: ) self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 self.context_params.mul_mat_q = mul_mat_q - self.context_params.logits_all = logits_all + self.context_params.logits_all = logits_all if draft_model is None else True # Must be set to True for speculative decoding self.context_params.embedding = embedding self.context_params.offload_kqv = offload_kqv From b82b0e10148659bceb5e79ae221a4bce2c54da79 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 12 Feb 2024 16:27:43 -0500 Subject: [PATCH 18/38] docs: Temporarily revert function calling docs --- README.md | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 59a7cd4..679c977 100644 --- a/README.md +++ b/README.md @@ -292,18 +292,22 @@ To constrain the response to a specific JSON Schema, you can use the `schema` pr ### Function Calling -The high-level API also provides a simple interface for function calling. +The high-level API also provides a simple interface for function calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat forma. -The only set of models that supports full function calling at this time is [functionary](https://github.com/MeetKai/functionary). The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). Functionary is able to intelligently call functions and also analyze any provided function outputs to generate coherent responses. All v2 models of functionary supports **parallel function calling**. You can provide either `functionary-v1` or `functionary-v2` for the `chat_format` when initializing the Llama class. - -Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide HF Tokenizer for functionary. The `LlamaHFTokenizer` class can be initialized and passed into the Llama class. This will override the default llama.cpp tokenizer used in Llama class. The tokenizer files are already included in the respective HF repositories hosting the gguf files. 
+The gguf-converted files for functionary can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF) ```python ->>> from llama_cpp import Llama, LlamaHFTokenizer ->>> tokenizer = LlamaHFTokenizer.from_pretrained("path/to/functionary/") ->>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", tokenizer=tokenizer, chat_format="functionary-v2") +>>> from llama_cpp import Llama +>>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", chat_format="functionary") +>>> # or +>>> llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling") >>> llm.create_chat_completion( messages = [ + { + "role": "system", + "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary" + + }, { "role": "user", "content": "Extract Jason is 25 years old" @@ -330,12 +334,12 @@ Note that due to discrepancies between llama.cpp and HuggingFace's tokenizers, i } } }], - tool_choice={ + tool_choice=[{ "type": "function", "function": { "name": "UserDetail" } - }, + }] ) ``` From d605875772a381d863b3960d14e8eeb52908b561 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 12 Feb 2024 16:28:30 -0500 Subject: [PATCH 19/38] Bump version --- CHANGELOG.md | 8 ++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ce0b43..d2bb710 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.40] + +- feat: Update llama.cpp to ggerganov/llama.cpp@3bdc4cd0f595a6096cca4a64aa75ffa8a3503465 +- feat: Generic chatml Function Calling using chat_format="chatml-function-calling"` by @abetlen in #957 +- fix: Circular dependancy preventing early Llama object free by @notwa in #1176 +- docs: Set the correct command for compiling with syscl support by @akarshanbiswas in #1172 +- feat: use gpu backend for clip if available by @iamlemec in #1175 + ## [0.2.39] - feat: Update llama.cpp to ggerganov/llama.cpp@b08f22c882a1443e6b97081f3ce718a4d1a741f8 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 837e3c9..ccafd02 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.39" \ No newline at end of file +__version__ = "0.2.40" \ No newline at end of file From 4348a6cdf057f5746db213867f93ed1359091fa3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 02:04:54 -0500 Subject: [PATCH 20/38] docs: Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 679c977..3d8d4d4 100644 --- a/README.md +++ b/README.md @@ -292,7 +292,7 @@ To constrain the response to a specific JSON Schema, you can use the `schema` pr ### Function Calling -The high-level API also provides a simple interface for function calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat forma. +The high-level API also provides a simple interface for function calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format. 
The gguf-converted files for functionary can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF) From 5efc45bdfde9c37db27dabecc2955ab9863506c9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 02:43:07 -0500 Subject: [PATCH 21/38] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 3bdc4cd..895407f 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 3bdc4cd0f595a6096cca4a64aa75ffa8a3503465 +Subproject commit 895407f31b358e3d9335e847d13f033491ec8a5b From d1822fed6b706f38bd1ff0de4dec5baaa3cf84fa Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 02:44:00 -0500 Subject: [PATCH 22/38] fix: Don't change order of json schema object properties unless prop_order is passed, Closes #1180 --- llama_cpp/llama_grammar.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py index d8ef563..3eb3b96 100644 --- a/llama_cpp/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -1471,12 +1471,15 @@ class SchemaConverter: if schema_type == "object" and "properties" in schema: # TODO: `required` keyword - prop_order = self._prop_order - prop_pairs = sorted( - schema["properties"].items(), - # sort by position in prop_order (if specified) then by key - key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]), - ) + if self._prop_order: + prop_order = self._prop_order + prop_pairs = sorted( + schema["properties"].items(), + # sort by position in prop_order (if specified) then by key + key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]), + ) + else: + prop_pairs = schema["properties"].items() rule = '"{" space' for i, (prop_name, prop_schema) in enumerate(prop_pairs): From 6fe8b427e1608782ad29b313130ba2fa3e4220b8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 02:46:52 -0500 Subject: [PATCH 23/38] Bump version --- CHANGELOG.md | 5 +++++ llama_cpp/__init__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2bb710..e8fcb80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.41] + +- feat: Update llama.cpp to ggerganov/llama.cpp@895407f31b358e3d9335e847d13f033491ec8a5b +- fix: Don't change order of json schema object properties in generated grammar unless prop_order is passed by @abetlen in d1822fed6b706f38bd1ff0de4dec5baaa3cf84fa + ## [0.2.40] - feat: Update llama.cpp to ggerganov/llama.cpp@3bdc4cd0f595a6096cca4a64aa75ffa8a3503465 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index ccafd02..6bc5e8a 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.40" \ No newline at end of file +__version__ = "0.2.41" \ No newline at end of file From 4b0e3320bd8c2c209e29978d0b21e2e471cc9ee3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 03:11:35 -0500 Subject: [PATCH 24/38] fix: minor formatting bugs for chatml-function-calling --- llama_cpp/llama_chat_format.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index af60d5f..66e40ae 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2072,12 +2072,14 @@ def 
chatml_function_calling( "{% if message.role == 'assistant' %}" ## Reglar message "{% if message.content and message.content | length > 0 %}" + "{% if tool_calls %}" "message:\n" + "{% endif %}" "{{ message.content }}" "\n<|im_end|>\n" "{% endif %}" ## Function calls - "{% if message.tool_calls %}" + "{% if 'tool_calls' in message %}" "{% for tool_call in message.tool_calls %}" "functions.{{ tool_call.function.name }}:\n" "{{ tool_call.function.arguments }}" From 68fb71b6a26a1e57331868f959b47ab4b87851e1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 03:24:41 -0500 Subject: [PATCH 25/38] fix: missing generation_prompt in chatml-function-calling --- llama_cpp/llama_chat_format.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 66e40ae..809a827 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2088,6 +2088,7 @@ def chatml_function_calling( "{% endif %}" "{% endif %}" "{% endfor %}" + "{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" ) template_renderer = jinja2.Environment( loader=jinja2.BaseLoader(), @@ -2130,6 +2131,7 @@ def chatml_function_calling( messages=messages, tools=[], tool_calls=None, + add_generation_prompt=True, ) if response_format is not None and response_format["type"] == "json_object": try: @@ -2363,6 +2365,7 @@ def chatml_function_calling( messages=messages, tools=tools, tool_calls=True, + add_generation_prompt=True, ) prompt += f"functions.{tool_name}:\n" try: @@ -2420,6 +2423,7 @@ def chatml_function_calling( messages=messages, tools=tools, tool_calls=True, + add_generation_prompt=True, ) completion_or_chunks = llama.create_completion( prompt=prompt, From f7cdf78788da3ef33e3d3a482998d756ee47e8e3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 12:24:00 -0500 Subject: [PATCH 26/38] Update llama.cpp --- llama_cpp/llama_cpp.py | 18 ++++++++++++++++++ vendor/llama.cpp | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 2724edd..9979a67 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -470,6 +470,7 @@ class llama_model_params(Structure): # bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) # bool embedding; // embedding mode only # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU +# bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) # }; class llama_context_params(Structure): """Parameters for llama_context @@ -496,6 +497,7 @@ class llama_context_params(Structure): logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) embedding (bool): embedding mode only offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU + do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) """ _fields_ = [ @@ -520,6 +522,7 @@ class llama_context_params(Structure): ("logits_all", c_bool), ("embedding", c_bool), ("offload_kqv", c_bool), + ("do_pooling", c_bool), ] @@ -1699,6 +1702,21 @@ _lib.llama_get_embeddings.argtypes = [llama_context_p] _lib.llama_get_embeddings.restype = c_float_p +# // Get the embeddings for the ith sequence +# // llama_get_embeddings(ctx) + i*n_embd +# LLAMA_API float * llama_get_embeddings_ith(struct 
llama_context * ctx, int32_t i); +def llama_get_embeddings_ith( + ctx: llama_context_p, i: Union[c_int32, int] +): # type: (...) -> Array[float] # type: ignore + """Get the embeddings for the ith sequence + llama_get_embeddings(ctx) + i*n_embd""" + return _lib.llama_get_embeddings_ith(ctx, i) + + +_lib.llama_get_embeddings_ith.argtypes = [llama_context_p, c_int32] +_lib.llama_get_embeddings_ith.restype = c_float_p + + # // # // Vocab # // diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 895407f..ea9c8e1 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 895407f31b358e3d9335e847d13f033491ec8a5b +Subproject commit ea9c8e11436ad50719987fa23a289c74b7b40d40 From d6be5333e1e28dd07cfec5babd6332c7d1f50788 Mon Sep 17 00:00:00 2001 From: Andrew Lapp Date: Tue, 13 Feb 2024 17:26:07 +0000 Subject: [PATCH 27/38] fix: sample idx off-by-one error for logit_processors (#1179) * fix sample_idx off-by-one error * self._scores is indexed differently, only modify the index within self._input_ids --------- Co-authored-by: Andrew Lapp Co-authored-by: Andrei --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4869a9d..8d726d3 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -557,7 +557,7 @@ class Llama: logits[:] = ( logits_processor(self._input_ids, logits) if idx is None - else logits_processor(self._input_ids[:idx], logits) + else logits_processor(self._input_ids[:idx + 1], logits) ) sampling_params = _LlamaSamplingParams( From b1637c2319936df0ecf1b3eb18ca971b346a147e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 12:35:04 -0500 Subject: [PATCH 28/38] Bump version --- CHANGELOG.md | 6 ++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e8fcb80..dbc4dca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.42] + +- feat: Update llama.cpp to ggerganov/llama.cpp@ea9c8e11436ad50719987fa23a289c74b7b40d40 +- fix: sample idx off-by-one error for logit_processors by @lapp0 in #1179 +- fix: chat formatting bugs in `chatml-function-calling` by @abetlen in 4b0e3320bd8c2c209e29978d0b21e2e471cc9ee3 and 68fb71b6a26a1e57331868f959b47ab4b87851e1 + ## [0.2.41] - feat: Update llama.cpp to ggerganov/llama.cpp@895407f31b358e3d9335e847d13f033491ec8a5b diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 6bc5e8a..6e71792 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.41" \ No newline at end of file +__version__ = "0.2.42" \ No newline at end of file From 345215a76cf57b769474ea5dc1aefc5ccfb06d5c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 23:02:50 -0500 Subject: [PATCH 29/38] fix: more chatml-function-calling fixes --- llama_cpp/llama_chat_format.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 809a827..7f365e3 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2061,12 +2061,12 @@ def chatml_function_calling( "\nfunctions.:" '\n{ "arg1": "value1", "arg2": "value2" }' "{% endif %}" - "\n<|im_end|>\n" + "<|im_end|>\n" "{% endif %}" # User message "{% if message.role == 'user' %}" "{{ message.content }}" - "\n<|im_end|>\n" + 
"<|im_end|>\n" "{% endif %}" # Assistant message "{% if message.role == 'assistant' %}" @@ -2076,7 +2076,7 @@ def chatml_function_calling( "message:\n" "{% endif %}" "{{ message.content }}" - "\n<|im_end|>\n" + "<|im_end|>\n" "{% endif %}" ## Function calls "{% if 'tool_calls' in message %}" @@ -2084,11 +2084,11 @@ def chatml_function_calling( "functions.{{ tool_call.function.name }}:\n" "{{ tool_call.function.arguments }}" "{% endfor %}" - "\n<|im_end|>\n" + "<|im_end|>\n" "{% endif %}" "{% endif %}" "{% endfor %}" - "{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" ) template_renderer = jinja2.Environment( loader=jinja2.BaseLoader(), @@ -2120,6 +2120,8 @@ def chatml_function_calling( }, } + stop = [stop, "<|im_end|>"] if isinstance(stop, str) else stop + ["<|im_end|>"] if stop else ["<|im_end|>"] + # Case 1: No tool choice by user if ( tool_choice is None From 7dbbfdecadebe7750be650d9409959640ff9a460 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 23:53:56 -0500 Subject: [PATCH 30/38] fix: submodule kompute is not included in sdist. Closes #1165 --- .github/workflows/build-and-release.yaml | 4 ++-- .github/workflows/build-docker.yaml | 2 +- .github/workflows/publish-to-test.yaml | 2 +- .github/workflows/publish.yaml | 2 +- .github/workflows/test.yaml | 10 +++++----- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 61027ef..63c81f1 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -16,7 +16,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - submodules: "true" + submodules: "recursive" # Used to host cibuildwheel - uses: actions/setup-python@v3 @@ -48,7 +48,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - submodules: "true" + submodules: "recursive" - uses: actions/setup-python@v3 with: python-version: "3.8" diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 27a6b1e..750b91e 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -14,7 +14,7 @@ jobs: - name: Checkout uses: actions/checkout@v3 with: - submodules: "true" + submodules: "recursive" - name: Set up QEMU uses: docker/setup-qemu-action@v2 diff --git a/.github/workflows/publish-to-test.yaml b/.github/workflows/publish-to-test.yaml index 9932d61..47e7c40 100644 --- a/.github/workflows/publish-to-test.yaml +++ b/.github/workflows/publish-to-test.yaml @@ -18,7 +18,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - submodules: "true" + submodules: "recursive" - name: Set up Python uses: actions/setup-python@v4 with: diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 7d6c970..1afdd66 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -12,7 +12,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - submodules: "true" + submodules: "recursive" - name: Set up Python uses: actions/setup-python@v4 with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 2cc6fb0..77df546 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -19,7 +19,7 @@ jobs: steps: - uses: actions/checkout@v4 with: - submodules: "true" + submodules: "recursive" - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: @@ -42,7 +42,7 @@ jobs: steps: - uses: actions/checkout@v3 
with: - submodules: "true" + submodules: "recursive" - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: @@ -65,7 +65,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - submodules: "true" + submodules: "recursive" - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: @@ -85,7 +85,7 @@ jobs: # steps: # - uses: actions/checkout@v3 # with: - # submodules: "true" + # submodules: "recursive" # - name: Set up Python 3.8 # uses: actions/setup-python@v4 # with: @@ -112,7 +112,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - submodules: "true" + submodules: "recursive" - name: Set up Python 3.8 uses: actions/setup-python@v4 with: From 7a79e5ac493a3e25a38861828d3e0be3b3c71771 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 23:54:05 -0500 Subject: [PATCH 31/38] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ea9c8e1..f5ca054 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit ea9c8e11436ad50719987fa23a289c74b7b40d40 +Subproject commit f5ca054855dea83f424003162f26de376e5643f6 From 07a783779a62a4aac0b11161c7e0eb983ff215f8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 13 Feb 2024 23:57:10 -0500 Subject: [PATCH 32/38] fix: Update openbuddy prompt format. Closes #1155 --- llama_cpp/llama_chat_format.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 7f365e3..8dd0ddf 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -734,17 +734,14 @@ def format_openbuddy( messages: List[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: - _system_message = """Consider a conversation between User (a human) and Assistant (named Buddy). -Buddy is an INTP-T, a friendly, intelligent and multilingual AI assistant, by OpenBuddy team. GitHub: https://github.com/OpenBuddy/OpenBuddy -Buddy cannot access the Internet. -Buddy can fluently speak the user's language (e.g. English, Chinese). -Buddy can generate poems, stories, code, essays, songs, parodies, and more. -Buddy possesses vast knowledge about the world, history, and culture. -Buddy's responses are always safe, creative, high-quality, human-like, and interesting. -Buddy strictly refuses to discuss political, NSFW, or other unsafe topics. + _system_message = """You are a helpful, respectful and honest INTP-T AI Assistant named Buddy. You are talking to a human User. +Always answer as helpfully and logically as possible, while being safe. Your answers should not include any harmful, political, religious, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. +You can speak fluently in many languages, for example: English, Chinese. +You cannot access the internet, but you have vast knowledge, cutoff: 2021-09. +You are trained by OpenBuddy team, (https://openbuddy.ai, https://github.com/OpenBuddy/OpenBuddy), you are based on LLaMA and Falcon transformers model, not related to GPT or OpenAI. -User: Hi. -Assistant: Hi, I'm Buddy, your AI assistant. 
How can I help you today?""" +""" _roles = dict(user="User", assistant="Assistant") _sep = "\n" system_message = _system_message From 6943bab6d817bf71927642ab29e25b94a01fd22c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Feb 2024 03:38:41 -0500 Subject: [PATCH 33/38] fix: destructor exception where internal classes are missing some uninitialized attributes --- llama_cpp/_internals.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 3a71ef0..9473d35 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -42,6 +42,8 @@ class _LlamaModel: self._llama_free_model = llama_cpp._lib.llama_free_model # type: ignore + self.model = None + if not os.path.exists(path_model): raise ValueError(f"Model path does not exist: {path_model}") @@ -248,6 +250,7 @@ class _LlamaContext: self.verbose = verbose self._llama_free = llama_cpp._lib.llama_free # type: ignore + self.ctx = None assert self.model.model is not None @@ -497,6 +500,7 @@ class _LlamaBatch: self._llama_batch_free = llama_cpp._lib.llama_batch_free # type: ignore + self.batch = None self.batch = llama_cpp.llama_batch_init( self.n_tokens, self.embd, self.n_seq_max ) From 7b9960d1cbeeca2df6cc3ada6614bc12b2b309fc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Feb 2024 03:47:21 -0500 Subject: [PATCH 34/38] Update llama.cpp --- llama_cpp/llava_cpp.py | 71 +----------------------------------------- vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 71 deletions(-) diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index b1f90b9..8195bd4 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -146,30 +146,8 @@ _libllava.llava_eval_image_embed.restype = c_bool ################################################ -# struct clip_vision_hparams { -# int32_t image_size; -# int32_t patch_size; -# int32_t hidden_size; -# int32_t n_intermediate; -# int32_t projection_dim; -# int32_t n_head; -# int32_t n_layer; -# float eps; -# }; -class clip_vision_hparams(Structure): - _fields_ = [ - ("image_size", c_int32), - ("patch_size", c_int32), - ("hidden_size", c_int32), - ("n_intermediate", c_int32), - ("projection_dim", c_int32), - ("n_head", c_int32), - ("n_layer", c_int32), - ("eps", c_float), - ] - # /** load mmproj model */ -# CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity); +# CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); def clip_model_load(fname: bytes, verbosity: Union[c_int, int]) -> clip_ctx_p: return _libllava.clip_model_load(fname, verbosity) @@ -183,50 +161,3 @@ def clip_free(ctx: clip_ctx_p): _libllava.clip_free.argtypes = [clip_ctx_p] _libllava.clip_free.restype = None - -# size_t clip_embd_nbytes(const struct clip_ctx * ctx); -# int clip_n_patches(const struct clip_ctx * ctx); -# int clip_n_mmproj_embd(const struct clip_ctx * ctx); - -# // RGB uint8 image -# struct clip_image_u8 { -# int nx; -# int ny; -# uint8_t * data = NULL; -# size_t size; -# }; - -# // RGB float32 image (NHWC) -# // Memory layout: RGBRGBRGB... 
-# struct clip_image_f32 { -# int nx; -# int ny; -# float * data = NULL; -# size_t size; -# }; - -# struct clip_image_u8_batch { -# struct clip_image_u8 * data; -# size_t size; -# }; - -# struct clip_image_f32_batch { -# struct clip_image_f32 * data; -# size_t size; -# }; - -# struct clip_image_u8 * make_clip_image_u8(); -# struct clip_image_f32 * make_clip_image_f32(); -# CLIP_API void clip_image_u8_free(clip_image_u8 * img); -# CLIP_API void clip_image_f32_free(clip_image_f32 * img); -# CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); -# /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ -# CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); - -# bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square); -# bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec); - -# bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs, -# float * vec); - -# bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype); \ No newline at end of file diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f5ca054..aa23412 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f5ca054855dea83f424003162f26de376e5643f6 +Subproject commit aa2341298924ac89778252015efcb792f2df1e20 From 36b843228f04bd09b642d1500bdf2910f6196c8f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Feb 2024 03:47:40 -0500 Subject: [PATCH 35/38] misc: fix makefile build commands --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ff1484c..e2ce4d0 100644 --- a/Makefile +++ b/Makefile @@ -19,10 +19,10 @@ build.opencl: CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e . build.openblas: - CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e . + CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" python3 -m pip install --verbose -e . build.blis: - CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" python3 -m pip install --verbose -e . + CMAKE_ARGS="-DLLAMA_BLAS=on -DLLAMA_BLAS_VENDOR=FLAME" python3 -m pip install --verbose -e . build.metal: CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install --verbose -e . 
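
Note: the corrected `build.openblas` and `build.blis` targets above only change which `CMAKE_ARGS` get forwarded to `pip install`; the same flags can be exported manually when installing `llama-cpp-python` outside the Makefile. A quick way to confirm that a rebuilt wheel actually picked up a BLAS backend is to inspect llama.cpp's system-info string through the low-level binding. This is an illustrative sketch rather than anything the patches add, and the exact set of flags in the string varies with the vendored llama.cpp revision.

```python
# Sanity check: inspect the compile-time feature flags of an installed wheel.
import llama_cpp

# llama_print_system_info() returns llama.cpp's feature string as bytes,
# e.g. "AVX = 1 | AVX2 = 1 | ... | BLAS = 1 | ...".
info = llama_cpp.llama_print_system_info().decode("utf-8")
print(info)

if "BLAS = 1" not in info:
    print("warning: this build does not appear to include a BLAS backend")
```
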
From d7a67917ba5b601e146377c6d877893dc49bba83 Mon Sep 17 00:00:00 2001 From: Douglas Hanley Date: Wed, 14 Feb 2024 03:26:09 -0600 Subject: [PATCH 36/38] feat: Support batch embeddings (#1186) * handle batched embeddings * fix normalization issue * fix type hints, ensure no breaking changes to embed * Clear kv cache / reset internal state after embedding complete --------- Co-authored-by: Andrei --- llama_cpp/_internals.py | 22 +++++++ llama_cpp/llama.py | 135 ++++++++++++++++++++++++++++++---------- 2 files changed, 123 insertions(+), 34 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 9473d35..c60fdff 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -510,6 +510,14 @@ class _LlamaBatch: self._llama_batch_free(self.batch) self.batch = None + def n_tokens(self) -> int: + assert self.batch is not None + return self.batch.n_tokens + + def reset(self): + assert self.batch is not None + self.batch.n_tokens = 0 + def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool): assert self.batch is not None n_tokens = len(batch) @@ -522,6 +530,20 @@ class _LlamaBatch: self.batch.logits[i] = logits_all self.batch.logits[n_tokens - 1] = True + def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool): + assert self.batch is not None + n_tokens = len(batch) + n_tokens0 = self.batch.n_tokens + self.batch.n_tokens += n_tokens + for i in range(n_tokens): + j = n_tokens0 + i + self.batch.token[j] = batch[i] + self.batch.pos[j] = i + self.batch.seq_id[j][0] = seq_id + self.batch.n_seq_id[j] = 1 + self.batch.logits[j] = logits_all + self.batch.logits[n_tokens - 1] = True + class _LlamaTokenDataArray: def __init__(self, *, n_vocab: int): diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 8d726d3..3e09a20 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -717,10 +717,53 @@ class Llama: Returns: An embedding object. """ - assert self._ctx.ctx is not None assert self._model.model is not None model_name: str = model if model is not None else self.model_path + # get numeric embeddings + embeds: List[List[float]] + total_tokens: int + embeds, total_tokens = self.embed(input, return_count=True) # type: ignore + + # convert to CreateEmbeddingResponse + data: List[Embedding] = [ + { + "object": "embedding", + "embedding": emb, + "index": idx, + } + for idx, emb in enumerate(embeds) + ] + + return { + "object": "list", + "data": data, + "model": model_name, + "usage": { + "prompt_tokens": total_tokens, + "total_tokens": total_tokens, + }, + } + + def embed( + self, + input: Union[str, List[str]], + normalize: bool = True, + truncate: bool = True, + return_count: bool = False, + ): + """Embed a string. + + Args: + input: The utf-8 encoded string to embed. 
+ + Returns: + A list of embeddings + """ + assert self._ctx.ctx is not None + n_embd = self.n_embd() + n_ctx = self.n_ctx() + if self.context_params.embedding == False: raise RuntimeError( "Llama model must be created with embedding=True to call this method" @@ -734,48 +777,72 @@ class Llama: else: inputs = input - data: List[Embedding] = [] + # reset batch + self._batch.reset() + + # decode and fetch embeddings + data: List[List[float]] = [] + def decode_batch(sizes: List[int]): + assert self._ctx.ctx is not None + llama_cpp.llama_kv_cache_clear(self._ctx.ctx) + self._ctx.decode(self._batch) + self._batch.reset() + + # store embeddings + for i, s in enumerate(sizes): + embedding = llama_cpp.llama_get_embeddings_ith(self._ctx.ctx, i)[ + :n_embd + ] + norm = np.linalg.norm(embedding) if normalize else s + embedding: List[float] = [v / float(norm) for v in embedding] + data.append(embedding) + + # init state total_tokens = 0 - for index, input in enumerate(inputs): - tokens = self.tokenize(input.encode("utf-8"), special=True) - self.reset() - self.eval(tokens) + t_batch = 0 + s_sizes: List[int] = [] + + # accumulate batches and encode + for text in inputs: + tokens = self.tokenize(text.encode("utf-8")) + if truncate: + tokens = tokens[:n_ctx] + n_tokens = len(tokens) total_tokens += n_tokens - embedding = llama_cpp.llama_get_embeddings(self._ctx.ctx)[ - : llama_cpp.llama_n_embd(self._model.model) - ] - data.append( - { - "object": "embedding", - "embedding": embedding, - "index": index, - } - ) + # check for overrun + if n_tokens > n_ctx: + raise ValueError( + f"Requested tokens ({n_tokens}) exceed context window of {n_ctx}" + ) + + # time to eval batch + if t_batch + n_tokens > self._n_ctx: + decode_batch(s_sizes) + t_batch = 0 + s_sizes = [] + + # add to batch + self._batch.add_sequence(tokens, len(s_sizes), False) + t_batch += n_tokens + s_sizes.append(n_tokens) + + # hanlde last batch + decode_batch(s_sizes) + if self.verbose: llama_cpp.llama_print_timings(self._ctx.ctx) - return { - "object": "list", - "data": data, - "model": model_name, - "usage": { - "prompt_tokens": total_tokens, - "total_tokens": total_tokens, - }, - } + output = data[0] if isinstance(input, str) else data - def embed(self, input: str) -> List[float]: - """Embed a string. + llama_cpp.llama_kv_cache_clear(self._ctx.ctx) + self.reset() - Args: - input: The utf-8 encoded string to embed. 
- - Returns: - A list of embeddings - """ - return list(map(float, self.create_embedding(input)["data"][0]["embedding"])) + if return_count: + return output, total_tokens + else: + return output def _create_completion( self, From c336f782693c447a13da250ee12facb535708981 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Feb 2024 04:27:30 -0500 Subject: [PATCH 37/38] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index aa23412..8084d55 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit aa2341298924ac89778252015efcb792f2df1e20 +Subproject commit 8084d554406b767d36b3250b3b787462d5dd626f From ae71ad1a147b10c2c3ba99eb086521cddcc4fad4 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 14 Feb 2024 04:31:42 -0500 Subject: [PATCH 38/38] Bump version --- CHANGELOG.md | 7 +++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dbc4dca..39b553f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.43] + +- feat: Update llama.cpp to ggerganov/llama.cpp@8084d554406b767d36b3250b3b787462d5dd626f +- feat: Support batch embeddings by @iamlemec in #1186 +- fix: submodule kompute is not included in sdist by @abetlen in 7dbbfdecadebe7750be650d9409959640ff9a460 +- fix: fix: Update openbuddy prompt format by @abetlen in 07a783779a62a4aac0b11161c7e0eb983ff215f8 + ## [0.2.42] - feat: Update llama.cpp to ggerganov/llama.cpp@ea9c8e11436ad50719987fa23a289c74b7b40d40 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 6e71792..e0bd254 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.42" \ No newline at end of file +__version__ = "0.2.43" \ No newline at end of file
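
Taken together, [PATCH 36] through [PATCH 38] ship batch embeddings in 0.2.43: `Llama.embed` now accepts either a single string or a list of strings, packs the tokenized inputs into shared batches, and returns one (by default normalized) vector per input, while `create_embedding` wraps the same path in an OpenAI-style response. A minimal usage sketch follows; the model path is a placeholder, and any GGUF model loaded with `embedding=True` behaves the same way.

```python
from llama_cpp import Llama

# Placeholder path; the model must be loaded with embedding=True,
# otherwise the embedding methods raise a RuntimeError.
llm = Llama(model_path="path/to/model.gguf", embedding=True, verbose=False)

# A single string returns one embedding vector (a list of floats).
vec = llm.embed("llama-cpp-python now decodes embeddings in batches")

# A list of strings returns one vector per input, decoded batch by batch.
vecs = llm.embed(["first sentence", "second sentence"])

# The OpenAI-compatible wrapper reports token usage across the whole request.
resp = llm.create_embedding(["first sentence", "second sentence"])
print(len(vec), len(vecs), resp["usage"]["total_tokens"])
```
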