From 8d298b47507c9d82945f0add20d1ddb3e8ea0aa4 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 18 Mar 2024 10:26:36 -0400 Subject: [PATCH 01/11] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 85 ++++++++++++++++++++++++++++++++++++++---- vendor/llama.cpp | 2 +- 2 files changed, 78 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index b9593cf..6b5c1bc 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -581,6 +581,7 @@ class llama_model_params(ctypes.Structure): # bool embeddings; // if true, extract embeddings (together with logits) # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU + # // Abort callback # // if it returns true, execution of llama_decode() will be aborted # // currently works only with CPU execution @@ -1006,6 +1007,11 @@ def llama_n_ctx_train(model: llama_model_p, /) -> int: ... def llama_n_embd(model: llama_model_p, /) -> int: ... +# LLAMA_API int32_t llama_n_layer (const struct llama_model * model); +@ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32) +def llama_n_layer(model: llama_model_p, /) -> int: ... + + # // Get the model's RoPE frequency scaling factor # LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model); @ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float) @@ -1166,12 +1172,18 @@ def llama_model_quantize( ... +# // Apply a LoRA adapter to a loaded model +# // path_base_model is the path to a higher quality model to use as a base for +# // the layers modified by the adapter. Can be NULL to use the current loaded model. +# // The model needs to be reloaded before applying a new adapter, otherwise the adapter +# // will be applied on top of the previous one +# // Returns 0 on success # LLAMA_API int32_t llama_model_apply_lora_from_file( # const struct llama_model * model, -# const char * path_lora, -# float scale, -# const char * path_base_model, -# int32_t n_threads); +# const char * path_lora, +# float scale, +# const char * path_base_model, +# int32_t n_threads); @ctypes_function( "llama_model_apply_lora_from_file", [ @@ -1190,7 +1202,57 @@ def llama_model_apply_lora_from_file( path_base_model: Union[ctypes.c_char_p, bytes, None], n_threads: Union[ctypes.c_int32, int], /, -) -> int: ... +) -> int: + """Apply a LoRA adapter to a loaded model + path_base_model is the path to a higher quality model to use as a base for + the layers modified by the adapter. Can be NULL to use the current loaded model. + The model needs to be reloaded before applying a new adapter, otherwise the adapter + will be applied on top of the previous one + Returns 0 on success""" + ... + + +# // Apply a loaded control vector to a llama_context, or if data is NULL, clear +# // the currently loaded vector. +# // n_embd should be the size of a single layer's control, and data should point +# // to an n_embd x n_layers buffer starting from layer 1. +# // il_start and il_end are the layer range the vector should apply to (both inclusive) +# // See llama_control_vector_load in common to load a control vector. 
+# LLAMA_API int32_t llama_control_vector_apply( +# struct llama_context * lctx, +# const float * data, +# size_t len, +# int32_t n_embd, +# int32_t il_start, +# int32_t il_end); +@ctypes_function( + "llama_control_vector_apply", + [ + llama_context_p_ctypes, + ctypes.POINTER(ctypes.c_float), + ctypes.c_size_t, + ctypes.c_int32, + ctypes.c_int32, + ctypes.c_int32, + ], + ctypes.c_int32, +) +def llama_control_vector_apply( + lctx: llama_context_p, + data: CtypesPointerOrRef[ctypes.c_float], + len: int, + n_embd: int, + il_start: int, + il_end: int, + /, +) -> int: + """Apply a loaded control vector to a llama_context, or if data is NULL, clear + the currently loaded vector. + n_embd should be the size of a single layer's control, and data should point + to an n_embd x n_layers buffer starting from layer 1. + il_start and il_end are the layer range the vector should apply to (both inclusive) + See llama_control_vector_load in common to load a control vector.""" + ... # // @@ -1205,6 +1267,12 @@ def llama_model_apply_lora_from_file( # llama_pos pos; # }; class llama_kv_cache_view_cell(ctypes.Structure): + """Information associated with an individual cell in the KV cache view. + + Attributes: + pos (llama_pos): The position for this cell. Takes KV cache shifts into account. + May be negative if the cell is not populated.""" + _fields_ = [("pos", llama_pos)] @@ -1985,7 +2053,7 @@ def llama_tokenize( /, ) -> int: """Convert the provided text into tokens. - + Args: model: The model to use for tokenization. text: The text to tokenize. @@ -1995,10 +2063,11 @@ def llama_tokenize( add_bos: Whether to add a beginning-of-sentence token. special: Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space. - + Returns: Returns the number of tokens on success, no more than n_tokens_max - Returns a negative number on failure - the number of tokens that would have been returned""" + Returns a negative number on failure - the number of tokens that would have been returned + """ ... 
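Usage sketch for the new low-level bindings added above (llama_n_layer and llama_control_vector_apply), assuming a model and context have already been created through the existing llama_cpp.llama_cpp bindings; the zero-valued buffer is only a placeholder for real control-vector data (normally produced by llama_control_vector_load on the C++ side):

    # Sketch: apply a flat control vector to every layer of a loaded context.
    # Assumes `model` (llama_model_p) and `ctx` (llama_context_p) already exist,
    # e.g. via llama_load_model_from_file / llama_new_context_with_model.
    import ctypes
    import llama_cpp

    n_embd = llama_cpp.llama_n_embd(model)
    n_layer = llama_cpp.llama_n_layer(model)  # new binding from this patch

    # Placeholder data: n_embd floats per layer, covering layers 1..n_layer.
    n_floats = n_embd * n_layer
    values = (ctypes.c_float * n_floats)(*([0.0] * n_floats))

    ret = llama_cpp.llama_control_vector_apply(
        ctx,        # llama_context_p
        values,     # float* buffer starting from layer 1
        n_floats,   # total number of floats in the buffer
        n_embd,     # size of a single layer's control
        1,          # il_start (inclusive)
        n_layer,    # il_end (inclusive)
    )
    if ret != 0:
        raise RuntimeError("llama_control_vector_apply failed")

    # Passing data=None clears the currently loaded control vector:
    llama_cpp.llama_control_vector_apply(ctx, None, 0, n_embd, 1, n_layer)
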
diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4e9a7f7..ac9ee6a 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc +Subproject commit ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1 From 8a60c7bc8cae7aa9770eeac0f482d39350763a6f Mon Sep 17 00:00:00 2001 From: Jeffrey Fong Date: Mon, 18 Mar 2024 22:40:57 +0800 Subject: [PATCH 02/11] fix: Fix and optimize functionary chat handler (#1282) * fix functionary chat logic * further fixes --------- Co-authored-by: Andrei --- llama_cpp/llama_chat_format.py | 131 ++++++++++++++++----------------- 1 file changed, 65 insertions(+), 66 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 81ca552..c89cce8 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1596,13 +1596,15 @@ def functionary_v1_v2_chat_handler( function_call = ( tool_choice if isinstance(tool_choice, str) else tool_choice["function"] ) + else: + function_call = "auto" prompt = prepare_messages_for_inference( messages, tokenizer, version, functions, tools ) # If no tools/functions are provided - if function_call is None and (functions is None or len(functions) == 0): + if function_call == "none" or functions is None or len(functions) == 0: if version == "v1": stop = END_ASSISTANT_TOKEN else: @@ -1630,6 +1632,7 @@ def functionary_v1_v2_chat_handler( logits_processor=logits_processor, grammar=grammar, ) + completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip() return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore assert stream is False # TODO: support stream mode @@ -1692,13 +1695,12 @@ def functionary_v1_v2_chat_handler( return completion + content = "" function_calls, function_bodies = [], [] if version == "v1": # If no or "auto" tool_choice/function_call - if function_call is None or ( - isinstance(function_call, str) and function_call == "auto" - ): + if isinstance(function_call, str) and function_call == "auto": stops = ["\n", END_ASSISTANT_TOKEN] # If tool_choice/function_call is "none" elif isinstance(function_call, str) and function_call == "none": @@ -1747,70 +1749,67 @@ def functionary_v1_v2_chat_handler( else: function_bodies.append(completion_text.strip()) else: - # Loop until all parallel function calls are generated - while True: - # If no or "auto" tool_choice/function_call - if function_call is None or ( - isinstance(function_call, str) and function_call == "auto" - ): - grammar = None - stops = CONTENT_TOKEN - # If tool_choice/function_call is "none" - elif isinstance(function_call, str) and function_call == "none": - prompt = ( - prepare_messages_for_inference(messages, tokenizer, version, [], []) - + "all\n<|content|>" - ) - stops = STOP_TOKEN - # If tool_choice/function_call is provided - elif isinstance(function_call, dict): - prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" - stops = STOP_TOKEN - function_call = function_call["name"] - function_calls.append(function_call) - grammar = get_grammar(function_call) - else: - prompt = prompt - stops = STOP_TOKEN - + # If tool_choice/function_call is "none" + if isinstance(function_call, str) and function_call == "none": + prompt = ( + prepare_messages_for_inference(messages, tokenizer, version, [], []) + + "all\n<|content|>" + ) + stops = [STOP_TOKEN, FROM_TOKEN] + completion = create_completion(stop=stops) + completion["choices"][0]["text"] = 
completion["choices"][0]["text"].strip() + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + # If tool_choice/function_call is provided + elif isinstance(function_call, dict): + prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" + function_call = function_call["name"] + function_calls.append(function_call) + grammar = get_grammar(function_call) + stops = [STOP_TOKEN, FROM_TOKEN] completion = create_completion(stop=stops) completion_text = completion["choices"][0]["text"] - - # If the generation does not involve a function call - if prompt.endswith("all\n<|content|>") and not completion_text.startswith( - "all" - ): - return _convert_completion_to_chat(completion, stream=stream) # type: ignore - # Generate model response if the model decides not to call any function - elif prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all"): - prompt += completion_text + CONTENT_TOKEN - completion = create_completion(stop=STOP_TOKEN) - return _convert_completion_to_chat(completion, stream=stream) # type: ignore - # Generate parameters if model decides to call a function - elif prompt.endswith(RECIPIENT_TOKEN): - function_calls.append(completion_text[:-1]) - grammar = get_grammar(function_calls[-1]) - completion = create_completion(stop=[STOP_TOKEN, "\n"]) - function_bodies.append(completion["choices"][0]["text"].strip()) - prompt += f"{function_calls[-1]}\n{CONTENT_TOKEN}{function_bodies[-1]}" + function_bodies.append(completion_text.strip()) + # If "auto" or no tool_choice/function_call + elif isinstance(function_call, str) and function_call == "auto": + while True: + # Generate function name first grammar = None - - # Try to generate the beginning of next turn - # If empty completion, break from loop - next_turn_completion_text = create_completion( - stop=[STOP_TOKEN, RECIPIENT_TOKEN] - )["choices"][0]["text"] - if len(next_turn_completion_text) > 0: - prompt += f"\n{FROM_TOKEN}assistant\n{RECIPIENT_TOKEN}" + stops = CONTENT_TOKEN + completion = create_completion(stop=stops) + completion_text = completion["choices"][0]["text"] + function_name = completion_text.strip() + if function_name == "all": + prompt += "all\n<|content|>" else: - break - # Break from loop if tool_choice/function_call is provided as a dict - else: - function_bodies.append(completion_text.strip()) - break + function_call = completion_text.strip() + prompt += f"{function_call}\n<|content|>" + function_calls.append(function_call) + grammar = get_grammar(function_call) + # Generate content + stops = [RECIPIENT_TOKEN, STOP_TOKEN] + completion = create_completion(stop=stops) + completion_text = completion["choices"][0]["text"] + if function_name == "all": + content += completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n") + content = content.lstrip() + # Check whether the model wants to generate another turn + if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text: + cleaned_completion_text = completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n").strip() + prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>" + else: + break + else: + function_bodies.append(completion_text.strip()) + # Check whether the model wants to generate another turn + prompt += completion_text.strip() + grammar = None + completion = create_completion(stop=stops) + if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]: + prompt += 
"\n<|from|>assistant\n<|recipient|>" + else: + break assert "usage" in completion - assert len(function_calls) > 0 assert len(function_calls) == len(function_bodies) tool_calls = [] @@ -1843,14 +1842,14 @@ def functionary_v1_v2_chat_handler( "index": 0, "message": { "role": "assistant", - "content": None, + "content": None if content == "" else content, "function_call": { "name": tool_calls[0]["function"]["name"], "arguments": tool_calls[0]["function"]["arguments"], - }, - "tool_calls": tool_calls, + } if len(tool_calls) > 0 else None, + "tool_calls": tool_calls if len(tool_calls) > 0 else None, }, - "finish_reason": "tool_calls", + "finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop", } ], usage=completion["usage"], From bf64752535ac73032a25d6ba9ae0f064246964f2 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 18 Mar 2024 11:37:30 -0400 Subject: [PATCH 03/11] chore: Bump version --- CHANGELOG.md | 7 +++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 90dd1e6..a85eaa4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.57] + +- feat: Update llama.cpp to ggerganov/llama.cpp@ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1 +- fix: set default embedding pooling type to unspecified by @abetlen in 4084aabe867b8ec2aba1b22659e59c9318b0d1f3 +- fix: Fix and optimize functionary chat handler by @jeffrey-fong in #1282 +- fix: json mode for basic chat formats by @abetlen in 20e6815252d0efd9f015f7adbf108faaf36e3f3c + ## [0.2.56] - feat: Update llama.cpp to ggerganov/llama.cpp@c2101a2e909ac7c08976d414e64e96c90ee5fa9e diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index fcbc715..1e802fa 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.56" \ No newline at end of file +__version__ = "0.2.57" \ No newline at end of file From 18d7ce918f45bc2bcf80102239142c49ebe29925 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 19 Mar 2024 04:40:24 -0400 Subject: [PATCH 04/11] feat: Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index ac9ee6a..b80cf3b 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1 +Subproject commit b80cf3b2d1dee0ad325f7a794fecc66befce7336 From 60d8498f212ca1eb4303d95610022a055718bbb8 Mon Sep 17 00:00:00 2001 From: Andrei Date: Tue, 19 Mar 2024 04:55:57 -0400 Subject: [PATCH 05/11] feat: Add tools/functions variables to Jinja2ChatFormatter, add function response formatting for all simple chat formats (#1273) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add tools/functions variables to Jinja2ChatFormatter Also fixed missing tools/tool_choices parameters in chat_formatter_to_chat_completion_handler(). 
* Set grammar when doing explicit function calling * Add function / tool response for all chat formats --------- Co-authored-by: Sigbjørn Skjæret --- llama_cpp/llama_chat_format.py | 408 +++++++++++++++++++-------------- 1 file changed, 233 insertions(+), 175 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index c89cce8..5bda163 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -188,6 +188,10 @@ class Jinja2ChatFormatter(ChatFormatter): self, *, messages: List[llama_types.ChatCompletionRequestMessage], + functions: Optional[List[llama_types.ChatCompletionFunction]] = None, + function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[llama_types.ChatCompletionTool]] = None, + tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, **kwargs: Any, ) -> ChatFormatterResponse: def raise_exception(message: str): @@ -199,6 +203,10 @@ class Jinja2ChatFormatter(ChatFormatter): bos_token=self.bos_token, raise_exception=raise_exception, add_generation_prompt=self.add_generation_prompt, + functions=functions, + function_call=function_call, + tools=tools, + tool_choice=tool_choice, ) return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token]) @@ -288,6 +296,183 @@ def _convert_completion_to_chat( return _convert_text_completion_to_chat(completion) +def _convert_completion_to_chat_function( + tool_name: str, + completion_or_chunks: Union[ + llama_types.CreateCompletionResponse, + Iterator[llama_types.CreateCompletionStreamResponse], + ], + stream: bool, +): + if not stream: + completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore + assert "usage" in completion + tool_id = "call_" + "_0_" + tool_name + "_" + completion["id"] + # TODO: Fix for legacy function calls + chat_completion: llama_types.CreateChatCompletionResponse = { + "id": "chat" + completion["id"], + "object": "chat.completion", + "created": completion["created"], + "model": completion["model"], + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": None, + "function_call": { + "name": tool_name, + "arguments": completion["choices"][0]["text"], + }, + "tool_calls": [ + { + "id": tool_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": completion["choices"][0]["text"], + }, + } + ], + }, + "finish_reason": "tool_calls", + } + ], + "usage": completion["usage"], + } + return chat_completion + else: + chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore + + def _stream_response_to_function_stream( + chunks: Iterator[llama_types.CreateCompletionStreamResponse], + ) -> Iterator[llama_types.CreateChatCompletionStreamResponse]: + # blank first message + first = True + id_ = None + created = None + model = None + tool_id = None + for chunk in chunks: + if first: + id_ = "chat" + chunk["id"] + created = chunk["created"] + model = chunk["model"] + tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"] + yield { + "id": id_, + "object": "chat.completion.chunk", + "created": created, + "model": model, + "choices": [ + { + "index": 0, + "finish_reason": None, + "logprobs": None, + "delta": { + "role": "assistant", + "content": None, + "function_call": None, + "tool_calls": None, + }, + } + ], + } + yield { + "id": "chat" + chunk["id"], + "object": "chat.completion.chunk", + "created": chunk["created"], + "model": chunk["model"], + "choices": [ + { + "index": 0, + "finish_reason": 
None, + "logprobs": None, + "delta": { + "role": None, + "content": None, + "function_call": { + "name": tool_name, + "arguments": chunk["choices"][0]["text"], + }, + "tool_calls": [ + { + "index": 0, + "id": tool_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": "", + }, + } + ], + }, + } + ], + } + first = False + continue + assert tool_id is not None + yield { + "id": "chat" + chunk["id"], + "object": "chat.completion.chunk", + "created": chunk["created"], + "model": chunk["model"], + "choices": [ + { + "index": 0, + "finish_reason": None, + "logprobs": None, + "delta": { + "role": None, + "content": None, + "function_call": { + "name": tool_name, + "arguments": chunk["choices"][0]["text"], + }, + "tool_calls": [ + { + "index": 0, + "id": tool_id, + "type": "function", + "function": { + "name": tool_name, + "arguments": chunk["choices"][0][ + "text" + ], + }, + } + ], + }, + } + ], + } + + if id_ is not None and created is not None and model is not None: + yield { + "id": id_, + "object": "chat.completion.chunk", + "created": created, + "model": model, + "choices": [ + { + "index": 0, + "finish_reason": "tool_calls", + "logprobs": None, + "delta": { + "role": None, + "content": None, + "function_call": None, + "tool_calls": None, + }, + } + ], + } + + return _stream_response_to_function_stream(chunks) + + + def chat_formatter_to_chat_completion_handler( chat_formatter: ChatFormatter, ) -> LlamaChatCompletionHandler: @@ -331,6 +516,8 @@ def chat_formatter_to_chat_completion_handler( messages=messages, functions=functions, function_call=function_call, + tools=tools, + tool_choice=tool_choice, ) prompt = result.prompt if result.stop is not None: @@ -341,6 +528,47 @@ def chat_formatter_to_chat_completion_handler( if response_format is not None and response_format["type"] == "json_object": grammar = _grammar_for_response_format(response_format, verbose=llama.verbose) + # Convert legacy functions to tools + if functions is not None: + tools = [ + { + "type": "function", + "function": function, + } + for function in functions + ] + + # Convert legacy function_call to tool_choice + if function_call is not None: + if isinstance(function_call, str) and ( + function_call == "none" or function_call == "auto" + ): + tool_choice = function_call + if isinstance(function_call, dict) and "name" in function_call: + tool_choice = { + "type": "function", + "function": { + "name": function_call["name"], + }, + } + + tool = None + if tool_choice is not None and isinstance(tool_choice, dict) and tools is not None: + name = tool_choice["function"]["name"] + tool = next((t for t in tools if t["function"]["name"] == name), None) + if tool is None: + raise ValueError(f"Tool choice '{name}' not found in tools.") + schema = tool["function"]["parameters"] + try: + # create grammar from json schema + grammar = llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(schema), verbose=llama.verbose + ) + except Exception as e: + grammar = llama_grammar.LlamaGrammar.from_string( + llama_grammar.JSON_GBNF, verbose=llama.verbose + ) + completion_or_chunks = llama.create_completion( prompt=prompt, temperature=temperature, @@ -364,6 +592,11 @@ def chat_formatter_to_chat_completion_handler( grammar=grammar, logit_bias=logit_bias, ) + if tool is not None: + tool_name = tool["function"]["name"] + return _convert_completion_to_chat_function( + tool_name, completion_or_chunks, stream + ) return _convert_completion_to_chat(completion_or_chunks, stream=stream) return chat_completion_handler @@ 
-2198,181 +2431,6 @@ def chatml_function_calling( stream=stream, ) - def _convert_completion_to_chat_function( - tool_name: str, - completion_or_chunks: Union[ - llama_types.CreateCompletionResponse, - Iterator[llama_types.CreateCompletionStreamResponse], - ], - stream: bool, - ): - if not stream: - completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore - assert "usage" in completion - tool_id = "call_" + "_0_" + tool_name + "_" + completion["id"] - # TODO: Fix for legacy function calls - chat_completion: llama_types.CreateChatCompletionResponse = { - "id": "chat" + completion["id"], - "object": "chat.completion", - "created": completion["created"], - "model": completion["model"], - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": None, - "function_call": { - "name": tool_name, - "arguments": completion["choices"][0]["text"], - }, - "tool_calls": [ - { - "id": tool_id, - "type": "function", - "function": { - "name": tool_name, - "arguments": completion["choices"][0]["text"], - }, - } - ], - }, - "finish_reason": "tool_calls", - } - ], - "usage": completion["usage"], - } - return chat_completion - else: - chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore - - def _stream_response_to_function_stream( - chunks: Iterator[llama_types.CreateCompletionStreamResponse], - ) -> Iterator[llama_types.CreateChatCompletionStreamResponse]: - # blank first message - first = True - id_ = None - created = None - model = None - tool_id = None - for chunk in chunks: - if first: - id_ = "chat" + chunk["id"] - created = chunk["created"] - model = chunk["model"] - tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"] - yield { - "id": id_, - "object": "chat.completion.chunk", - "created": created, - "model": model, - "choices": [ - { - "index": 0, - "finish_reason": None, - "logprobs": None, - "delta": { - "role": "assistant", - "content": None, - "function_call": None, - "tool_calls": None, - }, - } - ], - } - yield { - "id": "chat" + chunk["id"], - "object": "chat.completion.chunk", - "created": chunk["created"], - "model": chunk["model"], - "choices": [ - { - "index": 0, - "finish_reason": None, - "logprobs": None, - "delta": { - "role": None, - "content": None, - "function_call": { - "name": tool_name, - "arguments": chunk["choices"][0]["text"], - }, - "tool_calls": [ - { - "index": 0, - "id": tool_id, - "type": "function", - "function": { - "name": tool_name, - "arguments": "", - }, - } - ], - }, - } - ], - } - first = False - continue - assert tool_id is not None - yield { - "id": "chat" + chunk["id"], - "object": "chat.completion.chunk", - "created": chunk["created"], - "model": chunk["model"], - "choices": [ - { - "index": 0, - "finish_reason": None, - "logprobs": None, - "delta": { - "role": None, - "content": None, - "function_call": { - "name": tool_name, - "arguments": chunk["choices"][0]["text"], - }, - "tool_calls": [ - { - "index": 0, - "id": tool_id, - "type": "function", - "function": { - "name": tool_name, - "arguments": chunk["choices"][0][ - "text" - ], - }, - } - ], - }, - } - ], - } - - if id_ is not None and created is not None and model is not None: - yield { - "id": id_, - "object": "chat.completion.chunk", - "created": created, - "model": model, - "choices": [ - { - "index": 0, - "finish_reason": "tool_calls", - "logprobs": None, - "delta": { - "role": None, - "content": None, - "function_call": None, - "tool_calls": None, - }, - } - ], - } - - return 
_stream_response_to_function_stream(chunks) - # Case 2: Tool choice by user if isinstance(tool_choice, dict): tool_name = tool_choice["function"]["name"] From f7decc956207f181710a43e2795f478ad2a5fc5e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 19 Mar 2024 10:52:53 -0400 Subject: [PATCH 06/11] docs: Add chat examples to openapi ui --- llama_cpp/server/app.py | 68 +++++++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 9 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index aa6afc1..2e1081e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -12,14 +12,7 @@ import llama_cpp import anyio from anyio.streams.memory import MemoryObjectSendStream from starlette.concurrency import run_in_threadpool, iterate_in_threadpool -from fastapi import ( - Depends, - FastAPI, - APIRouter, - Request, - HTTPException, - status, -) +from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body from fastapi.middleware import Middleware from fastapi.middleware.cors import CORSMiddleware from fastapi.security import HTTPBearer @@ -356,7 +349,64 @@ async def create_embedding( ) async def create_chat_completion( request: Request, - body: CreateChatCompletionRequest, + body: CreateChatCompletionRequest = Body( + openapi_examples={ + "normal": { + "summary": "Chat Completion", + "value": { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + ], + }, + }, + "json_mode": { + "summary": "JSON Mode", + "value": { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who won the world series in 2020"}, + ], + "response_format": { "type": "json_object" } + }, + }, + "tool_calling": { + "summary": "Tool Calling", + "value": { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Extract Jason is 30 years old."}, + ], + "tools": [ + { + "type": "function", + "function": { + "name": "User", + "description": "User record", + "parameters": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "number"}, + }, + "required": ["name", "age"], + }, + } + } + ], + "tool_choice": { + "type": "function", + "function": { + "name": "User", + } + } + }, + }, + } + ), llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> llama_cpp.ChatCompletion: exclude = { From 740f3f38125d1cc1bbe80f25944a292d0e966868 Mon Sep 17 00:00:00 2001 From: bretello Date: Wed, 20 Mar 2024 17:46:09 +0100 Subject: [PATCH 07/11] fix: set LLAMA_METAL_EMBED_LIBRARY=on on MacOS arm64 (#1289) --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b4df8ef..7415149 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,11 @@ if (LLAMA_BUILD) set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE) set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE) endif() + + if (APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") + set(LLAMA_METAL_EMBED_LIBRARY "On" CACHE BOOL "llama: embed metal library" FORCE) + endif() + add_subdirectory(vendor/llama.cpp) install( TARGETS llama From 3db03b73027036cf336fda2448894c36d3899cab Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 20 Mar 2024 13:27:43 -0400 Subject: [PATCH 08/11] feat: Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index b80cf3b..f9c7ba3 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit b80cf3b2d1dee0ad325f7a794fecc66befce7336 +Subproject commit f9c7ba34476ffc4f13ae2cdb1aec493a16eb8d47 From c89be28ef945017e8b64ea5f194ae7073fd872e8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 20 Mar 2024 20:50:47 -0400 Subject: [PATCH 09/11] feat: Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f9c7ba3..42e21c6 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f9c7ba34476ffc4f13ae2cdb1aec493a16eb8d47 +Subproject commit 42e21c68826f2e56b9592dccd9f3c43895b6890d From e325a831f015fdc807b436bc3c48e52cce658a18 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Mar 2024 23:43:29 -0400 Subject: [PATCH 10/11] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 64 ++++++++++++++++++++++++++++++++++++------ vendor/llama.cpp | 2 +- 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 6b5c1bc..1b8f6ca 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -668,13 +668,15 @@ It might not exist for progress report where '.' is output repeatedly.""" # // model quantization parameters # typedef struct llama_model_quantize_params { -# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() -# enum llama_ftype ftype; // quantize to this llama_ftype -# bool allow_requantize; // allow quantizing non-f32/f16 tensors -# bool quantize_output_tensor; // quantize output.weight -# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored -# bool pure; // quantize all tensors to the default type -# void * imatrix; // pointer to importance matrix data +# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() +# enum llama_ftype ftype; // quantize to this llama_ftype +# enum ggml_type output_tensor_type; // output tensor type +# enum ggml_type token_embedding_type; // itoken embeddings tensor type +# bool allow_requantize; // allow quantizing non-f32/f16 tensors +# bool quantize_output_tensor; // quantize output.weight +# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored +# bool pure; // quantize all tensors to the default type +# void * imatrix; // pointer to importance matrix data # } llama_model_quantize_params; class llama_model_quantize_params(ctypes.Structure): """Parameters for llama_model_quantize @@ -682,16 +684,20 @@ class llama_model_quantize_params(ctypes.Structure): Attributes: nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() ftype (int): quantize to this llama_ftype + output_tensor_type (int): output tensor type + token_embedding_type (int): itoken embeddings tensor type allow_requantize (bool): allow quantizing non-f32/f16 tensors quantize_output_tensor (bool): quantize output.weight only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored pure (bool): quantize all tensors to the default type - imatrix (ctypes.ctypes.c_void_p): pointer to importance matrix data + imatrix (ctypes.c_void_p): pointer to importance matrix data """ _fields_ = [ ("nthread", ctypes.c_int32), ("ftype", 
ctypes.c_int), + ("output_tensor_type", ctypes.c_int), + ("token_embedding_type", ctypes.c_int), ("allow_requantize", ctypes.c_bool), ("quantize_output_tensor", ctypes.c_bool), ("only_copy", ctypes.c_bool), @@ -2743,6 +2749,48 @@ def llama_beam_search( ): ... +# /// @details Build a split GGUF final path for this chunk. +# /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" +# // Returns the split_path length. +# LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count); +@ctypes_function( + "llama_split_path", + [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int], + ctypes.c_int, +) +def llama_split_path( + split_path: bytes, + maxlen: Union[ctypes.c_size_t, int], + path_prefix: bytes, + split_no: Union[ctypes.c_int, int], + split_count: Union[ctypes.c_int, int], + /, +) -> int: + """Build a split GGUF final path for this chunk.""" + ... + + +# /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. +# /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" +# // Returns the split_prefix length. +# LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count); +@ctypes_function( + "llama_split_prefix", + [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int], + ctypes.c_int, +) +def llama_split_prefix( + split_prefix: bytes, + maxlen: Union[ctypes.c_size_t, int], + split_path: bytes, + split_no: Union[ctypes.c_int, int], + split_count: Union[ctypes.c_int, int], + /, +) -> int: + """Extract the path prefix from the split_path if and only if the split_no and split_count match.""" + ... + + # Performance information diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 42e21c6..50ccaf5 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 42e21c68826f2e56b9592dccd9f3c43895b6890d +Subproject commit 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652 From c1325dcdfba7cb331b4c09d110001658b94ebb9f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Mar 2024 23:44:04 -0400 Subject: [PATCH 11/11] fix: tool_call missing first token. --- llama_cpp/llama_chat_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 5bda163..ccf4fd0 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -402,7 +402,7 @@ def _convert_completion_to_chat_function( "type": "function", "function": { "name": tool_name, - "arguments": "", + "arguments": chunk["choices"][0]["text"], }, } ],
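
Usage sketch illustrating the streaming fix in PATCH 11/11: when a tool call is streamed, every delta's arguments fragment, including the very first one, must be concatenated to recover the complete JSON arguments (before this fix the first fragment was emitted as an empty string, dropping the first generated token). The model path, chat format, and tool schema below are illustrative placeholders, not values taken from the patches:

    from llama_cpp import Llama

    # Placeholder model path; any model/chat format with tool-calling support works.
    llm = Llama(model_path="./model.gguf", chat_format="chatml-function-calling")

    tools = [{
        "type": "function",
        "function": {
            "name": "User",
            "description": "User record",
            "parameters": {
                "type": "object",
                "properties": {"name": {"type": "string"}, "age": {"type": "number"}},
                "required": ["name", "age"],
            },
        },
    }]

    stream = llm.create_chat_completion(
        messages=[{"role": "user", "content": "Extract Jason is 30 years old."}],
        tools=tools,
        tool_choice={"type": "function", "function": {"name": "User"}},
        stream=True,
    )

    arguments = ""
    for chunk in stream:
        delta = chunk["choices"][0]["delta"]
        tool_calls = delta.get("tool_calls")
        if tool_calls:
            # Before this fix the first fragment arrived as "", losing the first
            # token; after it, every fragment carries the generated text.
            arguments += tool_calls[0]["function"]["arguments"] or ""

    print(arguments)  # accumulated JSON arguments, e.g. {"name": "Jason", "age": 30}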