feat: Generic chatml Function Calling (#957)

* Add demo notebook
* Add initial chat handler
* Update OpenAI types
* Add generic chatml function calling (wip)
* Update chatml generic function calling.
* Progress on auto-tool calls
* fix streaming functions
* Remove print statements
* fix: Suppress output from llama.cpp init and grammar creation
* Add OpenAI v1 python api compatible chat completion function
* Support non-streaming multi-tool calls
* Format
* Include function_call in response.
2024-02-12 15:56:07 -05:00 · 2024-02-12 15:56:07 -05:00 · 153a0049d9
commit 153a0049d9
parent 69413ce08e
4 changed files with 1660 additions and 60 deletions
--- a/examples/notebooks/OpenHermesFunctionCalling.ipynb
+++ b/examples/notebooks/OpenHermesFunctionCalling.ipynb
@@ -0,0 +1,910 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\n",
+      "  \"name\": \"get_article_details\",\n",
+      "  \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
+      "  \"parameters\": {\n",
+      "    \"type\": \"object\",\n",
+      "    \"properties\": {\n",
+      "      \"title\": {\n",
+      "        \"type\": \"str\"\n",
+      "      },\n",
+      "      \"authors\": {\n",
+      "        \"type\": \"list[str]\"\n",
+      "      },\n",
+      "      \"short_summary\": {\n",
+      "        \"type\": \"str\"\n",
+      "      },\n",
+      "      \"date_published\": {\n",
+      "        \"type\": \"str\"\n",
+      "      },\n",
+      "      \"tags\": {\n",
+      "        \"type\": \"list[str]\"\n",
+      "      }\n",
+      "    }\n",
+      "  },\n",
+      "  \"returns\": \"Article\"\n",
+      "}\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import inspect\n",
+    "from typing import get_type_hints\n",
+    "\n",
+    "class Article:\n",
+    "    pass\n",
+    "\n",
+    "class Weather:\n",
+    "    pass\n",
+    "\n",
+    "class Directions:\n",
+    "    pass\n",
+    "\n",
+    "def calculate_mortgage_payment(loan_amount: int, interest_rate: float, loan_term: int) -> float:\n",
+    "    \"\"\"Get the monthly mortgage payment given an interest rate percentage.\"\"\"\n",
+    "    \n",
+    "    # TODO: you must implement this to actually call it later\n",
+    "    pass\n",
+    "\n",
+    "def get_article_details(title: str, authors: list[str], short_summary: str, date_published: str, tags: list[str]) -> Article:\n",
+    "    '''Get article details from unstructured article text.\n",
+    "date_published: formatted as \"MM/DD/YYYY\"'''\n",
+    "    \n",
+    "    # TODO: you must implement this to actually call it later\n",
+    "    pass\n",
+    "\n",
+    "def get_weather(zip_code: str) -> Weather:\n",
+    "    \"\"\"Get the current weather given a zip code.\"\"\"\n",
+    "    \n",
+    "    # TODO: you must implement this to actually call it later\n",
+    "    pass\n",
+    "\n",
+    "def get_directions(start: str, destination: str) -> Directions:\n",
+    "    \"\"\"Get directions from Google Directions API.\n",
+    "start: start address as a string including zipcode (if any)\n",
+    "destination: end address as a string including zipcode (if any)\"\"\"\n",
+    "    \n",
+    "    # TODO: you must implement this to actually call it later\n",
+    "    pass\n",
+    "\n",
+    "def get_type_name(t):\n",
+    "    name = str(t)\n",
+    "    if \"list\" in name or \"dict\" in name:\n",
+    "        return name\n",
+    "    else:\n",
+    "        return t.__name__\n",
+    "\n",
+    "def serialize_function_to_json(func):\n",
+    "    signature = inspect.signature(func)\n",
+    "    type_hints = get_type_hints(func)\n",
+    "\n",
+    "    function_info = {\n",
+    "        \"name\": func.__name__,\n",
+    "        \"description\": func.__doc__,\n",
+    "        \"parameters\": {\n",
+    "            \"type\": \"object\",\n",
+    "            \"properties\": {}\n",
+    "        },\n",
+    "        \"returns\": getattr(type_hints.get('return'), '__name__', 'void')\n",
+    "    }\n",
+    "\n",
+    "    for name, _ in signature.parameters.items():\n",
+    "        param_type = get_type_name(type_hints.get(name, type(None)))\n",
+    "        function_info[\"parameters\"][\"properties\"][name] = {\"type\": param_type}\n",
+    "\n",
+    "    return json.dumps(function_info, indent=2)\n",
+    "\n",
+    "print(serialize_function_to_json(get_article_details))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import xml.etree.ElementTree as ET\n",
+    "import re\n",
+    "\n",
+    "def extract_function_calls(completion):\n",
+    "    completion = completion.strip()\n",
+    "    pattern = r\"(<multiplefunctions>(.*?)</multiplefunctions>)\"\n",
+    "    match = re.search(pattern, completion, re.DOTALL)\n",
+    "    if not match:\n",
+    "        return None\n",
+    "    \n",
+    "    multiplefn = match.group(1)\n",
+    "    root = ET.fromstring(multiplefn)\n",
+    "    functions = root.findall(\"functioncall\")\n",
+    "    return [json.loads(fn.text) for fn in functions]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_hermes_prompt(prompt, functions):\n",
+    "    functions = \"\\n\\n\".join([serialize_function_to_json(fn) for fn in functions])\n",
+    "    prompt = f\"\"\"<|im_start|>system\n",
+    "You are a helpful assistant with access to the following functions:\n",
+    "\n",
+    "{functions}\n",
+    "\n",
+    "To use these functions respond with:\n",
+    "<multiplefunctions>\n",
+    "    <functioncall> {{\"name\": \"function_name\", \"arguments\": {{\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}}}} </functioncall>\n",
+    "    <functioncall> {{\"name\": \"function_name\", \"arguments\": {{\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}}}} </functioncall>\n",
+    "    ...\n",
+    "</multiplefunctions>\n",
+    "\n",
+    "Edge cases you must handle:\n",
+    "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
+    "<|im_start|>user\n",
+    "{prompt}<|im_end|>\n",
+    "<|im_start|>assistant\"\"\"\n",
+    "    return prompt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<|im_start|>system\n",
+      "You are a helpful assistant with access to the following functions:\n",
+      "\n",
+      "{\n",
+      "  \"name\": \"get_weather\",\n",
+      "  \"description\": \"Get the current weather given a zip code.\",\n",
+      "  \"parameters\": {\n",
+      "    \"type\": \"object\",\n",
+      "    \"properties\": {\n",
+      "      \"zip_code\": {\n",
+      "        \"type\": \"str\"\n",
+      "      }\n",
+      "    }\n",
+      "  },\n",
+      "  \"returns\": \"Weather\"\n",
+      "}\n",
+      "\n",
+      "{\n",
+      "  \"name\": \"calculate_mortgage_payment\",\n",
+      "  \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n",
+      "  \"parameters\": {\n",
+      "    \"type\": \"object\",\n",
+      "    \"properties\": {\n",
+      "      \"loan_amount\": {\n",
+      "        \"type\": \"int\"\n",
+      "      },\n",
+      "      \"interest_rate\": {\n",
+      "        \"type\": \"float\"\n",
+      "      },\n",
+      "      \"loan_term\": {\n",
+      "        \"type\": \"int\"\n",
+      "      }\n",
+      "    }\n",
+      "  },\n",
+      "  \"returns\": \"float\"\n",
+      "}\n",
+      "\n",
+      "{\n",
+      "  \"name\": \"get_article_details\",\n",
+      "  \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
+      "  \"parameters\": {\n",
+      "    \"type\": \"object\",\n",
+      "    \"properties\": {\n",
+      "      \"title\": {\n",
+      "        \"type\": \"str\"\n",
+      "      },\n",
+      "      \"authors\": {\n",
+      "        \"type\": \"list[str]\"\n",
+      "      },\n",
+      "      \"short_summary\": {\n",
+      "        \"type\": \"str\"\n",
+      "      },\n",
+      "      \"date_published\": {\n",
+      "        \"type\": \"str\"\n",
+      "      },\n",
+      "      \"tags\": {\n",
+      "        \"type\": \"list[str]\"\n",
+      "      }\n",
+      "    }\n",
+      "  },\n",
+      "  \"returns\": \"Article\"\n",
+      "}\n",
+      "\n",
+      "To use these functions respond with:\n",
+      "<multiplefunctions>\n",
+      "    <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
+      "    <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
+      "    ...\n",
+      "</multiplefunctions>\n",
+      "\n",
+      "Edge cases you must handle:\n",
+      "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
+      "<|im_start|>user\n",
+      "What's the weather in 10001?<|im_end|>\n",
+      "<|im_start|>assistant\n",
+      "<|im_start|>system\n",
+      "You are a helpful assistant with access to the following functions:\n",
+      "\n",
+      "{\n",
+      "  \"name\": \"get_weather\",\n",
+      "  \"description\": \"Get the current weather given a zip code.\",\n",
+      "  \"parameters\": {\n",
+      "    \"type\": \"object\",\n",
+      "    \"properties\": {\n",
+      "      \"zip_code\": {\n",
+      "        \"type\": \"str\"\n",
+      "      }\n",
+      "    }\n",
+      "  },\n",
+      "  \"returns\": \"Weather\"\n",
+      "}\n",
+      "\n",
+      "{\n",
+      "  \"name\": \"calculate_mortgage_payment\",\n",
+      "  \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n",
+      "  \"parameters\": {\n",
+      "    \"type\": \"object\",\n",
+      "    \"properties\": {\n",
+      "      \"loan_amount\": {\n",
+      "        \"type\": \"int\"\n",
+      "      },\n",
+      "      \"interest_rate\": {\n",
+      "        \"type\": \"float\"\n",
+      "      },\n",
+      "      \"loan_term\": {\n",
+      "        \"type\": \"int\"\n",
+      "      }\n",
+      "    }\n",
+      "  },\n",
+      "  \"returns\": \"float\"\n",
+      "}\n",
+      "\n",
+      "{\n",
+      "  \"name\": \"get_article_details\",\n",
+      "  \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
+      "  \"parameters\": {\n",
+      "    \"type\": \"object\",\n",
+      "    \"properties\": {\n",
+      "      \"title\": {\n",
+      "        \"type\": \"str\"\n",
+      "      },\n",
+      "      \"authors\": {\n",
+      "        \"type\": \"list[str]\"\n",
+      "      },\n",
+      "      \"short_summary\": {\n",
+      "        \"type\": \"str\"\n",
+      "      },\n",
+      "      \"date_published\": {\n",
+      "        \"type\": \"str\"\n",
+      "      },\n",
+      "      \"tags\": {\n",
+      "        \"type\": \"list[str]\"\n",
+      "      }\n",
+      "    }\n",
+      "  },\n",
+      "  \"returns\": \"Article\"\n",
+      "}\n",
+      "\n",
+      "To use these functions respond with:\n",
+      "<multiplefunctions>\n",
+      "    <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
+      "    <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
+      "    ...\n",
+      "</multiplefunctions>\n",
+      "\n",
+      "Edge cases you must handle:\n",
+      "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
+      "<|im_start|>user\n",
+      "Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.<|im_end|>\n",
+      "<|im_start|>assistant\n",
+      "<|im_start|>system\n",
+      "You are a helpful assistant with access to the following functions:\n",
+      "\n",
+      "{\n",
+      "  \"name\": \"get_weather\",\n",
+      "  \"description\": \"Get the current weather given a zip code.\",\n",
+      "  \"parameters\": {\n",
+      "    \"type\": \"object\",\n",
+      "    \"properties\": {\n",
+      "      \"zip_code\": {\n",
+      "        \"type\": \"str\"\n",
+      "      }\n",
+      "    }\n",
+      "  },\n",
+      "  \"returns\": \"Weather\"\n",
+      "}\n",
+      "\n",
+      "{\n",
+      "  \"name\": \"calculate_mortgage_payment\",\n",
+      "  \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n",
+      "  \"parameters\": {\n",
+      "    \"type\": \"object\",\n",
+      "    \"properties\": {\n",
+      "      \"loan_amount\": {\n",
+      "        \"type\": \"int\"\n",
+      "      },\n",
+      "      \"interest_rate\": {\n",
+      "        \"type\": \"float\"\n",
+      "      },\n",
+      "      \"loan_term\": {\n",
+      "        \"type\": \"int\"\n",
+      "      }\n",
+      "    }\n",
+      "  },\n",
+      "  \"returns\": \"float\"\n",
+      "}\n",
+      "\n",
+      "{\n",
+      "  \"name\": \"get_article_details\",\n",
+      "  \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
+      "  \"parameters\": {\n",
+      "    \"type\": \"object\",\n",
+      "    \"properties\": {\n",
+      "      \"title\": {\n",
+      "        \"type\": \"str\"\n",
+      "      },\n",
+      "      \"authors\": {\n",
+      "        \"type\": \"list[str]\"\n",
+      "      },\n",
+      "      \"short_summary\": {\n",
+      "        \"type\": \"str\"\n",
+      "      },\n",
+      "      \"date_published\": {\n",
+      "        \"type\": \"str\"\n",
+      "      },\n",
+      "      \"tags\": {\n",
+      "        \"type\": \"list[str]\"\n",
+      "      }\n",
+      "    }\n",
+      "  },\n",
+      "  \"returns\": \"Article\"\n",
+      "}\n",
+      "\n",
+      "To use these functions respond with:\n",
+      "<multiplefunctions>\n",
+      "    <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
+      "    <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
+      "    ...\n",
+      "</multiplefunctions>\n",
+      "\n",
+      "Edge cases you must handle:\n",
+      "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
+      "<|im_start|>user\n",
+      "What's the current exchange rate for USD to EUR?<|im_end|>\n",
+      "<|im_start|>assistant\n"
+     ]
+    }
+   ],
+   "source": [
+    "prompts = [\n",
+    "    \"What's the weather in 10001?\",\n",
+    "    \"Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.\",\n",
+    "    \"What's the current exchange rate for USD to EUR?\"\n",
+    "]\n",
+    "functions = [get_weather, calculate_mortgage_payment, get_article_details]\n",
+    "\n",
+    "for prompt in prompts:\n",
+    "    print(generate_hermes_prompt(prompt, functions))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no\n",
+      "ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes\n",
+      "ggml_init_cublas: found 1 CUDA devices:\n",
+      "  Device 0: NVIDIA GeForce RTX 2060, compute capability 7.5\n",
+      "llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from ../../models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf (version GGUF V3 (latest))\n",
+      "llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32002,     1,     1 ]\n",
+      "llama_model_loader: - tensor    1:              blk.0.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor    2:              blk.0.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor    4:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor    7:            blk.0.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor    8:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor    9:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   10:              blk.1.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   11:              blk.1.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   12:              blk.1.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   13:         blk.1.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   14:            blk.1.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   15:              blk.1.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   16:            blk.1.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   17:           blk.1.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   18:            blk.1.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   19:              blk.2.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   20:              blk.2.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   21:              blk.2.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   22:         blk.2.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   23:            blk.2.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   24:              blk.2.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   25:            blk.2.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   26:           blk.2.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   27:            blk.2.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   28:              blk.3.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   29:              blk.3.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   30:              blk.3.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   31:         blk.3.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   32:            blk.3.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   33:              blk.3.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   34:            blk.3.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   35:           blk.3.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   36:            blk.3.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   37:              blk.4.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   38:              blk.4.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   39:              blk.4.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   40:         blk.4.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   41:            blk.4.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   42:              blk.4.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   43:            blk.4.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   44:           blk.4.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   45:            blk.4.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   46:              blk.5.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   47:              blk.5.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   48:              blk.5.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   49:         blk.5.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   50:            blk.5.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   51:              blk.5.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   52:            blk.5.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   53:           blk.5.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   54:            blk.5.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   55:              blk.6.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   56:              blk.6.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   57:              blk.6.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   58:         blk.6.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   59:            blk.6.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   60:              blk.6.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   61:            blk.6.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   62:           blk.6.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   63:            blk.6.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   64:              blk.7.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   65:              blk.7.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   66:              blk.7.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   67:         blk.7.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   68:            blk.7.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   69:              blk.7.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   70:            blk.7.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   71:           blk.7.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   72:            blk.7.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   73:              blk.8.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   74:              blk.8.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   75:              blk.8.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   76:         blk.8.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   77:            blk.8.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   78:              blk.8.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   79:            blk.8.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   80:           blk.8.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   81:            blk.8.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   82:              blk.9.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   83:              blk.9.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   84:              blk.9.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   85:         blk.9.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   86:            blk.9.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   87:              blk.9.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   88:            blk.9.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   89:           blk.9.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   90:            blk.9.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   91:             blk.10.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   92:             blk.10.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   93:             blk.10.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor   94:        blk.10.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   95:           blk.10.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   96:             blk.10.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor   97:           blk.10.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor   98:          blk.10.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor   99:           blk.10.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  100:             blk.11.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  101:             blk.11.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  102:             blk.11.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  103:        blk.11.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  104:           blk.11.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  105:             blk.11.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  106:           blk.11.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  107:          blk.11.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  108:           blk.11.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  109:             blk.12.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  110:             blk.12.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  111:             blk.12.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  112:        blk.12.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  113:           blk.12.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  114:             blk.12.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  115:           blk.12.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  116:          blk.12.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  117:           blk.12.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  118:             blk.13.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  119:             blk.13.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  120:             blk.13.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  121:        blk.13.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  122:           blk.13.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  123:             blk.13.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  124:           blk.13.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  125:          blk.13.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  126:           blk.13.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  127:             blk.14.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  128:             blk.14.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  129:             blk.14.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  130:        blk.14.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  131:           blk.14.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  132:             blk.14.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  133:           blk.14.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  134:          blk.14.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  135:           blk.14.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  136:             blk.15.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  137:             blk.15.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  138:             blk.15.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  139:        blk.15.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  140:           blk.15.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  141:             blk.15.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  142:           blk.15.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  143:          blk.15.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  144:           blk.15.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  145:             blk.16.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  146:             blk.16.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  147:             blk.16.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  148:        blk.16.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  149:           blk.16.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  150:             blk.16.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  151:           blk.16.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  152:          blk.16.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  153:           blk.16.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  154:             blk.17.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  155:             blk.17.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  156:             blk.17.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  157:        blk.17.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  158:           blk.17.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  159:             blk.17.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  160:           blk.17.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  161:          blk.17.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  162:           blk.17.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  163:             blk.18.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  164:             blk.18.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  165:             blk.18.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  166:        blk.18.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  167:           blk.18.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  168:             blk.18.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  169:           blk.18.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  170:          blk.18.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  171:           blk.18.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  172:             blk.19.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  173:             blk.19.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  174:             blk.19.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  175:        blk.19.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  176:           blk.19.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  177:             blk.19.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  178:           blk.19.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  179:          blk.19.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  180:           blk.19.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  181:             blk.20.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  182:             blk.20.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  183:             blk.20.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  184:        blk.20.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  185:           blk.20.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  186:             blk.20.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  187:           blk.20.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  188:          blk.20.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  189:           blk.20.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  190:             blk.21.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  191:             blk.21.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  192:             blk.21.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  193:        blk.21.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  194:           blk.21.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  195:             blk.21.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  196:           blk.21.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  197:          blk.21.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  198:           blk.21.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  199:             blk.22.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  200:             blk.22.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  201:             blk.22.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  202:        blk.22.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  203:           blk.22.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  204:             blk.22.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  205:           blk.22.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  206:          blk.22.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  207:           blk.22.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  208:             blk.23.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  209:             blk.23.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  210:             blk.23.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  211:        blk.23.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  212:           blk.23.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  213:             blk.23.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  214:           blk.23.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  215:          blk.23.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  216:           blk.23.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  217:             blk.24.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  218:             blk.24.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  219:             blk.24.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  220:        blk.24.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  221:           blk.24.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  222:             blk.24.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  223:           blk.24.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  224:          blk.24.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  225:           blk.24.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  226:             blk.25.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  227:             blk.25.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  228:             blk.25.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  229:        blk.25.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  230:           blk.25.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  231:             blk.25.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  232:           blk.25.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  233:          blk.25.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  234:           blk.25.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  235:             blk.26.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  236:             blk.26.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  237:             blk.26.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  238:        blk.26.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  239:           blk.26.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  240:             blk.26.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  241:           blk.26.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  242:          blk.26.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  243:           blk.26.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  244:             blk.27.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  245:             blk.27.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  246:             blk.27.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  247:        blk.27.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  248:           blk.27.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  249:             blk.27.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  250:           blk.27.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  251:          blk.27.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  252:           blk.27.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  253:             blk.28.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  254:             blk.28.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  255:             blk.28.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  256:        blk.28.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  257:           blk.28.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  258:             blk.28.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  259:           blk.28.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  260:          blk.28.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  261:           blk.28.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  262:             blk.29.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  263:             blk.29.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  264:             blk.29.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  265:        blk.29.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  266:           blk.29.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  267:             blk.29.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  268:           blk.29.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  269:          blk.29.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  270:           blk.29.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  271:             blk.30.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  272:             blk.30.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  273:             blk.30.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  274:        blk.30.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  275:           blk.30.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  276:             blk.30.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  277:           blk.30.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  278:          blk.30.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  279:           blk.30.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  280:             blk.31.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  281:             blk.31.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  282:             blk.31.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
+      "llama_model_loader: - tensor  283:        blk.31.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  284:           blk.31.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  285:             blk.31.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
+      "llama_model_loader: - tensor  286:           blk.31.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
+      "llama_model_loader: - tensor  287:          blk.31.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  288:           blk.31.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  289:               output_norm.weight f32      [  4096,     1,     1,     1 ]\n",
+      "llama_model_loader: - tensor  290:                    output.weight q6_K     [  4096, 32002,     1,     1 ]\n",
+      "llama_model_loader: - kv   0:                       general.architecture str              = llama\n",
+      "llama_model_loader: - kv   1:                               general.name str              = teknium_openhermes-2.5-mistral-7b\n",
+      "llama_model_loader: - kv   2:                       llama.context_length u32              = 32768\n",
+      "llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096\n",
+      "llama_model_loader: - kv   4:                          llama.block_count u32              = 32\n",
+      "llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336\n",
+      "llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128\n",
+      "llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 32\n",
+      "llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 8\n",
+      "llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010\n",
+      "llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 10000.000000\n",
+      "llama_model_loader: - kv  11:                          general.file_type u32              = 15\n",
+      "llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama\n",
+      "llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32002]   = [\"<unk>\", \"<s>\", \"</s>\", \"<0x00>\", \"<...\n",
+      "llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32002]   = [0.000000, 0.000000, 0.000000, 0.0000...\n",
+      "llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32002]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n",
+      "llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1\n",
+      "llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 32000\n",
+      "llama_model_loader: - kv  18:            tokenizer.ggml.padding_token_id u32              = 0\n",
+      "llama_model_loader: - kv  19:               general.quantization_version u32              = 2\n",
+      "llama_model_loader: - type  f32:   65 tensors\n",
+      "llama_model_loader: - type q4_K:  193 tensors\n",
+      "llama_model_loader: - type q6_K:   33 tensors\n",
+      "llm_load_vocab: special tokens definition check successful ( 261/32002 ).\n",
+      "llm_load_print_meta: format           = GGUF V3 (latest)\n",
+      "llm_load_print_meta: arch             = llama\n",
+      "llm_load_print_meta: vocab type       = SPM\n",
+      "llm_load_print_meta: n_vocab          = 32002\n",
+      "llm_load_print_meta: n_merges         = 0\n",
+      "llm_load_print_meta: n_ctx_train      = 32768\n",
+      "llm_load_print_meta: n_embd           = 4096\n",
+      "llm_load_print_meta: n_head           = 32\n",
+      "llm_load_print_meta: n_head_kv        = 8\n",
+      "llm_load_print_meta: n_layer          = 32\n",
+      "llm_load_print_meta: n_rot            = 128\n",
+      "llm_load_print_meta: n_gqa            = 4\n",
+      "llm_load_print_meta: f_norm_eps       = 0.0e+00\n",
+      "llm_load_print_meta: f_norm_rms_eps   = 1.0e-05\n",
+      "llm_load_print_meta: f_clamp_kqv      = 0.0e+00\n",
+      "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
+      "llm_load_print_meta: n_ff             = 14336\n",
+      "llm_load_print_meta: rope scaling     = linear\n",
+      "llm_load_print_meta: freq_base_train  = 10000.0\n",
+      "llm_load_print_meta: freq_scale_train = 1\n",
+      "llm_load_print_meta: n_yarn_orig_ctx  = 32768\n",
+      "llm_load_print_meta: rope_finetuned   = unknown\n",
+      "llm_load_print_meta: model type       = 7B\n",
+      "llm_load_print_meta: model ftype      = mostly Q4_K - Medium\n",
+      "llm_load_print_meta: model params     = 7.24 B\n",
+      "llm_load_print_meta: model size       = 4.07 GiB (4.83 BPW) \n",
+      "llm_load_print_meta: general.name   = teknium_openhermes-2.5-mistral-7b\n",
+      "llm_load_print_meta: BOS token = 1 '<s>'\n",
+      "llm_load_print_meta: EOS token = 32000 '<|im_end|>'\n",
+      "llm_load_print_meta: UNK token = 0 '<unk>'\n",
+      "llm_load_print_meta: PAD token = 0 '<unk>'\n",
+      "llm_load_print_meta: LF token  = 13 '<0x0A>'\n",
+      "llm_load_tensors: ggml ctx size =    0.11 MiB\n",
+      "llm_load_tensors: using CUDA for GPU acceleration\n",
+      "llm_load_tensors: mem required  =   70.42 MiB\n",
+      "llm_load_tensors: offloading 32 repeating layers to GPU\n",
+      "llm_load_tensors: offloading non-repeating layers to GPU\n",
+      "llm_load_tensors: offloaded 35/35 layers to GPU\n",
+      "llm_load_tensors: VRAM used: 4095.06 MiB\n",
+      "...............................................................................................\n",
+      "llama_new_context_with_model: n_ctx      = 2048\n",
+      "llama_new_context_with_model: freq_base  = 10000.0\n",
+      "llama_new_context_with_model: freq_scale = 1\n",
+      "llama_kv_cache_init: offloading v cache to GPU\n",
+      "llama_kv_cache_init: offloading k cache to GPU\n",
+      "llama_kv_cache_init: VRAM kv self = 256.00 MiB\n",
+      "llama_new_context_with_model: kv self size  =  256.00 MiB\n",
+      "llama_build_graph: non-view tensors processed: 740/740\n",
+      "llama_new_context_with_model: compute buffer total size = 159.07 MiB\n",
+      "llama_new_context_with_model: VRAM scratch buffer: 156.00 MiB\n",
+      "llama_new_context_with_model: total VRAM used: 4507.07 MiB (model: 4095.06 MiB, context: 412.00 MiB)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import llama_cpp\n",
+    "\n",
+    "llama = llama_cpp.Llama(model_path=\"../../models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf\", n_gpu_layers=-1, n_ctx=2048, verbose=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[{'name': 'get_weather', 'arguments': {'zip_code': '10001'}}]\n",
+      "====================================================================================================\n",
+      "[{'name': 'calculate_mortgage_payment', 'arguments': {'loan_amount': 200000, 'interest_rate': 0.04, 'loan_term': 30}}]\n",
+      "====================================================================================================\n",
+      "Unfortunately, I do not have a built-in function to check currency exchange rates. However, you can use third-party APIs or websites like Google Finance or XE to get this information.\n",
+      "====================================================================================================\n"
+     ]
+    }
+   ],
+   "source": [
+    "prompts = [\n",
+    "    \"What's the weather in 10001?\",\n",
+    "    \"Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.\",\n",
+    "    \"What's the current exchange rate for USD to EUR?\"\n",
+    "]\n",
+    "functions = [get_weather, calculate_mortgage_payment, get_article_details]\n",
+    "\n",
+    "for prompt in prompts:\n",
+    "    prompt = generate_hermes_prompt(prompt, functions)\n",
+    "    completion = llama.create_completion(prompt, max_tokens=-1)[\"choices\"][0][\"text\"]\n",
+    "    function_calls = extract_function_calls(completion)\n",
+    "    if function_calls:\n",
+    "        print(function_calls)\n",
+    "    else:\n",
+    "        print(completion.strip())\n",
+    "    print(\"=\"*100)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "get_weather\n",
+      "{'zip_code': '05751'}\n",
+      "====================================================================================================\n",
+      "get_weather\n",
+      "{'zip_code': '05751'}\n",
+      "get_weather\n",
+      "{'zip_code': '07030'}\n",
+      "calculate_mortgage_payment\n",
+      "{'loan_amount': 250000, 'interest_rate': 4.18, 'loan_term': 30}\n",
+      "====================================================================================================\n",
+      "I don't have a function to get exchange rates, but I can provide some resources where you can find this information. You can check websites like Google Finance, XE.com, or Yahoo Finance for up-to-date currency exchange rates.\n",
+      "====================================================================================================\n"
+     ]
+    }
+   ],
+   "source": [
+    "prompts = [\n",
+    "    \"What's the weather in 05751?\",\n",
+    "    \"I'm planning a trip to Killington, Vermont (05751) from Hoboken, NJ (07030). Can you get me weather for both locations and directions?\",\n",
+    "    \"What's the current exchange rate for USD to EUR?\"\n",
+    "]\n",
+    "\n",
+    "for prompt in prompts:\n",
+    "    completion = llama.create_completion(generate_hermes_prompt(prompt, functions), max_tokens=-1)[\"choices\"][0][\"text\"]\n",
+    "    function_calls = extract_function_calls(completion)\n",
+    "\n",
+    "    if function_calls:\n",
+    "        for function in function_calls:\n",
+    "            print(function[\"name\"])\n",
+    "            print(function[\"arguments\"])\n",
+    "    else:\n",
+    "        print(completion.strip())\n",
+    "\n",
+    "    print(\"=\"*100)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5+"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@ -50,6 +50,9 @@ from ._internals import (
    _LlamaSamplingContext,  # type: ignore
 )
 from ._logger import set_verbose
+from ._utils import (
+    suppress_stdout_stderr
+)


 class Llama:
@ -182,6 +185,7 @@ class Llama:

        self.numa = numa
        if not Llama.__backend_initialized:
+            with suppress_stdout_stderr(disable=verbose):
                llama_cpp.llama_backend_init(self.numa)
            Llama.__backend_initialized = True

@ -1567,6 +1571,38 @@ class Llama:
            logit_bias=logit_bias,
        )

+    def create_chat_completion_openai_v1(
+        self,
+        *args: Any,
+        **kwargs: Any,
+    ):
+        """Generate a chat completion typed like the OpenAI v1 python API.
+
+        Requires the `openai` package (`pip install openai`).
+
+        Args:
+            *args: Positional arguments to pass to create_chat_completion.
+            **kwargs: Keyword arguments to pass to create_chat_completion.
+
+        Returns:
+            A ChatCompletion, or a generator of ChatCompletionChunk when `stream=True` is given as a keyword.
+
+        Raises:
+            ImportError: If the openai package cannot be imported.
+        """
+        try:  # guard only the import so ImportErrors raised during completion are not relabelled
+            from openai.types.chat import ChatCompletion, ChatCompletionChunk
+        except ImportError as err:
+            raise ImportError(
+                "To use create_chat_completion_openai_v1, you must install the openai package. "
+                "You can install it with `pip install openai`."
+            ) from err
+        stream = kwargs.get("stream", False)  # type: ignore
+        assert isinstance(stream, bool)
+        if stream:
+            return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs))  # type: ignore
+        return ChatCompletion(**self.create_chat_completion(*args, **kwargs))  # type: ignore
+
    def __getstate__(self):
        return dict(
            model_path=self.model_path,
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@ -31,6 +31,7 @@ MISTRAL_INSTRUCT_EOS_TOKEN = "</s>"

 ### Chat Completion Handler ###

+
 class LlamaChatCompletionHandler(Protocol):
    """Base Protocol for a llama chat completion handler.

@ -77,8 +78,7 @@ class LlamaChatCompletionHandler(Protocol):
    ) -> Union[
        llama_types.CreateChatCompletionResponse,
        Iterator[llama_types.CreateChatCompletionStreamResponse],
-    ]:
-        ...
+    ]: ...


 class LlamaChatCompletionHandlerNotFoundException(Exception):
@ -134,6 +134,7 @@ def register_chat_completion_handler(name: str):

 ### Chat Formatter ###

+
@dataclasses.dataclass
 class ChatFormatterResponse:
    """Dataclass that stores completion parameters for a given chat format and
@ -157,8 +158,7 @@ class ChatFormatter(Protocol):
        *,
        messages: List[llama_types.ChatCompletionRequestMessage],
        **kwargs: Any,
-    ) -> ChatFormatterResponse:
-        ...
+    ) -> ChatFormatterResponse: ...


 class Jinja2ChatFormatter(ChatFormatter):
@ -195,7 +195,7 @@ class Jinja2ChatFormatter(ChatFormatter):
            eos_token=self.eos_token,
            bos_token=self.bos_token,
            raise_exception=raise_exception,
-            add_generation_prompt=self.add_generation_prompt
+            add_generation_prompt=self.add_generation_prompt,
        )

        return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token])
@ -255,11 +255,13 @@ def _convert_text_completion_chunks_to_chat(
            "choices": [
                {
                    "index": 0,
-                    "delta": {
+                    "delta": (
+                        {
                            "content": chunk["choices"][0]["text"],
                        }
                        if chunk["choices"][0]["finish_reason"] is None
-                    else {},
+                        else {}
+                    ),
                    "finish_reason": chunk["choices"][0]["finish_reason"],
                }
            ],
@ -338,10 +340,12 @@ def chat_formatter_to_chat_completion_handler(
                # create grammar from json schema
                if "schema" in response_format:
                    grammar = llama_grammar.LlamaGrammar.from_json_schema(
-                        json.dumps(response_format["schema"])
+                        json.dumps(response_format["schema"]), verbose=llama.verbose
                    )
            except Exception as e:
-                grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
+                grammar = llama_grammar.LlamaGrammar.from_string(
+                    llama_grammar.JSON_GBNF, verbose=llama.verbose
+                )

        completion_or_chunks = llama.create_completion(
            prompt=prompt,
@ -452,7 +456,9 @@ def hf_tokenizer_config_to_chat_completion_handler(
    tokenizer_config: Dict[str, Any],
    add_generation_prompt: bool = True,
 ) -> LlamaChatCompletionHandler:
-    chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config, add_generation_prompt=add_generation_prompt)
+    chat_formatter = hf_tokenizer_config_to_chat_formatter(
+        tokenizer_config, add_generation_prompt=add_generation_prompt
+    )
    return chat_formatter_to_chat_completion_handler(chat_formatter)


@ -468,6 +474,7 @@ def guess_chat_format_from_gguf_metadata(metadata: Dict[str, str]) -> Optional[s

    return None

+
 ### Utility functions for formatting chat prompts ###
 # TODO: Replace these with jinja2 templates

@ -916,9 +923,17 @@ def format_mistral_instruct(
    stop = eos
    prompt = bos
    for message in messages:
-        if message["role"] == "user" and message["content"] is not None and isinstance(message["content"], str):
+        if (
+            message["role"] == "user"
+            and message["content"] is not None
+            and isinstance(message["content"], str)
+        ):
            prompt += "[INST] " + message["content"]
-        elif message["role"] == "assistant" and message["content"] is not None and isinstance(message["content"], str):
+        elif (
+            message["role"] == "assistant"
+            and message["content"] is not None
+            and isinstance(message["content"], str)
+        ):
            prompt += " [/INST]" + message["content"] + eos
    prompt += " [/INST]"
    return ChatFormatterResponse(prompt=prompt, stop=stop)
@ -958,6 +973,7 @@ def format_openchat(
    _prompt = _format_chatml(system_message, _messages, _sep)
    return ChatFormatterResponse(prompt=_prompt, stop=_sep)

+
 # Chat format for Saiga models, see more details and available models:
 # https://huggingface.co/collections/IlyaGusev/saiga2-saigamistral-6505d4ccc3d1e53166b636cd
@register_chat_format("saiga")
@ -979,8 +995,10 @@ def format_saiga(
    _prompt += "<s>bot"
    return ChatFormatterResponse(prompt=_prompt.strip())

+
 # Tricky chat formats that require custom chat handlers

+
@register_chat_completion_handler("functionary")
 def functionary_chat_handler(
    llama: llama.Llama,
@ -1253,7 +1271,8 @@ def functionary_chat_handler(
                    json.dumps(function_body)
                )
                grammar = llama_grammar.LlamaGrammar.from_string(
-                    llama_grammar.json_schema_to_gbnf(json.dumps(function_body))
+                    llama_grammar.json_schema_to_gbnf(json.dumps(function_body)),
+                    verbose=llama.verbose,
                )
                print(grammar_text)
        except Exception as e:
@ -1264,11 +1283,14 @@ def functionary_chat_handler(
                print(e)
            with suppress_stdout_stderr(disable=llama.verbose):
                grammar = llama_grammar.LlamaGrammar.from_string(
-                    llama_grammar.JSON_GBNF
+                    llama_grammar.JSON_GBNF,
+                    verbose=llama.verbose,
                )
    else:
        with suppress_stdout_stderr(disable=llama.verbose):
-            grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
+            grammar = llama_grammar.LlamaGrammar.from_string(
+                llama_grammar.JSON_GBNF, verbose=llama.verbose
+            )

    completion: llama_types.Completion = llama.create_completion(
        prompt=new_prompt,
@ -1367,7 +1389,9 @@ def functionary_v1_v2_chat_handler(
    SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""

    tokenizer = llama.tokenizer_
-    assert hasattr(tokenizer, "hf_tokenizer"), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class"
+    assert hasattr(
+        tokenizer, "hf_tokenizer"
+    ), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class"
    from transformers import AutoTokenizer

    if "<|START_OF_FUNCTION_CALL|>" in tokenizer.hf_tokenizer.additional_special_tokens:
@ -1519,7 +1543,10 @@ def functionary_v1_v2_chat_handler(
        else:
            suffix = "<|from|>assistant\n<|recipient|>"

-        return tokenizer.hf_tokenizer.apply_chat_template(all_messages, tokenize=False) + suffix
+        return (
+            tokenizer.hf_tokenizer.apply_chat_template(all_messages, tokenize=False)
+            + suffix
+        )

    if tools is not None:
        functions = [tool["function"] for tool in tools if tool["type"] == "function"]
@ -1529,7 +1556,9 @@ def functionary_v1_v2_chat_handler(
            tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
        )

-    prompt = prepare_messages_for_inference(messages, tokenizer, version, functions, tools)
+    prompt = prepare_messages_for_inference(
+        messages, tokenizer, version, functions, tools
+    )

    # If no tools/functions are provided
    if function_call is None and (functions is None or len(functions) == 0):
@ -1592,7 +1621,7 @@ def functionary_v1_v2_chat_handler(
                print(e)
            with suppress_stdout_stderr(disable=llama.verbose):
                grammar = llama_grammar.LlamaGrammar.from_string(
-                    llama_grammar.JSON_GBNF
+                    llama_grammar.JSON_GBNF, verbose=llama.verbose
                )

        return grammar
@ -1632,7 +1661,9 @@ def functionary_v1_v2_chat_handler(
            stops = ["\n", END_ASSISTANT_TOKEN]
        # If tool_choice/function_call is "none"
        elif isinstance(function_call, str) and function_call == "none":
-            prompt = prepare_messages_for_inference(messages, tokenizer, version, [], [])
+            prompt = prepare_messages_for_inference(
+                messages, tokenizer, version, [], []
+            )
            stops = END_ASSISTANT_TOKEN
        # If tool_choice/function_call is provided
        elif isinstance(function_call, dict):
@ -1649,12 +1680,25 @@ def functionary_v1_v2_chat_handler(
        completion_text = completion["choices"][0]["text"]

        # If the generation does not involve a function call
-        if START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN not in completion_text:
+        if (
+            START_FUNCTION_CALL_TOKEN not in prompt
+            and START_FUNCTION_CALL_TOKEN not in completion_text
+        ):
            return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
        # If the generation involves a function call in completion, generate the parameters
-        elif START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN in completion_text:
-            prompt += completion_text.replace(f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN) + "\n"
-            function_calls.append(completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip())
+        elif (
+            START_FUNCTION_CALL_TOKEN not in prompt
+            and START_FUNCTION_CALL_TOKEN in completion_text
+        ):
+            prompt += (
+                completion_text.replace(
+                    f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN
+                )
+                + "\n"
+            )
+            function_calls.append(
+                completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()
+            )
            grammar = get_grammar(function_calls[-1])
            completion = create_completion(stop=END_FUNCTION_CALL_TOKEN)
            function_bodies.append(completion["choices"][0]["text"].strip())
@ -1672,7 +1716,10 @@ def functionary_v1_v2_chat_handler(
                stops = CONTENT_TOKEN
            # If tool_choice/function_call is "none"
            elif isinstance(function_call, str) and function_call == "none":
-                prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + "all\n<|content|>"
+                prompt = (
+                    prepare_messages_for_inference(messages, tokenizer, version, [], [])
+                    + "all\n<|content|>"
+                )
                stops = STOP_TOKEN
            # If tool_choice/function_call is provided
            elif isinstance(function_call, dict):
@ -1689,10 +1736,12 @@ def functionary_v1_v2_chat_handler(
            completion_text = completion["choices"][0]["text"]

            # If the generation does not involve a function call
-            if prompt.endswith("all\n<|content|>") and not completion_text.startswith("all"):
+            if prompt.endswith("all\n<|content|>") and not completion_text.startswith(
+                "all"
+            ):
                return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
            # Generate model response if the model decides not to call any function
-            elif (prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all")):
+            elif prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all"):
                prompt += completion_text + CONTENT_TOKEN
                completion = create_completion(stop=STOP_TOKEN)
                return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
@ -1727,8 +1776,12 @@ def functionary_v1_v2_chat_handler(
    for function_call, function_body in zip(function_calls, function_bodies):
        tool_calls.append(
            {
-                "id": "call_" + "".join(
-                    [random.choice(string.ascii_letters + string.digits) for _ in range(24)]
+                "id": "call_"
+                + "".join(
+                    [
+                        random.choice(string.ascii_letters + string.digits)
+                        for _ in range(24)
+                    ]
                ),
                "type": "function",
                "function": {
@ -1924,7 +1977,9 @@ class Llava15ChatHandler:
                        json.dumps(response_format["schema"])
                    )
            except Exception as e:
-                grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
+                grammar = llama_grammar.LlamaGrammar.from_string(
+                    llama_grammar.JSON_GBNF
+                )

        return _convert_completion_to_chat(
            llama.create_completion(
@ -1950,3 +2005,601 @@ class Llava15ChatHandler:
            ),
            stream=stream,
        )
+
+
+@register_chat_completion_handler("chatml-function-calling")
+def chatml_function_calling(
+    llama: llama.Llama,
+    messages: List[llama_types.ChatCompletionRequestMessage],
+    functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
+    function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
+    tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+    tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
+    temperature: float = 0.2,
+    top_p: float = 0.95,
+    top_k: int = 40,
+    min_p: float = 0.05,
+    typical_p: float = 1.0,
+    stream: bool = False,
+    stop: Optional[Union[str, List[str]]] = [],
+    response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
+    max_tokens: Optional[int] = None,
+    presence_penalty: float = 0.0,
+    frequency_penalty: float = 0.0,
+    repeat_penalty: float = 1.1,
+    tfs_z: float = 1.0,
+    mirostat_mode: int = 0,
+    mirostat_tau: float = 5.0,
+    mirostat_eta: float = 0.1,
+    model: Optional[str] = None,
+    logits_processor: Optional[llama.LogitsProcessorList] = None,
+    grammar: Optional[llama.LlamaGrammar] = None,
+    **kwargs,  # type: ignore
+) -> Union[
+    llama_types.CreateChatCompletionResponse,
+    Iterator[llama_types.CreateChatCompletionStreamResponse],
+]:
+    function_calling_template = (
+        "{% for message in messages %}"
+        "<|im_start|>{{ message.role }}\n"
+        # System message
+        "{% if message.role == 'system' %}"
+        "{{ message.content }}"
+        "{% if tool_calls %}"
+        "\n\nYou have access to the following functions:\n"
+        "{% for tool in tools %}"
+        "\nfunctions.{{ tool.function.name }}:\n"
+        "{{ tool.function.parameters | tojson }}"
+        "\n{% endfor %}"
+        "\n\nYou can respond to users messages with either a single message or one or more function calls."
+        "\n\nTo respond with a message begin the message with 'message:', use the following format:"
+        "\n\nmessage:"
+        "\n<message>"
+        "\n\nTo respond with one or more function calls begin the message with 'functions.<function_name>:', use the following format:"
+        "\n\nfunctions.<function_name>:"
+        '\n{ "arg1": "value1", "arg2": "value2" }'
+        "\nfunctions.<function_name>:"
+        '\n{ "arg1": "value1", "arg2": "value2" }'
+        "{% endif %}"
+        "\n<|im_end|>\n"
+        "{% endif %}"
+        # User message
+        "{% if message.role == 'user' %}"
+        "{{ message.content }}"
+        "\n<|im_end|>\n"
+        "{% endif %}"
+        # Assistant message
+        "{% if message.role == 'assistant' %}"
+        ## Regular message
+        "{% if message.content and message.content | length > 0 %}"
+        "message:\n"
+        "{{ message.content }}"
+        "\n<|im_end|>\n"
+        "{% endif %}"
+        ## Function calls
+        "{% if message.tool_calls %}"
+        "{% for tool_call in message.tool_calls %}"
+        "functions.{{ tool_call.function.name }}:\n"
+        "{{ tool_call.function.arguments }}"
+        "{% endfor %}"
+        "\n<|im_end|>\n"
+        "{% endif %}"
+        "{% endif %}"
+        "{% endfor %}"
+    )
+    template_renderer = jinja2.Environment(
+        loader=jinja2.BaseLoader(),
+        autoescape=jinja2.select_autoescape(["html", "xml"]),
+        undefined=jinja2.StrictUndefined,
+    ).from_string(function_calling_template)
+
+    # Convert legacy functions to tools
+    if functions is not None:
+        tools = [
+            {
+                "type": "function",
+                "function": function,
+            }
+            for function in functions
+        ]
+
+    # Convert legacy function_call to tool_choice
+    if function_call is not None:
+        if isinstance(function_call, str) and (
+            function_call == "none" or function_call == "auto"
+        ):
+            tool_choice = function_call
+        if isinstance(function_call, dict) and "name" in function_call:
+            tool_choice = {
+                "type": "function",
+                "function": {
+                    "name": function_call["name"],
+                },
+            }
+
+    # Case 1: No tool choice by user
+    if (
+        tool_choice is None
+        or (isinstance(tool_choice, str) and tool_choice == "none")
+        or tools is None
+        or len(tools) == 0
+    ):
+        prompt = template_renderer.render(
+            messages=messages,
+            tools=[],
+            tool_calls=None,
+        )
+        if response_format is not None and response_format["type"] == "json_object":
+            try:
+                grammar = (
+                    llama_grammar.LlamaGrammar.from_json_schema(
+                        json.dumps(response_format["schema"])
+                    )
+                    if "schema" in response_format
+                    else None
+                )
+            except Exception as e:
+                if llama.verbose:
+                    print(
+                        "Failed to parse response format as JSON schema, falling back to default grammar"
+                    )
+                    print(e)
+            grammar = (
+                llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
+                if grammar is None
+                else grammar
+            )
+        return _convert_completion_to_chat(
+            llama.create_completion(
+                prompt=prompt,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                min_p=min_p,
+                typical_p=typical_p,
+                stream=stream,
+                stop=stop,
+                max_tokens=max_tokens,
+                presence_penalty=presence_penalty,
+                frequency_penalty=frequency_penalty,
+                repeat_penalty=repeat_penalty,
+                tfs_z=tfs_z,
+                mirostat_mode=mirostat_mode,
+                mirostat_tau=mirostat_tau,
+                mirostat_eta=mirostat_eta,
+                model=model,
+                logits_processor=logits_processor,
+                grammar=grammar,
+            ),
+            stream=stream,
+        )
+
+    def _convert_completion_to_chat_function(
+        tool_name: str,
+        completion_or_chunks: Union[
+            llama_types.CreateCompletionResponse,
+            Iterator[llama_types.CreateCompletionStreamResponse],
+        ],
+        stream: bool,
+    ):
+        """Wrap a text completion as a chat response that calls `tool_name`.
+
+        The generated text is used as the call's JSON arguments and is reported
+        both in the legacy `function_call` field and in `tool_calls`.
+
+        Args:
+            tool_name: Name of the function being called.
+            completion_or_chunks: A completion response when `stream` is false,
+                otherwise an iterator of completion stream chunks.
+            stream: Whether `completion_or_chunks` is a chunk iterator.
+
+        Returns:
+            A chat completion dict when `stream` is false, otherwise a generator
+            of chat completion chunk dicts.
+        """
+        if not stream:
+            completion: llama_types.CreateCompletionResponse = completion_or_chunks  # type: ignore
+            assert "usage" in completion
+            tool_id = "call_" + "_0_" + tool_name + "_" + completion["id"]
+            # TODO: Fix for legacy function calls
+            chat_completion: llama_types.CreateChatCompletionResponse = {
+                "id": "chat" + completion["id"],
+                "object": "chat.completion",
+                "created": completion["created"],
+                "model": completion["model"],
+                "choices": [
+                    {
+                        "index": 0,
+                        "message": {
+                            "role": "assistant",
+                            "content": None,
+                            "function_call": {
+                                "name": tool_name,
+                                "arguments": completion["choices"][0]["text"],
+                            },
+                            "tool_calls": [
+                                {
+                                    "id": tool_id,
+                                    "type": "function",
+                                    "function": {
+                                        "name": tool_name,
+                                        "arguments": completion["choices"][0]["text"],
+                                    },
+                                }
+                            ],
+                        },
+                        "finish_reason": "tool_calls",
+                    }
+                ],
+                "usage": completion["usage"],
+            }
+            return chat_completion
+        else:
+            chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks  # type: ignore
+
+            def _stream_response_to_function_stream(
+                chunks: Iterator[llama_types.CreateCompletionStreamResponse],
+            ) -> Iterator[llama_types.CreateChatCompletionStreamResponse]:
+                """Yield a role-only chunk, one chunk per text piece, then a final chunk."""
+                first = True
+                id_ = None
+                created = None
+                model = None
+                tool_id = None
+                for chunk in chunks:
+                    if first:
+                        # Remember the identifiers of the first chunk; they are
+                        # reused for the closing chunk after the loop.
+                        id_ = "chat" + chunk["id"]
+                        created = chunk["created"]
+                        model = chunk["model"]
+                        tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"]
+                        # Blank first message that only announces the role.
+                        yield {
+                            "id": id_,
+                            "object": "chat.completion.chunk",
+                            "created": created,
+                            "model": model,
+                            "choices": [
+                                {
+                                    "index": 0,
+                                    "finish_reason": None,
+                                    "logprobs": None,
+                                    "delta": {
+                                        "role": "assistant",
+                                        "content": None,
+                                        "function_call": None,
+                                        "tool_calls": None,
+                                    },
+                                }
+                            ],
+                        }
+                        yield {
+                            "id": "chat" + chunk["id"],
+                            "object": "chat.completion.chunk",
+                            "created": chunk["created"],
+                            "model": chunk["model"],
+                            "choices": [
+                                {
+                                    "index": 0,
+                                    "finish_reason": None,
+                                    "logprobs": None,
+                                    "delta": {
+                                        "role": None,
+                                        "content": None,
+                                        "function_call": {
+                                            "name": tool_name,
+                                            "arguments": chunk["choices"][0]["text"],
+                                        },
+                                        "tool_calls": [
+                                            {
+                                                "index": 0,
+                                                "id": tool_id,
+                                                "type": "function",
+                                                "function": {
+                                                    "name": tool_name,
+                                                    # Carry the first chunk's text here as
+                                                    # well; leaving it empty dropped the
+                                                    # start of the arguments for clients
+                                                    # that read tool_calls.
+                                                    "arguments": chunk["choices"][0][
+                                                        "text"
+                                                    ],
+                                                },
+                                            }
+                                        ],
+                                    },
+                                }
+                            ],
+                        }
+                        first = False
+                        continue
+                    assert tool_id is not None
+                    yield {
+                        "id": "chat" + chunk["id"],
+                        "object": "chat.completion.chunk",
+                        "created": chunk["created"],
+                        "model": chunk["model"],
+                        "choices": [
+                            {
+                                "index": 0,
+                                "finish_reason": None,
+                                "logprobs": None,
+                                "delta": {
+                                    "role": None,
+                                    "content": None,
+                                    "function_call": {
+                                        "name": tool_name,
+                                        "arguments": chunk["choices"][0]["text"],
+                                    },
+                                    "tool_calls": [
+                                        {
+                                            "index": 0,
+                                            "id": tool_id,
+                                            "type": "function",
+                                            "function": {
+                                                "name": tool_name,
+                                                "arguments": chunk["choices"][0][
+                                                    "text"
+                                                ],
+                                            },
+                                        }
+                                    ],
+                                },
+                            }
+                        ],
+                    }
+
+                # Closing chunk; skipped when the input stream produced no chunks.
+                if id_ is not None and created is not None and model is not None:
+                    yield {
+                        "id": id_,
+                        "object": "chat.completion.chunk",
+                        "created": created,
+                        "model": model,
+                        "choices": [
+                            {
+                                "index": 0,
+                                "finish_reason": "tool_calls",
+                                "logprobs": None,
+                                "delta": {
+                                    "role": None,
+                                    "content": None,
+                                    "function_call": None,
+                                    "tool_calls": None,
+                                },
+                            }
+                        ],
+                    }
+
+            return _stream_response_to_function_stream(chunks)
+
+    # Case 2: Tool choice by user
+    if isinstance(tool_choice, dict):
+        tool_name = tool_choice["function"]["name"]
+        tool = next(
+            (tool for tool in tools if tool["function"]["name"] == tool_name), None
+        )
+        if tool is None:
+            raise ValueError(f"Tool with name '{tool_name}' not found in tools")
+        prompt = template_renderer.render(
+            messages=messages,
+            tools=tools,
+            tool_calls=True,
+        )
+        prompt += f"functions.{tool_name}:\n"
+        try:
+            grammar = llama_grammar.LlamaGrammar.from_json_schema(
+                json.dumps(tool["function"]["parameters"]), verbose=llama.verbose
+            )
+        except Exception as e:
+            grammar = llama_grammar.LlamaGrammar.from_string(
+                llama_grammar.JSON_GBNF, verbose=llama.verbose
+            )
+            if llama.verbose:
+                print(
+                    "Failed to parse function body as JSON schema, falling back to default grammar"
+                )
+                print(e)
+        completion_or_chunks = llama.create_completion(
+            prompt=prompt,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            typical_p=typical_p,
+            stream=stream,
+            stop=stop,
+            max_tokens=max_tokens,
+            presence_penalty=presence_penalty,
+            frequency_penalty=frequency_penalty,
+            repeat_penalty=repeat_penalty,
+            tfs_z=tfs_z,
+            mirostat_mode=mirostat_mode,
+            mirostat_tau=mirostat_tau,
+            mirostat_eta=mirostat_eta,
+            model=model,
+            logits_processor=logits_processor,
+            grammar=grammar,
+        )
+        return _convert_completion_to_chat_function(
+            tool_name, completion_or_chunks, stream
+        )
+
+    # Case 3: Automatic tool choice
+    assert isinstance(tool_choice, str) and tool_choice == "auto"
+    function_names = " | ".join(
+        [f'''"functions.{tool['function']['name']}:"''' for tool in tools]
+    )
+    initial_gbnf_tool_grammar = (
+        """root   ::= functions | "message:"\n"""
+        f"""functions ::= {function_names}\n"""
+    )
+    follow_up_gbnf_tool_grammar = (
+        """root   ::= functions | "<|im_end|>"\n"""
+        f"""functions ::= {function_names}\n"""
+    )
+    prompt = template_renderer.render(
+        messages=messages,
+        tools=tools,
+        tool_calls=True,
+    )
+    completion_or_chunks = llama.create_completion(
+        prompt=prompt,
+        temperature=0,
+        top_p=top_p,
+        top_k=top_k,
+        min_p=min_p,
+        typical_p=typical_p,
+        stream=False,
+        stop=[":"],
+        max_tokens=None,
+        presence_penalty=presence_penalty,
+        frequency_penalty=frequency_penalty,
+        repeat_penalty=repeat_penalty,
+        tfs_z=tfs_z,
+        mirostat_mode=mirostat_mode,
+        mirostat_tau=mirostat_tau,
+        mirostat_eta=mirostat_eta,
+        model=model,
+        logits_processor=logits_processor,
+        grammar=llama_grammar.LlamaGrammar.from_string(
+            initial_gbnf_tool_grammar, verbose=llama.verbose
+        ),
+    )
+    completion: llama_types.CreateCompletionResponse = completion_or_chunks  # type: ignore
+    text = completion["choices"][0]["text"]
+    if "message" in text:
+        return _convert_completion_to_chat(
+            llama.create_completion(
+                prompt=prompt + "message:\n",
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                min_p=min_p,
+                typical_p=typical_p,
+                stream=stream,
+                stop=["<|im_end|>"],
+                max_tokens=None,
+                presence_penalty=presence_penalty,
+                frequency_penalty=frequency_penalty,
+                repeat_penalty=repeat_penalty,
+                tfs_z=tfs_z,
+                mirostat_mode=mirostat_mode,
+                mirostat_tau=mirostat_tau,
+                mirostat_eta=mirostat_eta,
+                model=model,
+                logits_processor=logits_processor,
+                grammar=llama_grammar.LlamaGrammar.from_string(
+                    follow_up_gbnf_tool_grammar, verbose=llama.verbose
+                ),
+            ),
+            stream=stream,
+        )
+
+    # One or more function calls
+    tool_name = text[len("functions.") :]
+    tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None)
+    if not stream:
+        completions = []
+        completions_tool_name = []
+        while tool is not None:
+            prompt += f"functions.{tool_name}:\n"
+            try:
+                grammar = llama_grammar.LlamaGrammar.from_json_schema(
+                    json.dumps(tool["function"]["parameters"]), verbose=llama.verbose
+                )
+            except Exception as e:
+                grammar = llama_grammar.LlamaGrammar.from_string(
+                    llama_grammar.JSON_GBNF, verbose=llama.verbose
+                )
+                if llama.verbose:
+                    print(
+                        "Failed to parse function body as JSON schema, falling back to default grammar"
+                    )
+                    print(e)
+            completion_or_chunks = llama.create_completion(
+                prompt=prompt,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                min_p=min_p,
+                typical_p=typical_p,
+                stream=False,
+                stop=stop,
+                max_tokens=None,
+                presence_penalty=presence_penalty,
+                frequency_penalty=frequency_penalty,
+                repeat_penalty=repeat_penalty,
+                tfs_z=tfs_z,
+                mirostat_mode=mirostat_mode,
+                mirostat_tau=mirostat_tau,
+                mirostat_eta=mirostat_eta,
+                model=model,
+                logits_processor=logits_processor,
+                grammar=grammar,
+            )
+            completions.append(completion_or_chunks)
+            completions_tool_name.append(tool_name)
+            prompt += completion_or_chunks["choices"][0]["text"]
+            prompt += "\n"
+
+            response = llama.create_completion(
+                prompt=prompt,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                min_p=min_p,
+                typical_p=typical_p,
+                stream=False,
+                stop=stop,
+                max_tokens=None,
+                presence_penalty=presence_penalty,
+                frequency_penalty=frequency_penalty,
+                repeat_penalty=repeat_penalty,
+                tfs_z=tfs_z,
+                mirostat_mode=mirostat_mode,
+                mirostat_tau=mirostat_tau,
+                mirostat_eta=mirostat_eta,
+                model=model,
+                logits_processor=logits_processor,
+                grammar=llama_grammar.LlamaGrammar.from_string(
+                    follow_up_gbnf_tool_grammar, verbose=llama.verbose
+                ),
+            )
+
+            tool_name = response["choices"][0]["text"][len("functions.") :]
+            tool = next(
+                (tool for tool in tools if tool["function"]["name"] == tool_name), None
+            )
+
+        # Merge completions
+        function_call = { 
+            "function_call": {
+                "name": tool_name,
+                "arguments": completions[0]["choices"][0]["text"],
+            }
+        } if len(completions) == 1 else {}
+        return {
+            "id": "chat" + completion["id"],
+            "object": "chat.completion",
+            "created": completion["created"],
+            "model": completion["model"],
+            "choices": [
+                {
+                    "finish_reason": "tool_calls",
+                    "index": 0,
+                    "message": {
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            {
+                                "id": "call_"
+                                + f"_{i}_"
+                                + tool_name
+                                + "_"
+                                + completion["id"],
+                                "type": "function",
+                                "function": {
+                                    "name": tool_name,
+                                    "arguments": completion["choices"][0]["text"],
+                                },
+                            }
+                            for i, (tool_name, completion) in enumerate(
+                                zip(completions_tool_name, completions)
+                            )
+                        ],
+                        **function_call
+                    },
+                }
+            ],
+            "usage": {
+                "completion_tokens": sum(
+                    completion["usage"]["completion_tokens"]
+                    for completion in completions
+                ),
+                "prompt_tokens": sum(
+                    completion["usage"]["prompt_tokens"] for completion in completions
+                ),
+                "total_tokens": sum(
+                    completion["usage"]["total_tokens"] for completion in completions
+                ),
+            },
+        }
+
+    raise ValueError("Automatic streaming tool choice is not supported")
--- a/llama_cpp/llama_types.py
+++ b/llama_cpp/llama_types.py
@@ -97,7 +97,7 @@ class CreateChatCompletionResponse(TypedDict):


 class ChatCompletionMessageToolCallChunkFunction(TypedDict):
-    name: str
+    name: Optional[str]
    arguments: str


@@ -118,12 +118,12 @@ class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict):


 class ChatCompletionStreamResponseDelta(TypedDict):
-    content: NotRequired[str]
+    content: NotRequired[Optional[str]]
    function_call: NotRequired[
-        ChatCompletionStreamResponseDeltaFunctionCall
+        Optional[ChatCompletionStreamResponseDeltaFunctionCall]
    ]  # DEPRECATED
-    tool_calls: NotRequired[List[ChatCompletionMessageToolCallChunk]]
-    role: NotRequired[Literal["system", "user", "assistant", "tool"]]
+    tool_calls: NotRequired[Optional[List[ChatCompletionMessageToolCallChunk]]]
+    role: NotRequired[Optional[Literal["system", "user", "assistant", "tool"]]]


 class ChatCompletionStreamResponseChoice(TypedDict):
@@ -132,6 +132,7 @@ class ChatCompletionStreamResponseChoice(TypedDict):
        ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty
    ]
    finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]]
+    logprobs: NotRequired[Optional[CompletionLogprobs]]


 class CreateChatCompletionStreamResponse(TypedDict):