feat: Generic chatml Function Calling (#957)

* Add demo notebook * Add initial chat handler * Update OpenAI types * Add generic chatml function calling (wip) * Update chatml generic function calling. * Progress on auto-tool calls * fix streaming functions * Remove print statements * fix: Suppress output from llama.cpp init and grammar creation * Add OpenAI v1 python api compatible chat completion function * Support non-streaming multi-tool calls * Format * Include function_call in response.
2024-02-12 15:56:07 -05:00 · 2024-02-12 15:56:07 -05:00 · 153a0049d9
commit 153a0049d9
parent 69413ce08e
4 changed files with 1660 additions and 60 deletions
--- a/examples/notebooks/OpenHermesFunctionCalling.ipynb
+++ b/examples/notebooks/OpenHermesFunctionCalling.ipynb
@ -0,0 +1,910 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"name\": \"get_article_details\",\n",
      "  \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
      "  \"parameters\": {\n",
      "    \"type\": \"object\",\n",
      "    \"properties\": {\n",
      "      \"title\": {\n",
      "        \"type\": \"str\"\n",
      "      },\n",
      "      \"authors\": {\n",
      "        \"type\": \"list[str]\"\n",
      "      },\n",
      "      \"short_summary\": {\n",
      "        \"type\": \"str\"\n",
      "      },\n",
      "      \"date_published\": {\n",
      "        \"type\": \"str\"\n",
      "      },\n",
      "      \"tags\": {\n",
      "        \"type\": \"list[str]\"\n",
      "      }\n",
      "    }\n",
      "  },\n",
      "  \"returns\": \"Article\"\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import inspect\n",
    "from typing import get_type_hints\n",
    "\n",
    "class Article:\n",
    "    pass\n",
    "\n",
    "class Weather:\n",
    "    pass\n",
    "\n",
    "class Directions:\n",
    "    pass\n",
    "\n",
    "def calculate_mortgage_payment(loan_amount: int, interest_rate: float, loan_term: int) -> float:\n",
    "    \"\"\"Get the monthly mortgage payment given an interest rate percentage.\"\"\"\n",
    "    \n",
    "    # TODO: you must implement this to actually call it later\n",
    "    pass\n",
    "\n",
    "def get_article_details(title: str, authors: list[str], short_summary: str, date_published: str, tags: list[str]) -> Article:\n",
    "    '''Get article details from unstructured article text.\n",
    "date_published: formatted as \"MM/DD/YYYY\"'''\n",
    "    \n",
    "    # TODO: you must implement this to actually call it later\n",
    "    pass\n",
    "\n",
    "def get_weather(zip_code: str) -> Weather:\n",
    "    \"\"\"Get the current weather given a zip code.\"\"\"\n",
    "    \n",
    "    # TODO: you must implement this to actually call it later\n",
    "    pass\n",
    "\n",
    "def get_directions(start: str, destination: str) -> Directions:\n",
    "    \"\"\"Get directions from Google Directions API.\n",
    "start: start address as a string including zipcode (if any)\n",
    "destination: end address as a string including zipcode (if any)\"\"\"\n",
    "    \n",
    "    # TODO: you must implement this to actually call it later\n",
    "    pass\n",
    "\n",
    "def get_type_name(t):\n",
    "    name = str(t)\n",
    "    if \"list\" in name or \"dict\" in name:\n",
    "        return name\n",
    "    else:\n",
    "        return t.__name__\n",
    "\n",
    "def serialize_function_to_json(func):\n",
    "    signature = inspect.signature(func)\n",
    "    type_hints = get_type_hints(func)\n",
    "\n",
    "    function_info = {\n",
    "        \"name\": func.__name__,\n",
    "        \"description\": func.__doc__,\n",
    "        \"parameters\": {\n",
    "            \"type\": \"object\",\n",
    "            \"properties\": {}\n",
    "        },\n",
    "        \"returns\": type_hints.get('return', 'void').__name__\n",
    "    }\n",
    "\n",
    "    for name, _ in signature.parameters.items():\n",
    "        param_type = get_type_name(type_hints.get(name, type(None)))\n",
    "        function_info[\"parameters\"][\"properties\"][name] = {\"type\": param_type}\n",
    "\n",
    "    return json.dumps(function_info, indent=2)\n",
    "\n",
    "print(serialize_function_to_json(get_article_details))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xml.etree.ElementTree as ET\n",
    "import re\n",
    "\n",
    "def extract_function_calls(completion):\n",
    "    completion = completion.strip()\n",
    "    pattern = r\"(<multiplefunctions>(.*?)</multiplefunctions>)\"\n",
    "    match = re.search(pattern, completion, re.DOTALL)\n",
    "    if not match:\n",
    "        return None\n",
    "    \n",
    "    multiplefn = match.group(1)\n",
    "    root = ET.fromstring(multiplefn)\n",
    "    functions = root.findall(\"functioncall\")\n",
    "    return [json.loads(fn.text) for fn in functions]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_hermes_prompt(prompt, functions):\n",
    "    functions = \"\\n\\n\".join([serialize_function_to_json(fn) for fn in functions])\n",
    "    prompt = f\"\"\"<|im_start|>system\n",
    "You are a helpful assistant with access to the following functions:\n",
    "\n",
    "{functions}\n",
    "\n",
    "To use these functions respond with:\n",
    "<multiplefunctions>\n",
    "    <functioncall> {{\"name\": \"function_name\", \"arguments\": {{\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}}}} </functioncall>\n",
    "    <functioncall> {{\"name\": \"function_name\", \"arguments\": {{\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}}}} </functioncall>\n",
    "    ...\n",
    "</multiplefunctions>\n",
    "\n",
    "Edge cases you must handle:\n",
    "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
    "<|im_start|>user\n",
    "{prompt}<|im_end|>\n",
    "<|im_start|>assistant\"\"\"\n",
    "    return prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<|im_start|>system\n",
      "You are a helpful assistant with access to the following functions:\n",
      "\n",
      "{\n",
      "  \"name\": \"get_weather\",\n",
      "  \"description\": \"Get the current weather given a zip code.\",\n",
      "  \"parameters\": {\n",
      "    \"type\": \"object\",\n",
      "    \"properties\": {\n",
      "      \"zip_code\": {\n",
      "        \"type\": \"str\"\n",
      "      }\n",
      "    }\n",
      "  },\n",
      "  \"returns\": \"Weather\"\n",
      "}\n",
      "\n",
      "{\n",
      "  \"name\": \"calculate_mortgage_payment\",\n",
      "  \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n",
      "  \"parameters\": {\n",
      "    \"type\": \"object\",\n",
      "    \"properties\": {\n",
      "      \"loan_amount\": {\n",
      "        \"type\": \"int\"\n",
      "      },\n",
      "      \"interest_rate\": {\n",
      "        \"type\": \"float\"\n",
      "      },\n",
      "      \"loan_term\": {\n",
      "        \"type\": \"int\"\n",
      "      }\n",
      "    }\n",
      "  },\n",
      "  \"returns\": \"float\"\n",
      "}\n",
      "\n",
      "{\n",
      "  \"name\": \"get_article_details\",\n",
      "  \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
      "  \"parameters\": {\n",
      "    \"type\": \"object\",\n",
      "    \"properties\": {\n",
      "      \"title\": {\n",
      "        \"type\": \"str\"\n",
      "      },\n",
      "      \"authors\": {\n",
      "        \"type\": \"list[str]\"\n",
      "      },\n",
      "      \"short_summary\": {\n",
      "        \"type\": \"str\"\n",
      "      },\n",
      "      \"date_published\": {\n",
      "        \"type\": \"str\"\n",
      "      },\n",
      "      \"tags\": {\n",
      "        \"type\": \"list[str]\"\n",
      "      }\n",
      "    }\n",
      "  },\n",
      "  \"returns\": \"Article\"\n",
      "}\n",
      "\n",
      "To use these functions respond with:\n",
      "<multiplefunctions>\n",
      "    <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
      "    <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
      "    ...\n",
      "</multiplefunctions>\n",
      "\n",
      "Edge cases you must handle:\n",
      "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
      "<|im_start|>user\n",
      "What's the weather in 10001?<|im_end|>\n",
      "<|im_start|>assistant\n",
      "<|im_start|>system\n",
      "You are a helpful assistant with access to the following functions:\n",
      "\n",
      "{\n",
      "  \"name\": \"get_weather\",\n",
      "  \"description\": \"Get the current weather given a zip code.\",\n",
      "  \"parameters\": {\n",
      "    \"type\": \"object\",\n",
      "    \"properties\": {\n",
      "      \"zip_code\": {\n",
      "        \"type\": \"str\"\n",
      "      }\n",
      "    }\n",
      "  },\n",
      "  \"returns\": \"Weather\"\n",
      "}\n",
      "\n",
      "{\n",
      "  \"name\": \"calculate_mortgage_payment\",\n",
      "  \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n",
      "  \"parameters\": {\n",
      "    \"type\": \"object\",\n",
      "    \"properties\": {\n",
      "      \"loan_amount\": {\n",
      "        \"type\": \"int\"\n",
      "      },\n",
      "      \"interest_rate\": {\n",
      "        \"type\": \"float\"\n",
      "      },\n",
      "      \"loan_term\": {\n",
      "        \"type\": \"int\"\n",
      "      }\n",
      "    }\n",
      "  },\n",
      "  \"returns\": \"float\"\n",
      "}\n",
      "\n",
      "{\n",
      "  \"name\": \"get_article_details\",\n",
      "  \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
      "  \"parameters\": {\n",
      "    \"type\": \"object\",\n",
      "    \"properties\": {\n",
      "      \"title\": {\n",
      "        \"type\": \"str\"\n",
      "      },\n",
      "      \"authors\": {\n",
      "        \"type\": \"list[str]\"\n",
      "      },\n",
      "      \"short_summary\": {\n",
      "        \"type\": \"str\"\n",
      "      },\n",
      "      \"date_published\": {\n",
      "        \"type\": \"str\"\n",
      "      },\n",
      "      \"tags\": {\n",
      "        \"type\": \"list[str]\"\n",
      "      }\n",
      "    }\n",
      "  },\n",
      "  \"returns\": \"Article\"\n",
      "}\n",
      "\n",
      "To use these functions respond with:\n",
      "<multiplefunctions>\n",
      "    <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
      "    <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
      "    ...\n",
      "</multiplefunctions>\n",
      "\n",
      "Edge cases you must handle:\n",
      "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
      "<|im_start|>user\n",
      "Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.<|im_end|>\n",
      "<|im_start|>assistant\n",
      "<|im_start|>system\n",
      "You are a helpful assistant with access to the following functions:\n",
      "\n",
      "{\n",
      "  \"name\": \"get_weather\",\n",
      "  \"description\": \"Get the current weather given a zip code.\",\n",
      "  \"parameters\": {\n",
      "    \"type\": \"object\",\n",
      "    \"properties\": {\n",
      "      \"zip_code\": {\n",
      "        \"type\": \"str\"\n",
      "      }\n",
      "    }\n",
      "  },\n",
      "  \"returns\": \"Weather\"\n",
      "}\n",
      "\n",
      "{\n",
      "  \"name\": \"calculate_mortgage_payment\",\n",
      "  \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n",
      "  \"parameters\": {\n",
      "    \"type\": \"object\",\n",
      "    \"properties\": {\n",
      "      \"loan_amount\": {\n",
      "        \"type\": \"int\"\n",
      "      },\n",
      "      \"interest_rate\": {\n",
      "        \"type\": \"float\"\n",
      "      },\n",
      "      \"loan_term\": {\n",
      "        \"type\": \"int\"\n",
      "      }\n",
      "    }\n",
      "  },\n",
      "  \"returns\": \"float\"\n",
      "}\n",
      "\n",
      "{\n",
      "  \"name\": \"get_article_details\",\n",
      "  \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
      "  \"parameters\": {\n",
      "    \"type\": \"object\",\n",
      "    \"properties\": {\n",
      "      \"title\": {\n",
      "        \"type\": \"str\"\n",
      "      },\n",
      "      \"authors\": {\n",
      "        \"type\": \"list[str]\"\n",
      "      },\n",
      "      \"short_summary\": {\n",
      "        \"type\": \"str\"\n",
      "      },\n",
      "      \"date_published\": {\n",
      "        \"type\": \"str\"\n",
      "      },\n",
      "      \"tags\": {\n",
      "        \"type\": \"list[str]\"\n",
      "      }\n",
      "    }\n",
      "  },\n",
      "  \"returns\": \"Article\"\n",
      "}\n",
      "\n",
      "To use these functions respond with:\n",
      "<multiplefunctions>\n",
      "    <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
      "    <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
      "    ...\n",
      "</multiplefunctions>\n",
      "\n",
      "Edge cases you must handle:\n",
      "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
      "<|im_start|>user\n",
      "What's the current exchange rate for USD to EUR?<|im_end|>\n",
      "<|im_start|>assistant\n"
     ]
    }
   ],
   "source": [
    "prompts = [\n",
    "    \"What's the weather in 10001?\",\n",
    "    \"Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.\",\n",
    "    \"What's the current exchange rate for USD to EUR?\"\n",
    "]\n",
    "functions = [get_weather, calculate_mortgage_payment, get_article_details]\n",
    "\n",
    "for prompt in prompts:\n",
    "    print(generate_hermes_prompt(prompt, functions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no\n",
      "ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes\n",
      "ggml_init_cublas: found 1 CUDA devices:\n",
      "  Device 0: NVIDIA GeForce RTX 2060, compute capability 7.5\n",
      "llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from ../../models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf (version GGUF V3 (latest))\n",
      "llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32002,     1,     1 ]\n",
      "llama_model_loader: - tensor    1:              blk.0.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor    2:              blk.0.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor    4:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor    7:            blk.0.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor    8:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor    9:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   10:              blk.1.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   11:              blk.1.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   12:              blk.1.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   13:         blk.1.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   14:            blk.1.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   15:              blk.1.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   16:            blk.1.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   17:           blk.1.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   18:            blk.1.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   19:              blk.2.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   20:              blk.2.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   21:              blk.2.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   22:         blk.2.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   23:            blk.2.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   24:              blk.2.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   25:            blk.2.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   26:           blk.2.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   27:            blk.2.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   28:              blk.3.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   29:              blk.3.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   30:              blk.3.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   31:         blk.3.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   32:            blk.3.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   33:              blk.3.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   34:            blk.3.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   35:           blk.3.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   36:            blk.3.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   37:              blk.4.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   38:              blk.4.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   39:              blk.4.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   40:         blk.4.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   41:            blk.4.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   42:              blk.4.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   43:            blk.4.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   44:           blk.4.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   45:            blk.4.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   46:              blk.5.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   47:              blk.5.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   48:              blk.5.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   49:         blk.5.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   50:            blk.5.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   51:              blk.5.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   52:            blk.5.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   53:           blk.5.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   54:            blk.5.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   55:              blk.6.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   56:              blk.6.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   57:              blk.6.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   58:         blk.6.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   59:            blk.6.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   60:              blk.6.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   61:            blk.6.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   62:           blk.6.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   63:            blk.6.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   64:              blk.7.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   65:              blk.7.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   66:              blk.7.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   67:         blk.7.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   68:            blk.7.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   69:              blk.7.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   70:            blk.7.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   71:           blk.7.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   72:            blk.7.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   73:              blk.8.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   74:              blk.8.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   75:              blk.8.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   76:         blk.8.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   77:            blk.8.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   78:              blk.8.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   79:            blk.8.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   80:           blk.8.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   81:            blk.8.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   82:              blk.9.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   83:              blk.9.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   84:              blk.9.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   85:         blk.9.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   86:            blk.9.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   87:              blk.9.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   88:            blk.9.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   89:           blk.9.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   90:            blk.9.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   91:             blk.10.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   92:             blk.10.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   93:             blk.10.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor   94:        blk.10.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   95:           blk.10.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   96:             blk.10.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor   97:           blk.10.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor   98:          blk.10.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor   99:           blk.10.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  100:             blk.11.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  101:             blk.11.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  102:             blk.11.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  103:        blk.11.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  104:           blk.11.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  105:             blk.11.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  106:           blk.11.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  107:          blk.11.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  108:           blk.11.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  109:             blk.12.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  110:             blk.12.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  111:             blk.12.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  112:        blk.12.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  113:           blk.12.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  114:             blk.12.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  115:           blk.12.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  116:          blk.12.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  117:           blk.12.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  118:             blk.13.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  119:             blk.13.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  120:             blk.13.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  121:        blk.13.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  122:           blk.13.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  123:             blk.13.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  124:           blk.13.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  125:          blk.13.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  126:           blk.13.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  127:             blk.14.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  128:             blk.14.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  129:             blk.14.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  130:        blk.14.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  131:           blk.14.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  132:             blk.14.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  133:           blk.14.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  134:          blk.14.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  135:           blk.14.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  136:             blk.15.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  137:             blk.15.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  138:             blk.15.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  139:        blk.15.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  140:           blk.15.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  141:             blk.15.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  142:           blk.15.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  143:          blk.15.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  144:           blk.15.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  145:             blk.16.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  146:             blk.16.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  147:             blk.16.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  148:        blk.16.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  149:           blk.16.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  150:             blk.16.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  151:           blk.16.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  152:          blk.16.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  153:           blk.16.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  154:             blk.17.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  155:             blk.17.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  156:             blk.17.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  157:        blk.17.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  158:           blk.17.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  159:             blk.17.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  160:           blk.17.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  161:          blk.17.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  162:           blk.17.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  163:             blk.18.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  164:             blk.18.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  165:             blk.18.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  166:        blk.18.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  167:           blk.18.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  168:             blk.18.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  169:           blk.18.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  170:          blk.18.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  171:           blk.18.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  172:             blk.19.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  173:             blk.19.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  174:             blk.19.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  175:        blk.19.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  176:           blk.19.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  177:             blk.19.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  178:           blk.19.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  179:          blk.19.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  180:           blk.19.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  181:             blk.20.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  182:             blk.20.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  183:             blk.20.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  184:        blk.20.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  185:           blk.20.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  186:             blk.20.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  187:           blk.20.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  188:          blk.20.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  189:           blk.20.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  190:             blk.21.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  191:             blk.21.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  192:             blk.21.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  193:        blk.21.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  194:           blk.21.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  195:             blk.21.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  196:           blk.21.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  197:          blk.21.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  198:           blk.21.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  199:             blk.22.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  200:             blk.22.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  201:             blk.22.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  202:        blk.22.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  203:           blk.22.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  204:             blk.22.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  205:           blk.22.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  206:          blk.22.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  207:           blk.22.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  208:             blk.23.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  209:             blk.23.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  210:             blk.23.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  211:        blk.23.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  212:           blk.23.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  213:             blk.23.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  214:           blk.23.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  215:          blk.23.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  216:           blk.23.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  217:             blk.24.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  218:             blk.24.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  219:             blk.24.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  220:        blk.24.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  221:           blk.24.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  222:             blk.24.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  223:           blk.24.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  224:          blk.24.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  225:           blk.24.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  226:             blk.25.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  227:             blk.25.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  228:             blk.25.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  229:        blk.25.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  230:           blk.25.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  231:             blk.25.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  232:           blk.25.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  233:          blk.25.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  234:           blk.25.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  235:             blk.26.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  236:             blk.26.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  237:             blk.26.attn_v.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  238:        blk.26.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  239:           blk.26.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  240:             blk.26.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  241:           blk.26.ffn_down.weight q4_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  242:          blk.26.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  243:           blk.26.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  244:             blk.27.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  245:             blk.27.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  246:             blk.27.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  247:        blk.27.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  248:           blk.27.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  249:             blk.27.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  250:           blk.27.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  251:          blk.27.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  252:           blk.27.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  253:             blk.28.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  254:             blk.28.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  255:             blk.28.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  256:        blk.28.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  257:           blk.28.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  258:             blk.28.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  259:           blk.28.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  260:          blk.28.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  261:           blk.28.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  262:             blk.29.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  263:             blk.29.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  264:             blk.29.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  265:        blk.29.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  266:           blk.29.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  267:             blk.29.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  268:           blk.29.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  269:          blk.29.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  270:           blk.29.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  271:             blk.30.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  272:             blk.30.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  273:             blk.30.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  274:        blk.30.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  275:           blk.30.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  276:             blk.30.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  277:           blk.30.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  278:          blk.30.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  279:           blk.30.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  280:             blk.31.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  281:             blk.31.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  282:             blk.31.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]\n",
      "llama_model_loader: - tensor  283:        blk.31.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  284:           blk.31.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  285:             blk.31.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]\n",
      "llama_model_loader: - tensor  286:           blk.31.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]\n",
      "llama_model_loader: - tensor  287:          blk.31.attn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  288:           blk.31.ffn_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  289:               output_norm.weight f32      [  4096,     1,     1,     1 ]\n",
      "llama_model_loader: - tensor  290:                    output.weight q6_K     [  4096, 32002,     1,     1 ]\n",
      "llama_model_loader: - kv   0:                       general.architecture str              = llama\n",
      "llama_model_loader: - kv   1:                               general.name str              = teknium_openhermes-2.5-mistral-7b\n",
      "llama_model_loader: - kv   2:                       llama.context_length u32              = 32768\n",
      "llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096\n",
      "llama_model_loader: - kv   4:                          llama.block_count u32              = 32\n",
      "llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336\n",
      "llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128\n",
      "llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 32\n",
      "llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 8\n",
      "llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010\n",
      "llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 10000.000000\n",
      "llama_model_loader: - kv  11:                          general.file_type u32              = 15\n",
      "llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama\n",
      "llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32002]   = [\"<unk>\", \"<s>\", \"</s>\", \"<0x00>\", \"<...\n",
      "llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32002]   = [0.000000, 0.000000, 0.000000, 0.0000...\n",
      "llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32002]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n",
      "llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1\n",
      "llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 32000\n",
      "llama_model_loader: - kv  18:            tokenizer.ggml.padding_token_id u32              = 0\n",
      "llama_model_loader: - kv  19:               general.quantization_version u32              = 2\n",
      "llama_model_loader: - type  f32:   65 tensors\n",
      "llama_model_loader: - type q4_K:  193 tensors\n",
      "llama_model_loader: - type q6_K:   33 tensors\n",
      "llm_load_vocab: special tokens definition check successful ( 261/32002 ).\n",
      "llm_load_print_meta: format           = GGUF V3 (latest)\n",
      "llm_load_print_meta: arch             = llama\n",
      "llm_load_print_meta: vocab type       = SPM\n",
      "llm_load_print_meta: n_vocab          = 32002\n",
      "llm_load_print_meta: n_merges         = 0\n",
      "llm_load_print_meta: n_ctx_train      = 32768\n",
      "llm_load_print_meta: n_embd           = 4096\n",
      "llm_load_print_meta: n_head           = 32\n",
      "llm_load_print_meta: n_head_kv        = 8\n",
      "llm_load_print_meta: n_layer          = 32\n",
      "llm_load_print_meta: n_rot            = 128\n",
      "llm_load_print_meta: n_gqa            = 4\n",
      "llm_load_print_meta: f_norm_eps       = 0.0e+00\n",
      "llm_load_print_meta: f_norm_rms_eps   = 1.0e-05\n",
      "llm_load_print_meta: f_clamp_kqv      = 0.0e+00\n",
      "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
      "llm_load_print_meta: n_ff             = 14336\n",
      "llm_load_print_meta: rope scaling     = linear\n",
      "llm_load_print_meta: freq_base_train  = 10000.0\n",
      "llm_load_print_meta: freq_scale_train = 1\n",
      "llm_load_print_meta: n_yarn_orig_ctx  = 32768\n",
      "llm_load_print_meta: rope_finetuned   = unknown\n",
      "llm_load_print_meta: model type       = 7B\n",
      "llm_load_print_meta: model ftype      = mostly Q4_K - Medium\n",
      "llm_load_print_meta: model params     = 7.24 B\n",
      "llm_load_print_meta: model size       = 4.07 GiB (4.83 BPW) \n",
      "llm_load_print_meta: general.name   = teknium_openhermes-2.5-mistral-7b\n",
      "llm_load_print_meta: BOS token = 1 '<s>'\n",
      "llm_load_print_meta: EOS token = 32000 '<|im_end|>'\n",
      "llm_load_print_meta: UNK token = 0 '<unk>'\n",
      "llm_load_print_meta: PAD token = 0 '<unk>'\n",
      "llm_load_print_meta: LF token  = 13 '<0x0A>'\n",
      "llm_load_tensors: ggml ctx size =    0.11 MiB\n",
      "llm_load_tensors: using CUDA for GPU acceleration\n",
      "llm_load_tensors: mem required  =   70.42 MiB\n",
      "llm_load_tensors: offloading 32 repeating layers to GPU\n",
      "llm_load_tensors: offloading non-repeating layers to GPU\n",
      "llm_load_tensors: offloaded 35/35 layers to GPU\n",
      "llm_load_tensors: VRAM used: 4095.06 MiB\n",
      "...............................................................................................\n",
      "llama_new_context_with_model: n_ctx      = 2048\n",
      "llama_new_context_with_model: freq_base  = 10000.0\n",
      "llama_new_context_with_model: freq_scale = 1\n",
      "llama_kv_cache_init: offloading v cache to GPU\n",
      "llama_kv_cache_init: offloading k cache to GPU\n",
      "llama_kv_cache_init: VRAM kv self = 256.00 MiB\n",
      "llama_new_context_with_model: kv self size  =  256.00 MiB\n",
      "llama_build_graph: non-view tensors processed: 740/740\n",
      "llama_new_context_with_model: compute buffer total size = 159.07 MiB\n",
      "llama_new_context_with_model: VRAM scratch buffer: 156.00 MiB\n",
      "llama_new_context_with_model: total VRAM used: 4507.07 MiB (model: 4095.06 MiB, context: 412.00 MiB)\n"
     ]
    }
   ],
   "source": [
    "import llama_cpp\n",
    "\n",
    "llama = llama_cpp.Llama(model_path=\"../../models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf\", n_gpu_layers=-1, n_ctx=2048, verbose=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'name': 'get_weather', 'arguments': {'zip_code': '10001'}}]\n",
      "====================================================================================================\n",
      "[{'name': 'calculate_mortgage_payment', 'arguments': {'loan_amount': 200000, 'interest_rate': 0.04, 'loan_term': 30}}]\n",
      "====================================================================================================\n",
      "Unfortunately, I do not have a built-in function to check currency exchange rates. However, you can use third-party APIs or websites like Google Finance or XE to get this information.\n",
      "====================================================================================================\n"
     ]
    }
   ],
   "source": [
    "prompts = [\n",
    "    \"What's the weather in 10001?\",\n",
    "    \"Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.\",\n",
    "    \"What's the current exchange rate for USD to EUR?\"\n",
    "]\n",
    "functions = [get_weather, calculate_mortgage_payment, get_article_details]\n",
    "\n",
    "for prompt in prompts:\n",
    "    prompt = generate_hermes_prompt(prompt, functions)\n",
    "    completion = llama.create_completion(prompt, max_tokens=-1)[\"choices\"][0][\"text\"]\n",
    "    function_calls = extract_function_calls(completion)\n",
    "    if function_calls:\n",
    "        print(function_calls)\n",
    "    else:\n",
    "        print(completion.strip())\n",
    "    print(\"=\"*100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "get_weather\n",
      "{'zip_code': '05751'}\n",
      "====================================================================================================\n",
      "get_weather\n",
      "{'zip_code': '05751'}\n",
      "get_weather\n",
      "{'zip_code': '07030'}\n",
      "calculate_mortgage_payment\n",
      "{'loan_amount': 250000, 'interest_rate': 4.18, 'loan_term': 30}\n",
      "====================================================================================================\n",
      "I don't have a function to get exchange rates, but I can provide some resources where you can find this information. You can check websites like Google Finance, XE.com, or Yahoo Finance for up-to-date currency exchange rates.\n",
      "====================================================================================================\n"
     ]
    }
   ],
   "source": [
    "prompts = [\n",
    "    \"What's the weather in 05751?\",\n",
    "    \"I'm planning a trip to Killington, Vermont (05751) from Hoboken, NJ (07030). Can you get me weather for both locations and directions?\",\n",
    "    \"What's the current exchange rate for USD to EUR?\"\n",
    "]\n",
    "\n",
    "for prompt in prompts:\n",
    "    completion = llama.create_completion(generate_hermes_prompt(prompt, functions), max_tokens=-1)[\"choices\"][0][\"text\"]\n",
    "    function_calls = extract_function_calls(completion)\n",
    "\n",
    "    if function_calls:\n",
    "        for function in function_calls:\n",
    "            print(function[\"name\"])\n",
    "            print(function[\"arguments\"])\n",
    "    else:\n",
    "        print(completion.strip())\n",
    "\n",
    "    print(\"=\"*100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5+"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@ -50,6 +50,9 @@ from ._internals import (
    _LlamaSamplingContext,  # type: ignore
 )
 from ._logger import set_verbose
 from ._utils import (
    suppress_stdout_stderr
 )
 class Llama:
@ -182,7 +185,8 @@ class Llama:
        self.numa = numa
        if not Llama.__backend_initialized:
-            llama_cpp.llama_backend_init(self.numa)
+            with suppress_stdout_stderr(disable=verbose):
                llama_cpp.llama_backend_init(self.numa)
            Llama.__backend_initialized = True
        self.model_path = model_path
@ -1567,6 +1571,38 @@ class Llama:
            logit_bias=logit_bias,
        )
    def create_chat_completion_openai_v1(
        self,
        *args: Any,
        **kwargs: Any,
    ):
        """Generate a chat completion with return type based on the the OpenAI v1 API.
        OpenAI python package is required to use this method.
        You can install it with `pip install openai`.
        Args:
            *args: Positional arguments to pass to create_chat_completion.
            **kwargs: Keyword arguments to pass to create_chat_completion.
        Returns:
            Generated chat completion or a stream of chat completion chunks.
        """
        try:
            from openai.types.chat import ChatCompletion, ChatCompletionChunk
            stream = kwargs.get("stream", False) # type: ignore
            assert isinstance(stream, bool)
            if stream:
                return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore
            else:
                return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore
        except ImportError:
            raise ImportError(
                "To use create_chat_completion_openai_v1, you must install the openai package."
                "You can install it with `pip install openai`."
            )
    def __getstate__(self):
        return dict(
            model_path=self.model_path,
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@ -31,6 +31,7 @@ MISTRAL_INSTRUCT_EOS_TOKEN = "</s>"
 ### Chat Completion Handler ###
 class LlamaChatCompletionHandler(Protocol):
    """Base Protocol for a llama chat completion handler.
@ -77,8 +78,7 @@ class LlamaChatCompletionHandler(Protocol):
    ) -> Union[
        llama_types.CreateChatCompletionResponse,
        Iterator[llama_types.CreateChatCompletionStreamResponse],
-    ]:
+    ]: ...
        ...
 class LlamaChatCompletionHandlerNotFoundException(Exception):
@ -134,6 +134,7 @@ def register_chat_completion_handler(name: str):
 ### Chat Formatter ###
@dataclasses.dataclass
 class ChatFormatterResponse:
    """Dataclass that stores completion parameters for a given chat format and
@ -157,8 +158,7 @@ class ChatFormatter(Protocol):
        *,
        messages: List[llama_types.ChatCompletionRequestMessage],
        **kwargs: Any,
-    ) -> ChatFormatterResponse:
+    ) -> ChatFormatterResponse: ...
        ...
 class Jinja2ChatFormatter(ChatFormatter):
@ -195,7 +195,7 @@ class Jinja2ChatFormatter(ChatFormatter):
            eos_token=self.eos_token,
            bos_token=self.bos_token,
            raise_exception=raise_exception,
-            add_generation_prompt=self.add_generation_prompt
+            add_generation_prompt=self.add_generation_prompt,
        )
        return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token])
@ -255,11 +255,13 @@ def _convert_text_completion_chunks_to_chat(
            "choices": [
                {
                    "index": 0,
-                    "delta": {
+                    "delta": (
-                        "content": chunk["choices"][0]["text"],
+                        {
-                    }
+                            "content": chunk["choices"][0]["text"],
-                    if chunk["choices"][0]["finish_reason"] is None
+                        }
-                    else {},
+                        if chunk["choices"][0]["finish_reason"] is None
                        else {}
                    ),
                    "finish_reason": chunk["choices"][0]["finish_reason"],
                }
            ],
@ -338,10 +340,12 @@ def chat_formatter_to_chat_completion_handler(
                # create grammar from json schema
                if "schema" in response_format:
                    grammar = llama_grammar.LlamaGrammar.from_json_schema(
-                        json.dumps(response_format["schema"])
+                        json.dumps(response_format["schema"]), verbose=llama.verbose
                    )
            except Exception as e:
-                grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
+                grammar = llama_grammar.LlamaGrammar.from_string(
                    llama_grammar.JSON_GBNF, verbose=llama.verbose
                )
        completion_or_chunks = llama.create_completion(
            prompt=prompt,
@ -452,7 +456,9 @@ def hf_tokenizer_config_to_chat_completion_handler(
    tokenizer_config: Dict[str, Any],
    add_generation_prompt: bool = True,
 ) -> LlamaChatCompletionHandler:
-    chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config, add_generation_prompt=add_generation_prompt)
+    chat_formatter = hf_tokenizer_config_to_chat_formatter(
        tokenizer_config, add_generation_prompt=add_generation_prompt
    )
    return chat_formatter_to_chat_completion_handler(chat_formatter)
@ -463,11 +469,12 @@ def guess_chat_format_from_gguf_metadata(metadata: Dict[str, str]) -> Optional[s
    if metadata["tokenizer.chat_template"] == CHATML_CHAT_TEMPLATE:
        return "chatml"
-    if metadata["tokenizer.chat_template"] == MISTRAL_INSTRUCT_CHAT_TEMPLATE: 
+    if metadata["tokenizer.chat_template"] == MISTRAL_INSTRUCT_CHAT_TEMPLATE:
        return "mistral-instruct"
    return None
 ### Utility functions for formatting chat prompts ###
 # TODO: Replace these with jinja2 templates
@ -916,9 +923,17 @@ def format_mistral_instruct(
    stop = eos
    prompt = bos
    for message in messages:
-        if message["role"] == "user" and message["content"] is not None and isinstance(message["content"], str):
+        if (
            message["role"] == "user"
            and message["content"] is not None
            and isinstance(message["content"], str)
        ):
            prompt += "[INST] " + message["content"]
-        elif message["role"] == "assistant" and message["content"] is not None and isinstance(message["content"], str):
+        elif (
            message["role"] == "assistant"
            and message["content"] is not None
            and isinstance(message["content"], str)
        ):
            prompt += " [/INST]" + message["content"] + eos
    prompt += " [/INST]"
    return ChatFormatterResponse(prompt=prompt, stop=stop)
@ -958,6 +973,7 @@ def format_openchat(
    _prompt = _format_chatml(system_message, _messages, _sep)
    return ChatFormatterResponse(prompt=_prompt, stop=_sep)
 # Chat format for Saiga models, see more details and available models:
 # https://huggingface.co/collections/IlyaGusev/saiga2-saigamistral-6505d4ccc3d1e53166b636cd
@register_chat_format("saiga")
@ -979,8 +995,10 @@ def format_saiga(
    _prompt += "<s>bot"
    return ChatFormatterResponse(prompt=_prompt.strip())
 # Tricky chat formats that require custom chat handlers
@register_chat_completion_handler("functionary")
 def functionary_chat_handler(
    llama: llama.Llama,
@ -1253,7 +1271,8 @@ def functionary_chat_handler(
                    json.dumps(function_body)
                )
                grammar = llama_grammar.LlamaGrammar.from_string(
-                    llama_grammar.json_schema_to_gbnf(json.dumps(function_body))
+                    llama_grammar.json_schema_to_gbnf(json.dumps(function_body)),
                    verbose=llama.verbose,
                )
                print(grammar_text)
        except Exception as e:
@ -1264,11 +1283,14 @@ def functionary_chat_handler(
                print(e)
            with suppress_stdout_stderr(disable=llama.verbose):
                grammar = llama_grammar.LlamaGrammar.from_string(
-                    llama_grammar.JSON_GBNF
+                    llama_grammar.JSON_GBNF,
                    verbose=llama.verbose,
                )
    else:
        with suppress_stdout_stderr(disable=llama.verbose):
-            grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
+            grammar = llama_grammar.LlamaGrammar.from_string(
                llama_grammar.JSON_GBNF, verbose=llama.verbose
            )
    completion: llama_types.Completion = llama.create_completion(
        prompt=new_prompt,
@ -1365,11 +1387,13 @@ def functionary_v1_v2_chat_handler(
    **kwargs,  # type: ignore
 ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
    SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
-    
+
    tokenizer = llama.tokenizer_
-    assert hasattr(tokenizer, "hf_tokenizer"), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class"
+    assert hasattr(
        tokenizer, "hf_tokenizer"
    ), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class"
    from transformers import AutoTokenizer
-    
+
    if "<|START_OF_FUNCTION_CALL|>" in tokenizer.hf_tokenizer.additional_special_tokens:
        version = "v1"
        END_SYSTEM_TOKEN = "<|END_OF_SYSTEM|>"
@ -1513,13 +1537,16 @@ def functionary_v1_v2_chat_handler(
                    "name"
                ] = f"functions.{message['function_call']['name']}"
            all_messages.append(message)
-            
+
        if version == "v1":
            suffix = "assistant:\n"
        else:
            suffix = "<|from|>assistant\n<|recipient|>"
-        
+
-        return tokenizer.hf_tokenizer.apply_chat_template(all_messages, tokenize=False) + suffix
+        return (
            tokenizer.hf_tokenizer.apply_chat_template(all_messages, tokenize=False)
            + suffix
        )
    if tools is not None:
        functions = [tool["function"] for tool in tools if tool["type"] == "function"]
@ -1529,8 +1556,10 @@ def functionary_v1_v2_chat_handler(
            tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
        )
-    prompt = prepare_messages_for_inference(messages, tokenizer, version, functions, tools)
+    prompt = prepare_messages_for_inference(
-    
+        messages, tokenizer, version, functions, tools
    )
    # If no tools/functions are provided
    if function_call is None and (functions is None or len(functions) == 0):
        if version == "v1":
@ -1538,7 +1567,7 @@ def functionary_v1_v2_chat_handler(
        else:
            stop = STOP_TOKEN
            prompt += "all\n<|content|>"
-        
+
        completion_or_completion_chunks = llama.create_completion(
            prompt=prompt,
            temperature=temperature,
@ -1561,9 +1590,9 @@ def functionary_v1_v2_chat_handler(
            grammar=grammar,
        )
        return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream)  # type: ignore
-    
+
    assert stream is False  # TODO: support stream mode
-    
+
    def get_grammar(function_call):
        function_body = None
        for function in functions or []:
@ -1574,7 +1603,7 @@ def functionary_v1_v2_chat_handler(
            if tool["type"] == "function" and tool["function"]["name"] == function_call:
                function_body = tool["function"]["parameters"]
                break
-            
+
        try:
            with suppress_stdout_stderr(disable=llama.verbose):
                grammar_text = llama_grammar.json_schema_to_gbnf(
@ -1592,11 +1621,11 @@ def functionary_v1_v2_chat_handler(
                print(e)
            with suppress_stdout_stderr(disable=llama.verbose):
                grammar = llama_grammar.LlamaGrammar.from_string(
-                    llama_grammar.JSON_GBNF
+                    llama_grammar.JSON_GBNF, verbose=llama.verbose
                )
-        
+
        return grammar
-    
+
    def create_completion(stop):
        completion: llama_types.Completion = llama.create_completion(
            prompt=prompt,
@ -1619,11 +1648,11 @@ def functionary_v1_v2_chat_handler(
            logits_processor=logits_processor,
            grammar=grammar,
        )
-        
+
        return completion
-    
+
    function_calls, function_bodies = [], []
-    
+
    if version == "v1":
        # If no or "auto" tool_choice/function_call
        if function_call is None or (
@ -1632,7 +1661,9 @@ def functionary_v1_v2_chat_handler(
            stops = ["\n", END_ASSISTANT_TOKEN]
        # If tool_choice/function_call is "none"
        elif isinstance(function_call, str) and function_call == "none":
-            prompt = prepare_messages_for_inference(messages, tokenizer, version, [], [])
+            prompt = prepare_messages_for_inference(
                messages, tokenizer, version, [], []
            )
            stops = END_ASSISTANT_TOKEN
        # If tool_choice/function_call is provided
        elif isinstance(function_call, dict):
@ -1647,14 +1678,27 @@ def functionary_v1_v2_chat_handler(
        completion = create_completion(stop=stops)
        completion_text = completion["choices"][0]["text"]
-        
+
        # If the generation does not involve a function call
-        if START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN not in completion_text:
+        if (
            START_FUNCTION_CALL_TOKEN not in prompt
            and START_FUNCTION_CALL_TOKEN not in completion_text
        ):
            return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
        # If the generation involves a function call in completion, generate the parameters
-        elif START_FUNCTION_CALL_TOKEN not in prompt and START_FUNCTION_CALL_TOKEN in completion_text:
+        elif (
-            prompt += completion_text.replace(f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN) + "\n"
+            START_FUNCTION_CALL_TOKEN not in prompt
-            function_calls.append(completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip())
+            and START_FUNCTION_CALL_TOKEN in completion_text
        ):
            prompt += (
                completion_text.replace(
                    f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN
                )
                + "\n"
            )
            function_calls.append(
                completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()
            )
            grammar = get_grammar(function_calls[-1])
            completion = create_completion(stop=END_FUNCTION_CALL_TOKEN)
            function_bodies.append(completion["choices"][0]["text"].strip())
@ -1672,7 +1716,10 @@ def functionary_v1_v2_chat_handler(
                stops = CONTENT_TOKEN
            # If tool_choice/function_call is "none"
            elif isinstance(function_call, str) and function_call == "none":
-                prompt = prepare_messages_for_inference(messages, tokenizer, version, [], []) + "all\n<|content|>"
+                prompt = (
                    prepare_messages_for_inference(messages, tokenizer, version, [], [])
                    + "all\n<|content|>"
                )
                stops = STOP_TOKEN
            # If tool_choice/function_call is provided
            elif isinstance(function_call, dict):
@ -1684,15 +1731,17 @@ def functionary_v1_v2_chat_handler(
            else:
                prompt = prompt
                stops = STOP_TOKEN
-                
+
            completion = create_completion(stop=stops)
            completion_text = completion["choices"][0]["text"]
-            
+
            # If the generation does not involve a function call
-            if prompt.endswith("all\n<|content|>") and not completion_text.startswith("all"):
+            if prompt.endswith("all\n<|content|>") and not completion_text.startswith(
                "all"
            ):
                return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
            # Generate model response if the model decides not to call any function
-            elif (prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all")):
+            elif prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all"):
                prompt += completion_text + CONTENT_TOKEN
                completion = create_completion(stop=STOP_TOKEN)
                return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
@ -1704,7 +1753,7 @@ def functionary_v1_v2_chat_handler(
                function_bodies.append(completion["choices"][0]["text"].strip())
                prompt += f"{function_calls[-1]}\n{CONTENT_TOKEN}{function_bodies[-1]}"
                grammar = None
-                
+
                # Try to generate the beginning of next turn
                # If empty completion, break from loop
                next_turn_completion_text = create_completion(
@ -1718,17 +1767,21 @@ def functionary_v1_v2_chat_handler(
            else:
                function_bodies.append(completion_text.strip())
                break
-            
+
    assert "usage" in completion
    assert len(function_calls) > 0
    assert len(function_calls) == len(function_bodies)
-    
+
    tool_calls = []
    for function_call, function_body in zip(function_calls, function_bodies):
        tool_calls.append(
            {
-                "id": "call_" + "".join(
+                "id": "call_"
-                    [random.choice(string.ascii_letters + string.digits) for _ in range(24)]
+                + "".join(
                    [
                        random.choice(string.ascii_letters + string.digits)
                        for _ in range(24)
                    ]
                ),
                "type": "function",
                "function": {
@ -1924,7 +1977,9 @@ class Llava15ChatHandler:
                        json.dumps(response_format["schema"])
                    )
            except Exception as e:
-                grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
+                grammar = llama_grammar.LlamaGrammar.from_string(
                    llama_grammar.JSON_GBNF
                )
        return _convert_completion_to_chat(
            llama.create_completion(
@ -1950,3 +2005,601 @@ class Llava15ChatHandler:
            ),
            stream=stream,
        )
@register_chat_completion_handler("chatml-function-calling")
 def chatml_function_calling(
    llama: llama.Llama,
    messages: List[llama_types.ChatCompletionRequestMessage],
    functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
    function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
    tools: Optional[List[llama_types.ChatCompletionTool]] = None,
    tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
    temperature: float = 0.2,
    top_p: float = 0.95,
    top_k: int = 40,
    min_p: float = 0.05,
    typical_p: float = 1.0,
    stream: bool = False,
    stop: Optional[Union[str, List[str]]] = [],
    response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
    max_tokens: Optional[int] = None,
    presence_penalty: float = 0.0,
    frequency_penalty: float = 0.0,
    repeat_penalty: float = 1.1,
    tfs_z: float = 1.0,
    mirostat_mode: int = 0,
    mirostat_tau: float = 5.0,
    mirostat_eta: float = 0.1,
    model: Optional[str] = None,
    logits_processor: Optional[llama.LogitsProcessorList] = None,
    grammar: Optional[llama.LlamaGrammar] = None,
    **kwargs,  # type: ignore
 ) -> Union[
    llama_types.CreateChatCompletionResponse,
    Iterator[llama_types.CreateChatCompletionStreamResponse],
 ]:
    function_calling_template = (
        "{% for message in messages %}"
        "<|im_start|>{{ message.role }}\n"
        # System message
        "{% if message.role == 'system' %}"
        "{{ message.content }}"
        "{% if tool_calls %}"
        "\n\nYou have access to the following functions:\n"
        "{% for tool in tools %}"
        "\nfunctions.{{ tool.function.name }}:\n"
        "{{ tool.function.parameters | tojson }}"
        "\n{% endfor %}"
        "\n\nYou can respond to users messages with either a single message or one or more function calls."
        "\n\nTo respond with a message begin the message with 'message:', use the following format:"
        "\n\nmessage:"
        "\n<message>"
        "\n\nTo respond with one or more function calls begin the message with 'functions.<function_name>:', use the following format:"
        "\n\nfunctions.<function_name>:"
        '\n{ "arg1": "value1", "arg2": "value2" }'
        "\nfunctions.<function_name>:"
        '\n{ "arg1": "value1", "arg2": "value2" }'
        "{% endif %}"
        "\n<|im_end|>\n"
        "{% endif %}"
        # User message
        "{% if message.role == 'user' %}"
        "{{ message.content }}"
        "\n<|im_end|>\n"
        "{% endif %}"
        # Assistant message
        "{% if message.role == 'assistant' %}"
        ## Reglar message
        "{% if message.content and message.content | length > 0 %}"
        "message:\n"
        "{{ message.content }}"
        "\n<|im_end|>\n"
        "{% endif %}"
        ## Function calls
        "{% if message.tool_calls %}"
        "{% for tool_call in message.tool_calls %}"
        "functions.{{ tool_call.function.name }}:\n"
        "{{ tool_call.function.arguments }}"
        "{% endfor %}"
        "\n<|im_end|>\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
    )
    template_renderer = jinja2.Environment(
        loader=jinja2.BaseLoader(),
        autoescape=jinja2.select_autoescape(["html", "xml"]),
        undefined=jinja2.StrictUndefined,
    ).from_string(function_calling_template)
    # Convert legacy functions to tools
    if functions is not None:
        tools = [
            {
                "type": "function",
                "function": function,
            }
            for function in functions
        ]
    # Convert legacy function_call to tool_choice
    if function_call is not None:
        if isinstance(function_call, str) and (
            function_call == "none" or function_call == "auto"
        ):
            tool_choice = function_call
        if isinstance(function_call, dict) and "name" in function_call:
            tool_choice = {
                "type": "function",
                "function": {
                    "name": function_call["name"],
                },
            }
    # Case 1: No tool choice by user
    if (
        tool_choice is None
        or (isinstance(tool_choice, str) and tool_choice == "none")
        or tools is None
        or len(tools) == 0
    ):
        prompt = template_renderer.render(
            messages=messages,
            tools=[],
            tool_calls=None,
        )
        if response_format is not None and response_format["type"] == "json_object":
            try:
                grammar = (
                    llama_grammar.LlamaGrammar.from_json_schema(
                        json.dumps(response_format["schema"])
                    )
                    if "schema" in response_format
                    else None
                )
            except Exception as e:
                if llama.verbose:
                    print(
                        "Failed to parse response format as JSON schema, falling back to default grammar"
                    )
                    print(e)
            grammar = (
                llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
                if grammar is None
                else grammar
            )
        return _convert_completion_to_chat(
            llama.create_completion(
                prompt=prompt,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                min_p=min_p,
                typical_p=typical_p,
                stream=stream,
                stop=stop,
                max_tokens=max_tokens,
                presence_penalty=presence_penalty,
                frequency_penalty=frequency_penalty,
                repeat_penalty=repeat_penalty,
                tfs_z=tfs_z,
                mirostat_mode=mirostat_mode,
                mirostat_tau=mirostat_tau,
                mirostat_eta=mirostat_eta,
                model=model,
                logits_processor=logits_processor,
                grammar=grammar,
            ),
            stream=stream,
        )
    def _convert_completion_to_chat_function(
        tool_name: str,
        completion_or_chunks: Union[
            llama_types.CreateCompletionResponse,
            Iterator[llama_types.CreateCompletionStreamResponse],
        ],
        stream: bool,
    ):
        if not stream:
            completion: llama_types.CreateCompletionResponse = completion_or_chunks  # type: ignore
            assert "usage" in completion
            tool_id = "call_" + "_0_" + tool_name + "_" + completion["id"]
            # TODO: Fix for legacy function calls
            chat_completion: llama_types.CreateChatCompletionResponse = {
                "id": "chat" + completion["id"],
                "object": "chat.completion",
                "created": completion["created"],
                "model": completion["model"],
                "choices": [
                    {
                        "index": 0,
                        "message": {
                            "role": "assistant",
                            "content": None,
                            "function_call": {
                                "name": tool_name,
                                "arguments": completion["choices"][0]["text"],
                            },
                            "tool_calls": [
                                {
                                    "id": tool_id,
                                    "type": "function",
                                    "function": {
                                        "name": tool_name,
                                        "arguments": completion["choices"][0]["text"],
                                    },
                                }
                            ],
                        },
                        "finish_reason": "tool_calls",
                    }
                ],
                "usage": completion["usage"],
            }
            return chat_completion
        else:
            chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks  # type: ignore
            def _stream_response_to_function_stream(
                chunks: Iterator[llama_types.CreateCompletionStreamResponse],
            ) -> Iterator[llama_types.CreateChatCompletionStreamResponse]:
                # blank first message
                first = True
                id_ = None
                created = None
                model = None
                tool_id = None
                for chunk in chunks:
                    if first:
                        id_ = "chat" + chunk["id"]
                        created = chunk["created"]
                        model = chunk["model"]
                        tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"]
                        yield {
                            "id": id_,
                            "object": "chat.completion.chunk",
                            "created": created,
                            "model": model,
                            "choices": [
                                {
                                    "index": 0,
                                    "finish_reason": None,
                                    "logprobs": None,
                                    "delta": {
                                        "role": "assistant",
                                        "content": None,
                                        "function_call": None,
                                        "tool_calls": None,
                                    },
                                }
                            ],
                        }
                        yield {
                            "id": "chat" + chunk["id"],
                            "object": "chat.completion.chunk",
                            "created": chunk["created"],
                            "model": chunk["model"],
                            "choices": [
                                {
                                    "index": 0,
                                    "finish_reason": None,
                                    "logprobs": None,
                                    "delta": {
                                        "role": None,
                                        "content": None,
                                        "function_call": {
                                            "name": tool_name,
                                            "arguments": chunk["choices"][0]["text"],
                                        },
                                        "tool_calls": [
                                            {
                                                "index": 0,
                                                "id": tool_id,
                                                "type": "function",
                                                "function": {
                                                    "name": tool_name,
                                                    "arguments": "",
                                                },
                                            }
                                        ],
                                    },
                                }
                            ],
                        }
                        first = False
                        continue
                    assert tool_id is not None
                    yield {
                        "id": "chat" + chunk["id"],
                        "object": "chat.completion.chunk",
                        "created": chunk["created"],
                        "model": chunk["model"],
                        "choices": [
                            {
                                "index": 0,
                                "finish_reason": None,
                                "logprobs": None,
                                "delta": {
                                    "role": None,
                                    "content": None,
                                    "function_call": {
                                        "name": tool_name,
                                        "arguments": chunk["choices"][0]["text"],
                                    },
                                    "tool_calls": [
                                        {
                                            "index": 0,
                                            "id": tool_id,
                                            "type": "function",
                                            "function": {
                                                "name": tool_name,
                                                "arguments": chunk["choices"][0][
                                                    "text"
                                                ],
                                            },
                                        }
                                    ],
                                },
                            }
                        ],
                    }
                if id_ is not None and created is not None and model is not None:
                    yield {
                        "id": id_,
                        "object": "chat.completion.chunk",
                        "created": created,
                        "model": model,
                        "choices": [
                            {
                                "index": 0,
                                "finish_reason": "tool_calls",
                                "logprobs": None,
                                "delta": {
                                    "role": None,
                                    "content": None,
                                    "function_call": None,
                                    "tool_calls": None,
                                },
                            }
                        ],
                    }
            return _stream_response_to_function_stream(chunks)
    # Case 2: Tool choice by user
    if isinstance(tool_choice, dict):
        tool_name = tool_choice["function"]["name"]
        tool = next(
            (tool for tool in tools if tool["function"]["name"] == tool_name), None
        )
        if tool is None:
            raise ValueError(f"Tool with name '{tool_name}' not found in tools")
        prompt = template_renderer.render(
            messages=messages,
            tools=tools,
            tool_calls=True,
        )
        prompt += f"functions.{tool_name}:\n"
        try:
            grammar = llama_grammar.LlamaGrammar.from_json_schema(
                json.dumps(tool["function"]["parameters"]), verbose=llama.verbose
            )
        except Exception as e:
            grammar = llama_grammar.LlamaGrammar.from_string(
                llama_grammar.JSON_GBNF, verbose=llama.verbose
            )
            if llama.verbose:
                print(
                    "Failed to parse function body as JSON schema, falling back to default grammar"
                )
                print(e)
        completion_or_chunks = llama.create_completion(
            prompt=prompt,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            min_p=min_p,
            typical_p=typical_p,
            stream=stream,
            stop=stop,
            max_tokens=max_tokens,
            presence_penalty=presence_penalty,
            frequency_penalty=frequency_penalty,
            repeat_penalty=repeat_penalty,
            tfs_z=tfs_z,
            mirostat_mode=mirostat_mode,
            mirostat_tau=mirostat_tau,
            mirostat_eta=mirostat_eta,
            model=model,
            logits_processor=logits_processor,
            grammar=grammar,
        )
        return _convert_completion_to_chat_function(
            tool_name, completion_or_chunks, stream
        )
    # Case 3: Automatic tool choice
    assert isinstance(tool_choice, str) and tool_choice == "auto"
    function_names = " | ".join(
        [f'''"functions.{tool['function']['name']}:"''' for tool in tools]
    )
    initial_gbnf_tool_grammar = (
        """root   ::= functions | "message:"\n"""
        f"""functions ::= {function_names}\n"""
    )
    follow_up_gbnf_tool_grammar = (
        """root   ::= functions | "<|im_end|>"\n"""
        f"""functions ::= {function_names}\n"""
    )
    prompt = template_renderer.render(
        messages=messages,
        tools=tools,
        tool_calls=True,
    )
    completion_or_chunks = llama.create_completion(
        prompt=prompt,
        temperature=0,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        typical_p=typical_p,
        stream=False,
        stop=[":"],
        max_tokens=None,
        presence_penalty=presence_penalty,
        frequency_penalty=frequency_penalty,
        repeat_penalty=repeat_penalty,
        tfs_z=tfs_z,
        mirostat_mode=mirostat_mode,
        mirostat_tau=mirostat_tau,
        mirostat_eta=mirostat_eta,
        model=model,
        logits_processor=logits_processor,
        grammar=llama_grammar.LlamaGrammar.from_string(
            initial_gbnf_tool_grammar, verbose=llama.verbose
        ),
    )
    completion: llama_types.CreateCompletionResponse = completion_or_chunks  # type: ignore
    text = completion["choices"][0]["text"]
    if "message" in text:
        return _convert_completion_to_chat(
            llama.create_completion(
                prompt=prompt + "message:\n",
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                min_p=min_p,
                typical_p=typical_p,
                stream=stream,
                stop=["<|im_end|>"],
                max_tokens=None,
                presence_penalty=presence_penalty,
                frequency_penalty=frequency_penalty,
                repeat_penalty=repeat_penalty,
                tfs_z=tfs_z,
                mirostat_mode=mirostat_mode,
                mirostat_tau=mirostat_tau,
                mirostat_eta=mirostat_eta,
                model=model,
                logits_processor=logits_processor,
                grammar=llama_grammar.LlamaGrammar.from_string(
                    follow_up_gbnf_tool_grammar, verbose=llama.verbose
                ),
            ),
            stream=stream,
        )
    # One or more function calls
    tool_name = text[len("functions.") :]
    tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None)
    if not stream:
        completions = []
        completions_tool_name = []
        while tool is not None:
            prompt += f"functions.{tool_name}:\n"
            try:
                grammar = llama_grammar.LlamaGrammar.from_json_schema(
                    json.dumps(tool["function"]["parameters"]), verbose=llama.verbose
                )
            except Exception as e:
                grammar = llama_grammar.LlamaGrammar.from_string(
                    llama_grammar.JSON_GBNF, verbose=llama.verbose
                )
                if llama.verbose:
                    print(
                        "Failed to parse function body as JSON schema, falling back to default grammar"
                    )
                    print(e)
            completion_or_chunks = llama.create_completion(
                prompt=prompt,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                min_p=min_p,
                typical_p=typical_p,
                stream=False,
                stop=stop,
                max_tokens=None,
                presence_penalty=presence_penalty,
                frequency_penalty=frequency_penalty,
                repeat_penalty=repeat_penalty,
                tfs_z=tfs_z,
                mirostat_mode=mirostat_mode,
                mirostat_tau=mirostat_tau,
                mirostat_eta=mirostat_eta,
                model=model,
                logits_processor=logits_processor,
                grammar=grammar,
            )
            completions.append(completion_or_chunks)
            completions_tool_name.append(tool_name)
            prompt += completion_or_chunks["choices"][0]["text"]
            prompt += "\n"
            response = llama.create_completion(
                prompt=prompt,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                min_p=min_p,
                typical_p=typical_p,
                stream=False,
                stop=stop,
                max_tokens=None,
                presence_penalty=presence_penalty,
                frequency_penalty=frequency_penalty,
                repeat_penalty=repeat_penalty,
                tfs_z=tfs_z,
                mirostat_mode=mirostat_mode,
                mirostat_tau=mirostat_tau,
                mirostat_eta=mirostat_eta,
                model=model,
                logits_processor=logits_processor,
                grammar=llama_grammar.LlamaGrammar.from_string(
                    follow_up_gbnf_tool_grammar, verbose=llama.verbose
                ),
            )
            tool_name = response["choices"][0]["text"][len("functions.") :]
            tool = next(
                (tool for tool in tools if tool["function"]["name"] == tool_name), None
            )
        # Merge completions
        function_call = { 
            "function_call": {
                "name": tool_name,
                "arguments": completions[0]["choices"][0]["text"],
            }
        } if len(completions) == 1 else {}
        return {
            "id": "chat" + completion["id"],
            "object": "chat.completion",
            "created": completion["created"],
            "model": completion["model"],
            "choices": [
                {
                    "finish_reason": "tool_calls",
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": None,
                        "tool_calls": [
                            {
                                "id": "call_"
                                + f"_{i}_"
                                + tool_name
                                + "_"
                                + completion["id"],
                                "type": "function",
                                "function": {
                                    "name": tool_name,
                                    "arguments": completion["choices"][0]["text"],
                                },
                            }
                            for i, (tool_name, completion) in enumerate(
                                zip(completions_tool_name, completions)
                            )
                        ],
                        **function_call
                    },
                }
            ],
            "usage": {
                "completion_tokens": sum(
                    completion["usage"]["completion_tokens"]
                    for completion in completions
                ),
                "prompt_tokens": sum(
                    completion["usage"]["prompt_tokens"] for completion in completions
                ),
                "total_tokens": sum(
                    completion["usage"]["total_tokens"] for completion in completions
                ),
            },
        }
    raise ValueError("Automatic streaming tool choice is not supported")
--- a/llama_cpp/llama_types.py
+++ b/llama_cpp/llama_types.py
@ -97,7 +97,7 @@ class CreateChatCompletionResponse(TypedDict):
 class ChatCompletionMessageToolCallChunkFunction(TypedDict):
-    name: str
+    name: Optional[str]
    arguments: str
@ -118,12 +118,12 @@ class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict):
 class ChatCompletionStreamResponseDelta(TypedDict):
-    content: NotRequired[str]
+    content: NotRequired[Optional[str]]
    function_call: NotRequired[
-        ChatCompletionStreamResponseDeltaFunctionCall
+        Optional[ChatCompletionStreamResponseDeltaFunctionCall]
    ]  # DEPRECATED
-    tool_calls: NotRequired[List[ChatCompletionMessageToolCallChunk]]
+    tool_calls: NotRequired[Optional[List[ChatCompletionMessageToolCallChunk]]]
-    role: NotRequired[Literal["system", "user", "assistant", "tool"]]
+    role: NotRequired[Optional[Literal["system", "user", "assistant", "tool"]]]
 class ChatCompletionStreamResponseChoice(TypedDict):
@ -132,6 +132,7 @@ class ChatCompletionStreamResponseChoice(TypedDict):
        ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty
    ]
    finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]]
    logprobs: NotRequired[Optional[CompletionLogprobs]]
 class CreateChatCompletionStreamResponse(TypedDict):