diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index 61027ef..63c81f1 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -16,7 +16,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
- submodules: "true"
+ submodules: "recursive"
# Used to host cibuildwheel
- uses: actions/setup-python@v3
@@ -48,7 +48,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
- submodules: "true"
+ submodules: "recursive"
- uses: actions/setup-python@v3
with:
python-version: "3.8"
diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml
index 27a6b1e..750b91e 100644
--- a/.github/workflows/build-docker.yaml
+++ b/.github/workflows/build-docker.yaml
@@ -14,7 +14,7 @@ jobs:
- name: Checkout
uses: actions/checkout@v3
with:
- submodules: "true"
+ submodules: "recursive"
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
diff --git a/.github/workflows/publish-to-test.yaml b/.github/workflows/publish-to-test.yaml
index 9932d61..47e7c40 100644
--- a/.github/workflows/publish-to-test.yaml
+++ b/.github/workflows/publish-to-test.yaml
@@ -18,7 +18,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
- submodules: "true"
+ submodules: "recursive"
- name: Set up Python
uses: actions/setup-python@v4
with:
diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml
index 7d6c970..1afdd66 100644
--- a/.github/workflows/publish.yaml
+++ b/.github/workflows/publish.yaml
@@ -12,7 +12,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
- submodules: "true"
+ submodules: "recursive"
- name: Set up Python
uses: actions/setup-python@v4
with:
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 2cc6fb0..77df546 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -19,7 +19,7 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
- submodules: "true"
+ submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
@@ -42,7 +42,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
- submodules: "true"
+ submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
@@ -65,7 +65,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
- submodules: "true"
+ submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
@@ -85,7 +85,7 @@ jobs:
# steps:
# - uses: actions/checkout@v3
# with:
- # submodules: "true"
+ # submodules: "recursive"
# - name: Set up Python 3.8
# uses: actions/setup-python@v4
# with:
@@ -112,7 +112,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
- submodules: "true"
+ submodules: "recursive"
- name: Set up Python 3.8
uses: actions/setup-python@v4
with:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9632210..39b553f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,11 +7,42 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
+## [0.2.43]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@8084d554406b767d36b3250b3b787462d5dd626f
+- feat: Support batch embeddings by @iamlemec in #1186
+- fix: submodule kompute is not included in sdist by @abetlen in 7dbbfdecadebe7750be650d9409959640ff9a460
+- fix: Update openbuddy prompt format by @abetlen in 07a783779a62a4aac0b11161c7e0eb983ff215f8
+
+## [0.2.42]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@ea9c8e11436ad50719987fa23a289c74b7b40d40
+- fix: sample idx off-by-one error for logit_processors by @lapp0 in #1179
+- fix: chat formatting bugs in `chatml-function-calling` by @abetlen in 4b0e3320bd8c2c209e29978d0b21e2e471cc9ee3 and 68fb71b6a26a1e57331868f959b47ab4b87851e1
+
+## [0.2.41]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@895407f31b358e3d9335e847d13f033491ec8a5b
+- fix: Don't change order of json schema object properties in generated grammar unless prop_order is passed by @abetlen in d1822fed6b706f38bd1ff0de4dec5baaa3cf84fa
+
+## [0.2.40]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@3bdc4cd0f595a6096cca4a64aa75ffa8a3503465
+- feat: Generic chatml Function Calling using `chat_format="chatml-function-calling"` by @abetlen in #957
+- fix: Circular dependency preventing early Llama object free by @notwa in #1176
+- docs: Set the correct command for compiling with SYCL support by @akarshanbiswas in #1172
+- feat: use gpu backend for clip if available by @iamlemec in #1175
+
+## [0.2.39]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@b08f22c882a1443e6b97081f3ce718a4d1a741f8
+- fix: Fix destructor logging bugs by using llama_log_callback to avoid suppress_stdout_stderr by @abetlen in 59760c85eddc72dfcc1839f43760ef72c23d6874
+
## [0.2.38]
- feat: Update llama.cpp to ggerganov/llama.cpp@1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915
- feat: Add speculative decoding by @abetlen in #1120
-- fix: Pass raise_exception and add_generation_prompt to jinja2 chat template 078cca0361bf5a94d2cf52ed04980d20e32d6f95
+- fix: Pass raise_exception and add_generation_prompt to jinja2 chat template by @abetlen in 078cca0361bf5a94d2cf52ed04980d20e32d6f95
## [0.2.37]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 795dad7..b4df8ef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,6 +46,14 @@ if (LLAMA_BUILD)
)
if (LLAVA_BUILD)
+ if (LLAMA_CUBLAS)
+ add_compile_definitions(GGML_USE_CUBLAS)
+ endif()
+
+ if (LLAMA_METAL)
+ add_compile_definitions(GGML_USE_METAL)
+ endif()
+
# Building llava
add_subdirectory(vendor/llama.cpp/examples/llava)
set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
diff --git a/Makefile b/Makefile
index ff1484c..e2ce4d0 100644
--- a/Makefile
+++ b/Makefile
@@ -19,10 +19,10 @@ build.opencl:
CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .
build.openblas:
- CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .
+ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" python3 -m pip install --verbose -e .
build.blis:
- CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" python3 -m pip install --verbose -e .
+ CMAKE_ARGS="-DLLAMA_BLAS=on -DLLAMA_BLAS_VENDOR=FLAME" python3 -m pip install --verbose -e .
build.metal:
CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install --verbose -e .
diff --git a/README.md b/README.md
index 4131bb3..3d8d4d4 100644
--- a/README.md
+++ b/README.md
@@ -118,7 +118,8 @@ CMAKE_ARGS="-DLLAMA_KOMPUTE=on" pip install llama-cpp-python
To install with SYCL support, set the `LLAMA_SYCL=on` environment variable before installing:
```bash
-CMAKE_ARGS="-DLLAMA_SYCL=on" pip install llama-cpp-python
+source /opt/intel/oneapi/setvars.sh
+CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pip install llama-cpp-python
```
### Windows Notes
@@ -291,14 +292,15 @@ To constrain the response to a specific JSON Schema, you can use the `schema` pr
### Function Calling
-The high-level API also provides a simple interface for function calling.
+The high-level API also provides a simple interface for function calling. This is possible through the chat format of the pre-trained `functionary` models or through the generic `chatml-function-calling` chat format.
-Note that the only model that supports full function calling at this time is "functionary".
-The gguf-converted files for this model can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF)
+The gguf-converted files for functionary can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF)
```python
>>> from llama_cpp import Llama
>>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", chat_format="functionary")
+>>> # or
+>>> llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling")
>>> llm.create_chat_completion(
messages = [
{
diff --git a/examples/notebooks/OpenHermesFunctionCalling.ipynb b/examples/notebooks/OpenHermesFunctionCalling.ipynb
new file mode 100644
index 0000000..c0de3fd
--- /dev/null
+++ b/examples/notebooks/OpenHermesFunctionCalling.ipynb
@@ -0,0 +1,910 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\n",
+ " \"name\": \"get_article_details\",\n",
+ " \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"title\": {\n",
+ " \"type\": \"str\"\n",
+ " },\n",
+ " \"authors\": {\n",
+ " \"type\": \"list[str]\"\n",
+ " },\n",
+ " \"short_summary\": {\n",
+ " \"type\": \"str\"\n",
+ " },\n",
+ " \"date_published\": {\n",
+ " \"type\": \"str\"\n",
+ " },\n",
+ " \"tags\": {\n",
+ " \"type\": \"list[str]\"\n",
+ " }\n",
+ " }\n",
+ " },\n",
+ " \"returns\": \"Article\"\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "import json\n",
+ "import inspect\n",
+ "from typing import get_type_hints\n",
+ "\n",
+ "class Article:\n",
+ " pass\n",
+ "\n",
+ "class Weather:\n",
+ " pass\n",
+ "\n",
+ "class Directions:\n",
+ " pass\n",
+ "\n",
+ "def calculate_mortgage_payment(loan_amount: int, interest_rate: float, loan_term: int) -> float:\n",
+ " \"\"\"Get the monthly mortgage payment given an interest rate percentage.\"\"\"\n",
+ " \n",
+ " # TODO: you must implement this to actually call it later\n",
+ " pass\n",
+ "\n",
+ "def get_article_details(title: str, authors: list[str], short_summary: str, date_published: str, tags: list[str]) -> Article:\n",
+ " '''Get article details from unstructured article text.\n",
+ "date_published: formatted as \"MM/DD/YYYY\"'''\n",
+ " \n",
+ " # TODO: you must implement this to actually call it later\n",
+ " pass\n",
+ "\n",
+ "def get_weather(zip_code: str) -> Weather:\n",
+ " \"\"\"Get the current weather given a zip code.\"\"\"\n",
+ " \n",
+ " # TODO: you must implement this to actually call it later\n",
+ " pass\n",
+ "\n",
+ "def get_directions(start: str, destination: str) -> Directions:\n",
+ " \"\"\"Get directions from Google Directions API.\n",
+ "start: start address as a string including zipcode (if any)\n",
+ "destination: end address as a string including zipcode (if any)\"\"\"\n",
+ " \n",
+ " # TODO: you must implement this to actually call it later\n",
+ " pass\n",
+ "\n",
+ "def get_type_name(t):\n",
+ " name = str(t)\n",
+ " if \"list\" in name or \"dict\" in name:\n",
+ " return name\n",
+ " else:\n",
+ " return t.__name__\n",
+ "\n",
+ "def serialize_function_to_json(func):\n",
+ " signature = inspect.signature(func)\n",
+ " type_hints = get_type_hints(func)\n",
+ "\n",
+ " function_info = {\n",
+ " \"name\": func.__name__,\n",
+ " \"description\": func.__doc__,\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {}\n",
+ " },\n",
+ " \"returns\": type_hints.get('return', 'void').__name__\n",
+ " }\n",
+ "\n",
+ " for name, _ in signature.parameters.items():\n",
+ " param_type = get_type_name(type_hints.get(name, type(None)))\n",
+ " function_info[\"parameters\"][\"properties\"][name] = {\"type\": param_type}\n",
+ "\n",
+ " return json.dumps(function_info, indent=2)\n",
+ "\n",
+ "print(serialize_function_to_json(get_article_details))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import xml.etree.ElementTree as ET\n",
+ "import re\n",
+ "\n",
+ "def extract_function_calls(completion):\n",
+ " completion = completion.strip()\n",
+ " pattern = r\"((.*?))\"\n",
+ " match = re.search(pattern, completion, re.DOTALL)\n",
+ " if not match:\n",
+ " return None\n",
+ " \n",
+ " multiplefn = match.group(1)\n",
+ " root = ET.fromstring(multiplefn)\n",
+ " functions = root.findall(\"functioncall\")\n",
+ " return [json.loads(fn.text) for fn in functions]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def generate_hermes_prompt(prompt, functions):\n",
+ " functions = \"\\n\\n\".join([serialize_function_to_json(fn) for fn in functions])\n",
+ " prompt = f\"\"\"<|im_start|>system\n",
+ "You are a helpful assistant with access to the following functions:\n",
+ "\n",
+ "{functions}\n",
+ "\n",
+ "To use these functions respond with:\n",
+ "\n",
+ " {{\"name\": \"function_name\", \"arguments\": {{\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}}}} \n",
+ " {{\"name\": \"function_name\", \"arguments\": {{\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}}}} \n",
+ " ...\n",
+ "\n",
+ "\n",
+ "Edge cases you must handle:\n",
+ "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
+ "<|im_start|>user\n",
+ "{prompt}<|im_end|>\n",
+ "<|im_start|>assistant\"\"\"\n",
+ " return prompt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<|im_start|>system\n",
+ "You are a helpful assistant with access to the following functions:\n",
+ "\n",
+ "{\n",
+ " \"name\": \"get_weather\",\n",
+ " \"description\": \"Get the current weather given a zip code.\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"zip_code\": {\n",
+ " \"type\": \"str\"\n",
+ " }\n",
+ " }\n",
+ " },\n",
+ " \"returns\": \"Weather\"\n",
+ "}\n",
+ "\n",
+ "{\n",
+ " \"name\": \"calculate_mortgage_payment\",\n",
+ " \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"loan_amount\": {\n",
+ " \"type\": \"int\"\n",
+ " },\n",
+ " \"interest_rate\": {\n",
+ " \"type\": \"float\"\n",
+ " },\n",
+ " \"loan_term\": {\n",
+ " \"type\": \"int\"\n",
+ " }\n",
+ " }\n",
+ " },\n",
+ " \"returns\": \"float\"\n",
+ "}\n",
+ "\n",
+ "{\n",
+ " \"name\": \"get_article_details\",\n",
+ " \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"title\": {\n",
+ " \"type\": \"str\"\n",
+ " },\n",
+ " \"authors\": {\n",
+ " \"type\": \"list[str]\"\n",
+ " },\n",
+ " \"short_summary\": {\n",
+ " \"type\": \"str\"\n",
+ " },\n",
+ " \"date_published\": {\n",
+ " \"type\": \"str\"\n",
+ " },\n",
+ " \"tags\": {\n",
+ " \"type\": \"list[str]\"\n",
+ " }\n",
+ " }\n",
+ " },\n",
+ " \"returns\": \"Article\"\n",
+ "}\n",
+ "\n",
+ "To use these functions respond with:\n",
+ "\n",
+ " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n",
+ " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n",
+ " ...\n",
+ "\n",
+ "\n",
+ "Edge cases you must handle:\n",
+ "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
+ "<|im_start|>user\n",
+ "What's the weather in 10001?<|im_end|>\n",
+ "<|im_start|>assistant\n",
+ "<|im_start|>system\n",
+ "You are a helpful assistant with access to the following functions:\n",
+ "\n",
+ "{\n",
+ " \"name\": \"get_weather\",\n",
+ " \"description\": \"Get the current weather given a zip code.\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"zip_code\": {\n",
+ " \"type\": \"str\"\n",
+ " }\n",
+ " }\n",
+ " },\n",
+ " \"returns\": \"Weather\"\n",
+ "}\n",
+ "\n",
+ "{\n",
+ " \"name\": \"calculate_mortgage_payment\",\n",
+ " \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"loan_amount\": {\n",
+ " \"type\": \"int\"\n",
+ " },\n",
+ " \"interest_rate\": {\n",
+ " \"type\": \"float\"\n",
+ " },\n",
+ " \"loan_term\": {\n",
+ " \"type\": \"int\"\n",
+ " }\n",
+ " }\n",
+ " },\n",
+ " \"returns\": \"float\"\n",
+ "}\n",
+ "\n",
+ "{\n",
+ " \"name\": \"get_article_details\",\n",
+ " \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"title\": {\n",
+ " \"type\": \"str\"\n",
+ " },\n",
+ " \"authors\": {\n",
+ " \"type\": \"list[str]\"\n",
+ " },\n",
+ " \"short_summary\": {\n",
+ " \"type\": \"str\"\n",
+ " },\n",
+ " \"date_published\": {\n",
+ " \"type\": \"str\"\n",
+ " },\n",
+ " \"tags\": {\n",
+ " \"type\": \"list[str]\"\n",
+ " }\n",
+ " }\n",
+ " },\n",
+ " \"returns\": \"Article\"\n",
+ "}\n",
+ "\n",
+ "To use these functions respond with:\n",
+ "\n",
+ " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n",
+ " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n",
+ " ...\n",
+ "\n",
+ "\n",
+ "Edge cases you must handle:\n",
+ "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
+ "<|im_start|>user\n",
+ "Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.<|im_end|>\n",
+ "<|im_start|>assistant\n",
+ "<|im_start|>system\n",
+ "You are a helpful assistant with access to the following functions:\n",
+ "\n",
+ "{\n",
+ " \"name\": \"get_weather\",\n",
+ " \"description\": \"Get the current weather given a zip code.\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"zip_code\": {\n",
+ " \"type\": \"str\"\n",
+ " }\n",
+ " }\n",
+ " },\n",
+ " \"returns\": \"Weather\"\n",
+ "}\n",
+ "\n",
+ "{\n",
+ " \"name\": \"calculate_mortgage_payment\",\n",
+ " \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"loan_amount\": {\n",
+ " \"type\": \"int\"\n",
+ " },\n",
+ " \"interest_rate\": {\n",
+ " \"type\": \"float\"\n",
+ " },\n",
+ " \"loan_term\": {\n",
+ " \"type\": \"int\"\n",
+ " }\n",
+ " }\n",
+ " },\n",
+ " \"returns\": \"float\"\n",
+ "}\n",
+ "\n",
+ "{\n",
+ " \"name\": \"get_article_details\",\n",
+ " \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"title\": {\n",
+ " \"type\": \"str\"\n",
+ " },\n",
+ " \"authors\": {\n",
+ " \"type\": \"list[str]\"\n",
+ " },\n",
+ " \"short_summary\": {\n",
+ " \"type\": \"str\"\n",
+ " },\n",
+ " \"date_published\": {\n",
+ " \"type\": \"str\"\n",
+ " },\n",
+ " \"tags\": {\n",
+ " \"type\": \"list[str]\"\n",
+ " }\n",
+ " }\n",
+ " },\n",
+ " \"returns\": \"Article\"\n",
+ "}\n",
+ "\n",
+ "To use these functions respond with:\n",
+ "\n",
+ " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n",
+ " {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} \n",
+ " ...\n",
+ "\n",
+ "\n",
+ "Edge cases you must handle:\n",
+ "- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
+ "<|im_start|>user\n",
+ "What's the current exchange rate for USD to EUR?<|im_end|>\n",
+ "<|im_start|>assistant\n"
+ ]
+ }
+ ],
+ "source": [
+ "prompts = [\n",
+ " \"What's the weather in 10001?\",\n",
+ " \"Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.\",\n",
+ " \"What's the current exchange rate for USD to EUR?\"\n",
+ "]\n",
+ "functions = [get_weather, calculate_mortgage_payment, get_article_details]\n",
+ "\n",
+ "for prompt in prompts:\n",
+ " print(generate_hermes_prompt(prompt, functions))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "ggml_init_cublas: GGML_CUDA_FORCE_MMQ: no\n",
+ "ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes\n",
+ "ggml_init_cublas: found 1 CUDA devices:\n",
+ " Device 0: NVIDIA GeForce RTX 2060, compute capability 7.5\n",
+ "llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from ../../models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf (version GGUF V3 (latest))\n",
+ "llama_model_loader: - tensor 0: token_embd.weight q4_K [ 4096, 32002, 1, 1 ]\n",
+ "llama_model_loader: - tensor 1: blk.0.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 2: blk.0.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 3: blk.0.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 4: blk.0.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 5: blk.0.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 6: blk.0.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 7: blk.0.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 8: blk.0.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 9: blk.0.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 10: blk.1.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 11: blk.1.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 12: blk.1.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 13: blk.1.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 14: blk.1.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 15: blk.1.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 16: blk.1.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 17: blk.1.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 18: blk.1.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 19: blk.2.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 20: blk.2.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 21: blk.2.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 22: blk.2.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 23: blk.2.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 24: blk.2.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 25: blk.2.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 26: blk.2.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 27: blk.2.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 28: blk.3.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 29: blk.3.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 30: blk.3.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 31: blk.3.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 32: blk.3.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 33: blk.3.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 34: blk.3.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 35: blk.3.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 36: blk.3.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 37: blk.4.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 38: blk.4.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 39: blk.4.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 40: blk.4.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 41: blk.4.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 42: blk.4.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 43: blk.4.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 44: blk.4.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 45: blk.4.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 46: blk.5.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 47: blk.5.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 48: blk.5.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 49: blk.5.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 50: blk.5.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 51: blk.5.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 52: blk.5.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 53: blk.5.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 54: blk.5.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 55: blk.6.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 56: blk.6.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 57: blk.6.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 58: blk.6.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 59: blk.6.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 60: blk.6.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 61: blk.6.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 62: blk.6.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 63: blk.6.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 64: blk.7.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 65: blk.7.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 66: blk.7.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 67: blk.7.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 68: blk.7.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 69: blk.7.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 70: blk.7.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 71: blk.7.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 72: blk.7.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 73: blk.8.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 74: blk.8.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 75: blk.8.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 76: blk.8.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 77: blk.8.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 78: blk.8.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 79: blk.8.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 80: blk.8.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 81: blk.8.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 82: blk.9.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 83: blk.9.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 84: blk.9.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 85: blk.9.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 86: blk.9.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 87: blk.9.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 88: blk.9.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 89: blk.9.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 90: blk.9.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 91: blk.10.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 92: blk.10.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 93: blk.10.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 94: blk.10.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 95: blk.10.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 96: blk.10.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 97: blk.10.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 98: blk.10.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 99: blk.10.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 100: blk.11.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 101: blk.11.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 102: blk.11.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 103: blk.11.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 104: blk.11.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 105: blk.11.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 106: blk.11.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 107: blk.11.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 108: blk.11.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 109: blk.12.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 110: blk.12.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 111: blk.12.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 112: blk.12.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 113: blk.12.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 114: blk.12.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 115: blk.12.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 116: blk.12.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 117: blk.12.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 118: blk.13.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 119: blk.13.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 120: blk.13.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 121: blk.13.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 122: blk.13.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 123: blk.13.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 124: blk.13.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 125: blk.13.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 126: blk.13.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 127: blk.14.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 128: blk.14.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 129: blk.14.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 130: blk.14.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 131: blk.14.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 132: blk.14.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 133: blk.14.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 134: blk.14.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 135: blk.14.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 136: blk.15.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 137: blk.15.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 138: blk.15.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 139: blk.15.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 140: blk.15.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 141: blk.15.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 142: blk.15.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 143: blk.15.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 144: blk.15.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 145: blk.16.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 146: blk.16.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 147: blk.16.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 148: blk.16.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 149: blk.16.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 150: blk.16.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 151: blk.16.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 152: blk.16.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 153: blk.16.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 154: blk.17.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 155: blk.17.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 156: blk.17.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 157: blk.17.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 158: blk.17.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 159: blk.17.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 160: blk.17.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 161: blk.17.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 162: blk.17.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 163: blk.18.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 164: blk.18.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 165: blk.18.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 166: blk.18.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 167: blk.18.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 168: blk.18.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 169: blk.18.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 170: blk.18.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 171: blk.18.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 172: blk.19.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 173: blk.19.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 174: blk.19.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 175: blk.19.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 176: blk.19.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 177: blk.19.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 178: blk.19.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 179: blk.19.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 180: blk.19.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 181: blk.20.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 182: blk.20.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 183: blk.20.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 184: blk.20.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 185: blk.20.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 186: blk.20.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 187: blk.20.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 188: blk.20.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 189: blk.20.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 190: blk.21.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 191: blk.21.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 192: blk.21.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 193: blk.21.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 194: blk.21.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 195: blk.21.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 196: blk.21.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 197: blk.21.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 198: blk.21.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 199: blk.22.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 200: blk.22.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 201: blk.22.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 202: blk.22.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 203: blk.22.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 204: blk.22.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 205: blk.22.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 206: blk.22.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 207: blk.22.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 208: blk.23.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 209: blk.23.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 210: blk.23.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 211: blk.23.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 212: blk.23.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 213: blk.23.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 214: blk.23.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 215: blk.23.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 216: blk.23.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 217: blk.24.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 218: blk.24.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 219: blk.24.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 220: blk.24.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 221: blk.24.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 222: blk.24.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 223: blk.24.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 224: blk.24.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 225: blk.24.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 226: blk.25.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 227: blk.25.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 228: blk.25.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 229: blk.25.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 230: blk.25.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 231: blk.25.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 232: blk.25.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 233: blk.25.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 234: blk.25.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 235: blk.26.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 236: blk.26.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 237: blk.26.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 238: blk.26.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 239: blk.26.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 240: blk.26.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 241: blk.26.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 242: blk.26.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 243: blk.26.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 244: blk.27.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 245: blk.27.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 246: blk.27.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 247: blk.27.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 248: blk.27.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 249: blk.27.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 250: blk.27.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 251: blk.27.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 252: blk.27.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 253: blk.28.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 254: blk.28.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 255: blk.28.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 256: blk.28.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 257: blk.28.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 258: blk.28.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 259: blk.28.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 260: blk.28.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 261: blk.28.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 262: blk.29.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 263: blk.29.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 264: blk.29.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 265: blk.29.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 266: blk.29.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 267: blk.29.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 268: blk.29.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 269: blk.29.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 270: blk.29.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 271: blk.30.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 272: blk.30.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 273: blk.30.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 274: blk.30.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 275: blk.30.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 276: blk.30.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 277: blk.30.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 278: blk.30.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 279: blk.30.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 280: blk.31.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 281: blk.31.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 282: blk.31.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
+ "llama_model_loader: - tensor 283: blk.31.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 284: blk.31.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 285: blk.31.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
+ "llama_model_loader: - tensor 286: blk.31.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
+ "llama_model_loader: - tensor 287: blk.31.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 288: blk.31.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 289: output_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
+ "llama_model_loader: - tensor 290: output.weight q6_K [ 4096, 32002, 1, 1 ]\n",
+ "llama_model_loader: - kv 0: general.architecture str = llama\n",
+ "llama_model_loader: - kv 1: general.name str = teknium_openhermes-2.5-mistral-7b\n",
+ "llama_model_loader: - kv 2: llama.context_length u32 = 32768\n",
+ "llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n",
+ "llama_model_loader: - kv 4: llama.block_count u32 = 32\n",
+ "llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336\n",
+ "llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n",
+ "llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n",
+ "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8\n",
+ "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n",
+ "llama_model_loader: - kv 10: llama.rope.freq_base f32 = 10000.000000\n",
+ "llama_model_loader: - kv 11: general.file_type u32 = 15\n",
+ "llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n",
+ "llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32002] = [\"\", \"\", \"\", \"<0x00>\", \"<...\n",
+ "llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32002] = [0.000000, 0.000000, 0.000000, 0.0000...\n",
+ "llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32002] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n",
+ "llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n",
+ "llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 32000\n",
+ "llama_model_loader: - kv 18: tokenizer.ggml.padding_token_id u32 = 0\n",
+ "llama_model_loader: - kv 19: general.quantization_version u32 = 2\n",
+ "llama_model_loader: - type f32: 65 tensors\n",
+ "llama_model_loader: - type q4_K: 193 tensors\n",
+ "llama_model_loader: - type q6_K: 33 tensors\n",
+ "llm_load_vocab: special tokens definition check successful ( 261/32002 ).\n",
+ "llm_load_print_meta: format = GGUF V3 (latest)\n",
+ "llm_load_print_meta: arch = llama\n",
+ "llm_load_print_meta: vocab type = SPM\n",
+ "llm_load_print_meta: n_vocab = 32002\n",
+ "llm_load_print_meta: n_merges = 0\n",
+ "llm_load_print_meta: n_ctx_train = 32768\n",
+ "llm_load_print_meta: n_embd = 4096\n",
+ "llm_load_print_meta: n_head = 32\n",
+ "llm_load_print_meta: n_head_kv = 8\n",
+ "llm_load_print_meta: n_layer = 32\n",
+ "llm_load_print_meta: n_rot = 128\n",
+ "llm_load_print_meta: n_gqa = 4\n",
+ "llm_load_print_meta: f_norm_eps = 0.0e+00\n",
+ "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n",
+ "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
+ "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
+ "llm_load_print_meta: n_ff = 14336\n",
+ "llm_load_print_meta: rope scaling = linear\n",
+ "llm_load_print_meta: freq_base_train = 10000.0\n",
+ "llm_load_print_meta: freq_scale_train = 1\n",
+ "llm_load_print_meta: n_yarn_orig_ctx = 32768\n",
+ "llm_load_print_meta: rope_finetuned = unknown\n",
+ "llm_load_print_meta: model type = 7B\n",
+ "llm_load_print_meta: model ftype = mostly Q4_K - Medium\n",
+ "llm_load_print_meta: model params = 7.24 B\n",
+ "llm_load_print_meta: model size = 4.07 GiB (4.83 BPW) \n",
+ "llm_load_print_meta: general.name = teknium_openhermes-2.5-mistral-7b\n",
+ "llm_load_print_meta: BOS token = 1 ''\n",
+ "llm_load_print_meta: EOS token = 32000 '<|im_end|>'\n",
+ "llm_load_print_meta: UNK token = 0 ''\n",
+ "llm_load_print_meta: PAD token = 0 ''\n",
+ "llm_load_print_meta: LF token = 13 '<0x0A>'\n",
+ "llm_load_tensors: ggml ctx size = 0.11 MiB\n",
+ "llm_load_tensors: using CUDA for GPU acceleration\n",
+ "llm_load_tensors: mem required = 70.42 MiB\n",
+ "llm_load_tensors: offloading 32 repeating layers to GPU\n",
+ "llm_load_tensors: offloading non-repeating layers to GPU\n",
+ "llm_load_tensors: offloaded 35/35 layers to GPU\n",
+ "llm_load_tensors: VRAM used: 4095.06 MiB\n",
+ "...............................................................................................\n",
+ "llama_new_context_with_model: n_ctx = 2048\n",
+ "llama_new_context_with_model: freq_base = 10000.0\n",
+ "llama_new_context_with_model: freq_scale = 1\n",
+ "llama_kv_cache_init: offloading v cache to GPU\n",
+ "llama_kv_cache_init: offloading k cache to GPU\n",
+ "llama_kv_cache_init: VRAM kv self = 256.00 MiB\n",
+ "llama_new_context_with_model: kv self size = 256.00 MiB\n",
+ "llama_build_graph: non-view tensors processed: 740/740\n",
+ "llama_new_context_with_model: compute buffer total size = 159.07 MiB\n",
+ "llama_new_context_with_model: VRAM scratch buffer: 156.00 MiB\n",
+ "llama_new_context_with_model: total VRAM used: 4507.07 MiB (model: 4095.06 MiB, context: 412.00 MiB)\n"
+ ]
+ }
+ ],
+ "source": [
+ "import llama_cpp\n",
+ "\n",
+ "llama = llama_cpp.Llama(model_path=\"../../models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf\", n_gpu_layers=-1, n_ctx=2048, verbose=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[{'name': 'get_weather', 'arguments': {'zip_code': '10001'}}]\n",
+ "====================================================================================================\n",
+ "[{'name': 'calculate_mortgage_payment', 'arguments': {'loan_amount': 200000, 'interest_rate': 0.04, 'loan_term': 30}}]\n",
+ "====================================================================================================\n",
+ "Unfortunately, I do not have a built-in function to check currency exchange rates. However, you can use third-party APIs or websites like Google Finance or XE to get this information.\n",
+ "====================================================================================================\n"
+ ]
+ }
+ ],
+ "source": [
+ "prompts = [\n",
+ " \"What's the weather in 10001?\",\n",
+ " \"Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.\",\n",
+ " \"What's the current exchange rate for USD to EUR?\"\n",
+ "]\n",
+ "functions = [get_weather, calculate_mortgage_payment, get_article_details]\n",
+ "\n",
+ "for prompt in prompts:\n",
+ " prompt = generate_hermes_prompt(prompt, functions)\n",
+ " completion = llama.create_completion(prompt, max_tokens=-1)[\"choices\"][0][\"text\"]\n",
+ " function_calls = extract_function_calls(completion)\n",
+ " if function_calls:\n",
+ " print(function_calls)\n",
+ " else:\n",
+ " print(completion.strip())\n",
+ " print(\"=\"*100)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "get_weather\n",
+ "{'zip_code': '05751'}\n",
+ "====================================================================================================\n",
+ "get_weather\n",
+ "{'zip_code': '05751'}\n",
+ "get_weather\n",
+ "{'zip_code': '07030'}\n",
+ "calculate_mortgage_payment\n",
+ "{'loan_amount': 250000, 'interest_rate': 4.18, 'loan_term': 30}\n",
+ "====================================================================================================\n",
+ "I don't have a function to get exchange rates, but I can provide some resources where you can find this information. You can check websites like Google Finance, XE.com, or Yahoo Finance for up-to-date currency exchange rates.\n",
+ "====================================================================================================\n"
+ ]
+ }
+ ],
+ "source": [
+ "prompts = [\n",
+ " \"What's the weather in 05751?\",\n",
+ " \"I'm planning a trip to Killington, Vermont (05751) from Hoboken, NJ (07030). Can you get me weather for both locations and directions?\",\n",
+ " \"What's the current exchange rate for USD to EUR?\"\n",
+ "]\n",
+ "\n",
+ "for prompt in prompts:\n",
+ " completion = llama.create_completion(generate_hermes_prompt(prompt, functions), max_tokens=-1)[\"choices\"][0][\"text\"]\n",
+ " function_calls = extract_function_calls(completion)\n",
+ "\n",
+ " if function_calls:\n",
+ " for function in function_calls:\n",
+ " print(function[\"name\"])\n",
+ " print(function[\"arguments\"])\n",
+ " else:\n",
+ " print(completion.strip())\n",
+ "\n",
+ " print(\"=\"*100)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.5+"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
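
As an aside, not part of the diff: the manual prompt construction in the notebook above can also be expressed through the high-level API's generic `chatml-function-calling` chat format introduced in this release. A minimal sketch; the model path is a placeholder and the tool schema follows the OpenAI-style convention used in the README example, so treat the details as assumptions rather than the notebook's method:

```python
from llama_cpp import Llama

# Placeholder path: any chatml-style model such as an OpenHermes-2.5-Mistral-7B GGUF.
llm = Llama(
    model_path="path/to/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
    chat_format="chatml-function-calling",
    n_gpu_layers=-1,
    n_ctx=2048,
)

response = llm.create_chat_completion(
    messages=[{"role": "user", "content": "What's the weather in 10001?"}],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather given a zip code.",
                "parameters": {
                    "type": "object",
                    "properties": {"zip_code": {"type": "string"}},
                    "required": ["zip_code"],
                },
            },
        }
    ],
)
print(response["choices"][0]["message"])  # may contain tool_calls for get_weather
```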
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 94cd401..e0bd254 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *
-__version__ = "0.2.38"
\ No newline at end of file
+__version__ = "0.2.43"
\ No newline at end of file
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index 3a71ef0..c60fdff 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -42,6 +42,8 @@ class _LlamaModel:
self._llama_free_model = llama_cpp._lib.llama_free_model # type: ignore
+ self.model = None
+
if not os.path.exists(path_model):
raise ValueError(f"Model path does not exist: {path_model}")
@@ -248,6 +250,7 @@ class _LlamaContext:
self.verbose = verbose
self._llama_free = llama_cpp._lib.llama_free # type: ignore
+ self.ctx = None
assert self.model.model is not None
@@ -497,6 +500,7 @@ class _LlamaBatch:
self._llama_batch_free = llama_cpp._lib.llama_batch_free # type: ignore
+ self.batch = None
self.batch = llama_cpp.llama_batch_init(
self.n_tokens, self.embd, self.n_seq_max
)
@@ -506,6 +510,14 @@ class _LlamaBatch:
self._llama_batch_free(self.batch)
self.batch = None
+ def n_tokens(self) -> int:
+ assert self.batch is not None
+ return self.batch.n_tokens
+
+ def reset(self):
+ assert self.batch is not None
+ self.batch.n_tokens = 0
+
def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool):
assert self.batch is not None
n_tokens = len(batch)
@@ -518,6 +530,20 @@ class _LlamaBatch:
self.batch.logits[i] = logits_all
self.batch.logits[n_tokens - 1] = True
+ def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool):
+ assert self.batch is not None
+ n_tokens = len(batch)
+ n_tokens0 = self.batch.n_tokens
+ self.batch.n_tokens += n_tokens
+ for i in range(n_tokens):
+ j = n_tokens0 + i
+ self.batch.token[j] = batch[i]
+ self.batch.pos[j] = i
+ self.batch.seq_id[j][0] = seq_id
+ self.batch.n_seq_id[j] = 1
+ self.batch.logits[j] = logits_all
+ self.batch.logits[n_tokens - 1] = True
+
class _LlamaTokenDataArray:
def __init__(self, *, n_vocab: int):
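For orientation, here is a minimal sketch of how the new `_LlamaBatch` helpers fit together: `reset()` empties the packed batch, `add_sequence()` appends one tokenized input under its own sequence id, and a single `decode()` then processes everything. The `llama` object, the `texts` list, and the direct use of the private `_batch`/`_ctx` attributes are illustrative assumptions, not public API.

```python
# Illustrative only: pack several inputs into one llama_batch via the
# private helpers introduced in this diff (assumed attribute names).
texts = ["first input", "second input"]

llama._batch.reset()  # start from an empty batch
for seq_id, text in enumerate(texts):
    tokens = llama.tokenize(text.encode("utf-8"))
    # each input becomes its own sequence inside the shared batch
    llama._batch.add_sequence(tokens, seq_id, False)

llama._ctx.decode(llama._batch)  # one decode call covers all sequences
```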
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 85943db..3e09a20 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -19,6 +19,8 @@ from collections import deque
import ctypes
+from llama_cpp.llama_types import List
+
from .llama_types import *
from .llama_grammar import LlamaGrammar
from .llama_cache import (
@@ -27,6 +29,10 @@ from .llama_cache import (
LlamaDiskCache, # type: ignore
LlamaRAMCache, # type: ignore
)
+from .llama_tokenizer import (
+ BaseLlamaTokenizer,
+ LlamaTokenizer
+)
import llama_cpp.llama_cpp as llama_cpp
import llama_cpp.llama_chat_format as llama_chat_format
@@ -44,6 +50,9 @@ from ._internals import (
_LlamaSamplingContext, # type: ignore
)
from ._logger import set_verbose
+from ._utils import (
+ suppress_stdout_stderr
+)
class Llama:
@@ -95,6 +104,8 @@ class Llama:
chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
# Speculative Decoding
draft_model: Optional[LlamaDraftModel] = None,
+ # Tokenizer Override
+ tokenizer: Optional[BaseLlamaTokenizer] = None,
# Misc
verbose: bool = True,
# Extra Params
@@ -159,6 +170,7 @@ class Llama:
chat_format: String specifying the chat format to use when calling create_chat_completion.
chat_handler: Optional chat handler to use when calling create_chat_completion.
draft_model: Optional draft model to use for speculative decoding.
+ tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp.
verbose: Print verbose output to stderr.
Raises:
@@ -173,7 +185,8 @@ class Llama:
self.numa = numa
if not Llama.__backend_initialized:
- llama_cpp.llama_backend_init(self.numa)
+ with suppress_stdout_stderr(disable=verbose):
+ llama_cpp.llama_backend_init(self.numa)
Llama.__backend_initialized = True
self.model_path = model_path
@@ -235,6 +248,7 @@ class Llama:
self.n_threads_batch = n_threads_batch or max(
multiprocessing.cpu_count() // 2, 1
)
+
# Context Params
self.context_params = llama_cpp.llama_context_default_params()
self.context_params.seed = seed
@@ -267,7 +281,7 @@ class Llama:
)
self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
self.context_params.mul_mat_q = mul_mat_q
- self.context_params.logits_all = logits_all
+ self.context_params.logits_all = logits_all if draft_model is None else True # Must be set to True for speculative decoding
self.context_params.embedding = embedding
self.context_params.offload_kqv = offload_kqv
@@ -286,6 +300,10 @@ class Llama:
self._model = _LlamaModel(
path_model=self.model_path, params=self.model_params, verbose=self.verbose
)
+
+ # Override tokenizer
+ self.tokenizer_ = tokenizer or LlamaTokenizer(self)
+
# Set the default value for the context and correct the batch
if n_ctx == 0:
n_ctx = self._model.n_ctx_train()
@@ -431,18 +449,19 @@ class Llama:
Returns:
A list of tokens.
"""
- return self._model.tokenize(text, add_bos, special)
+ return self.tokenizer_.tokenize(text, add_bos, special)
- def detokenize(self, tokens: List[int]) -> bytes:
+ def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes:
"""Detokenize a list of tokens.
Args:
tokens: The list of tokens to detokenize.
+ prev_tokens: The list of previous tokens. Offset mapping will be performed if provided
Returns:
The detokenized string.
"""
- return self._model.detokenize(tokens)
+ return self.tokenizer_.detokenize(tokens, prev_tokens)
def set_cache(self, cache: Optional[BaseLlamaCache]):
"""Set the cache.
@@ -538,7 +557,7 @@ class Llama:
logits[:] = (
logits_processor(self._input_ids, logits)
if idx is None
- else logits_processor(self._input_ids[:idx], logits)
+ else logits_processor(self._input_ids[:idx + 1], logits)
)
sampling_params = _LlamaSamplingParams(
@@ -698,10 +717,53 @@ class Llama:
Returns:
An embedding object.
"""
- assert self._ctx.ctx is not None
assert self._model.model is not None
model_name: str = model if model is not None else self.model_path
+ # get numeric embeddings
+ embeds: List[List[float]]
+ total_tokens: int
+ embeds, total_tokens = self.embed(input, return_count=True) # type: ignore
+
+ # convert to CreateEmbeddingResponse
+ data: List[Embedding] = [
+ {
+ "object": "embedding",
+ "embedding": emb,
+ "index": idx,
+ }
+ for idx, emb in enumerate(embeds)
+ ]
+
+ return {
+ "object": "list",
+ "data": data,
+ "model": model_name,
+ "usage": {
+ "prompt_tokens": total_tokens,
+ "total_tokens": total_tokens,
+ },
+ }
+
+ def embed(
+ self,
+ input: Union[str, List[str]],
+ normalize: bool = True,
+ truncate: bool = True,
+ return_count: bool = False,
+ ):
+ """Embed a string.
+
+ Args:
+ input: The utf-8 encoded string to embed.
+
+ Returns:
+ A list of embeddings
+ """
+ assert self._ctx.ctx is not None
+ n_embd = self.n_embd()
+ n_ctx = self.n_ctx()
+
if self.context_params.embedding == False:
raise RuntimeError(
"Llama model must be created with embedding=True to call this method"
@@ -715,48 +777,72 @@ class Llama:
else:
inputs = input
- data: List[Embedding] = []
+ # reset batch
+ self._batch.reset()
+
+ # decode and fetch embeddings
+ data: List[List[float]] = []
+ def decode_batch(sizes: List[int]):
+ assert self._ctx.ctx is not None
+ llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
+ self._ctx.decode(self._batch)
+ self._batch.reset()
+
+ # store embeddings
+ for i, s in enumerate(sizes):
+ embedding = llama_cpp.llama_get_embeddings_ith(self._ctx.ctx, i)[
+ :n_embd
+ ]
+ norm = np.linalg.norm(embedding) if normalize else s
+ embedding: List[float] = [v / float(norm) for v in embedding]
+ data.append(embedding)
+
+ # init state
total_tokens = 0
- for index, input in enumerate(inputs):
- tokens = self.tokenize(input.encode("utf-8"), special=True)
- self.reset()
- self.eval(tokens)
+ t_batch = 0
+ s_sizes: List[int] = []
+
+ # accumulate batches and encode
+ for text in inputs:
+ tokens = self.tokenize(text.encode("utf-8"))
+ if truncate:
+ tokens = tokens[:n_ctx]
+
n_tokens = len(tokens)
total_tokens += n_tokens
- embedding = llama_cpp.llama_get_embeddings(self._ctx.ctx)[
- : llama_cpp.llama_n_embd(self._model.model)
- ]
- data.append(
- {
- "object": "embedding",
- "embedding": embedding,
- "index": index,
- }
- )
+ # check for overrun
+ if n_tokens > n_ctx:
+ raise ValueError(
+ f"Requested tokens ({n_tokens}) exceed context window of {n_ctx}"
+ )
+
+ # time to eval batch
+ if t_batch + n_tokens > self._n_ctx:
+ decode_batch(s_sizes)
+ t_batch = 0
+ s_sizes = []
+
+ # add to batch
+ self._batch.add_sequence(tokens, len(s_sizes), False)
+ t_batch += n_tokens
+ s_sizes.append(n_tokens)
+
+ # handle last batch
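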
+ decode_batch(s_sizes)
+
if self.verbose:
llama_cpp.llama_print_timings(self._ctx.ctx)
- return {
- "object": "list",
- "data": data,
- "model": model_name,
- "usage": {
- "prompt_tokens": total_tokens,
- "total_tokens": total_tokens,
- },
- }
+ output = data[0] if isinstance(input, str) else data
- def embed(self, input: str) -> List[float]:
- """Embed a string.
+ llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
+ self.reset()
- Args:
- input: The utf-8 encoded string to embed.
-
- Returns:
- A list of embeddings
- """
- return list(map(float, self.create_embedding(input)["data"][0]["embedding"]))
+ if return_count:
+ return output, total_tokens
+ else:
+ return output
def _create_completion(
self,
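With the batched implementation above, `embed` accepts either a single string or a list of strings and packs them into as few decode passes as fit in the context window. A usage sketch (the model path is a placeholder, and the model must be loaded with `embedding=True`):

```python
from llama_cpp import Llama

llm = Llama(model_path="./embedding-model.gguf", embedding=True)  # placeholder path

# Single input -> one vector (List[float]).
vec = llm.embed("Hello, world!")

# Batch input -> list of vectors, optionally with the total token count.
vecs, n_tokens = llm.embed(["first text", "second text"], return_count=True)

# OpenAI-style response object built on top of embed().
resp = llm.create_embedding(["first text", "second text"])
print(resp["usage"]["total_tokens"], len(resp["data"]))
```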
@@ -1552,6 +1638,38 @@ class Llama:
logit_bias=logit_bias,
)
+ def create_chat_completion_openai_v1(
+ self,
+ *args: Any,
+ **kwargs: Any,
+ ):
+ """Generate a chat completion with return type based on the the OpenAI v1 API.
+
+ OpenAI python package is required to use this method.
+
+ You can install it with `pip install openai`.
+
+ Args:
+ *args: Positional arguments to pass to create_chat_completion.
+ **kwargs: Keyword arguments to pass to create_chat_completion.
+
+ Returns:
+ Generated chat completion or a stream of chat completion chunks.
+ """
+ try:
+ from openai.types.chat import ChatCompletion, ChatCompletionChunk
+ stream = kwargs.get("stream", False) # type: ignore
+ assert isinstance(stream, bool)
+ if stream:
+ return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore
+ else:
+ return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore
+ except ImportError:
+ raise ImportError(
+ "To use create_chat_completion_openai_v1, you must install the openai package."
+ "You can install it with `pip install openai`."
+ )
+
def __getstate__(self):
return dict(
model_path=self.model_path,
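A short sketch of the new `create_chat_completion_openai_v1` helper, which wraps `create_chat_completion` and re-parses the result into `openai` v1 response models (requires `pip install openai`); the model path is a placeholder:

```python
from llama_cpp import Llama

llm = Llama(model_path="./model.gguf", chat_format="chatml")  # placeholder path

completion = llm.create_chat_completion_openai_v1(
    messages=[{"role": "user", "content": "Say hello."}],
)
# `completion` is an openai.types.chat.ChatCompletion instance.
print(completion.choices[0].message.content)

# With stream=True the helper yields ChatCompletionChunk objects instead.
for chunk in llm.create_chat_completion_openai_v1(
    messages=[{"role": "user", "content": "Say hello."}], stream=True
):
    print(chunk.choices[0].delta.content or "", end="")
```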
@@ -1693,8 +1811,8 @@ class Llama:
"""Return the vocabulary size."""
return self._model.n_vocab()
- def tokenizer(self) -> "LlamaTokenizer":
- """Return the tokenizer for this model."""
+ def tokenizer(self) -> LlamaTokenizer:
+ """Return the llama tokenizer for this model."""
return LlamaTokenizer(self)
def token_eos(self) -> int:
@@ -1738,21 +1856,6 @@ class Llama:
return longest_prefix
-class LlamaTokenizer:
- def __init__(self, llama: Llama):
- self.llama = llama
-
- def encode(self, text: str, add_bos: bool = True) -> List[int]:
- return self.llama.tokenize(
- text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=True
- )
-
- def decode(self, tokens: List[int]) -> str:
- return self.llama.detokenize(tokens).decode("utf-8", errors="ignore")
-
- @classmethod
- def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
- return cls(Llama(model_path=path, vocab_only=True))
class LlamaState:
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 08f991b..8dd0ddf 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -4,7 +4,9 @@ import os
import json
import ctypes
import dataclasses
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, Protocol
+import random
+import string
+from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol
import jinja2
@@ -29,6 +31,7 @@ MISTRAL_INSTRUCT_EOS_TOKEN = ""
### Chat Completion Handler ###
+
class LlamaChatCompletionHandler(Protocol):
"""Base Protocol for a llama chat completion handler.
@@ -75,8 +78,7 @@ class LlamaChatCompletionHandler(Protocol):
) -> Union[
llama_types.CreateChatCompletionResponse,
Iterator[llama_types.CreateChatCompletionStreamResponse],
- ]:
- ...
+ ]: ...
class LlamaChatCompletionHandlerNotFoundException(Exception):
@@ -132,6 +134,7 @@ def register_chat_completion_handler(name: str):
### Chat Formatter ###
+
@dataclasses.dataclass
class ChatFormatterResponse:
"""Dataclass that stores completion parameters for a given chat format and
@@ -155,8 +158,7 @@ class ChatFormatter(Protocol):
*,
messages: List[llama_types.ChatCompletionRequestMessage],
**kwargs: Any,
- ) -> ChatFormatterResponse:
- ...
+ ) -> ChatFormatterResponse: ...
class Jinja2ChatFormatter(ChatFormatter):
@@ -193,7 +195,7 @@ class Jinja2ChatFormatter(ChatFormatter):
eos_token=self.eos_token,
bos_token=self.bos_token,
raise_exception=raise_exception,
- add_generation_prompt=self.add_generation_prompt
+ add_generation_prompt=self.add_generation_prompt,
)
return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token])
@@ -253,11 +255,13 @@ def _convert_text_completion_chunks_to_chat(
"choices": [
{
"index": 0,
- "delta": {
- "content": chunk["choices"][0]["text"],
- }
- if chunk["choices"][0]["finish_reason"] is None
- else {},
+ "delta": (
+ {
+ "content": chunk["choices"][0]["text"],
+ }
+ if chunk["choices"][0]["finish_reason"] is None
+ else {}
+ ),
"finish_reason": chunk["choices"][0]["finish_reason"],
}
],
@@ -336,10 +340,12 @@ def chat_formatter_to_chat_completion_handler(
# create grammar from json schema
if "schema" in response_format:
grammar = llama_grammar.LlamaGrammar.from_json_schema(
- json.dumps(response_format["schema"])
+ json.dumps(response_format["schema"]), verbose=llama.verbose
)
except Exception as e:
- grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
+ grammar = llama_grammar.LlamaGrammar.from_string(
+ llama_grammar.JSON_GBNF, verbose=llama.verbose
+ )
completion_or_chunks = llama.create_completion(
prompt=prompt,
@@ -450,7 +456,9 @@ def hf_tokenizer_config_to_chat_completion_handler(
tokenizer_config: Dict[str, Any],
add_generation_prompt: bool = True,
) -> LlamaChatCompletionHandler:
- chat_formatter = hf_tokenizer_config_to_chat_formatter(tokenizer_config, add_generation_prompt=add_generation_prompt)
+ chat_formatter = hf_tokenizer_config_to_chat_formatter(
+ tokenizer_config, add_generation_prompt=add_generation_prompt
+ )
return chat_formatter_to_chat_completion_handler(chat_formatter)
@@ -461,11 +469,12 @@ def guess_chat_format_from_gguf_metadata(metadata: Dict[str, str]) -> Optional[s
if metadata["tokenizer.chat_template"] == CHATML_CHAT_TEMPLATE:
return "chatml"
- if metadata["tokenizer.chat_template"] == MISTRAL_INSTRUCT_CHAT_TEMPLATE:
+ if metadata["tokenizer.chat_template"] == MISTRAL_INSTRUCT_CHAT_TEMPLATE:
return "mistral-instruct"
return None
+
### Utility functions for formatting chat prompts ###
# TODO: Replace these with jinja2 templates
@@ -725,17 +734,14 @@ def format_openbuddy(
messages: List[llama_types.ChatCompletionRequestMessage],
**kwargs: Any,
) -> ChatFormatterResponse:
- _system_message = """Consider a conversation between User (a human) and Assistant (named Buddy).
-Buddy is an INTP-T, a friendly, intelligent and multilingual AI assistant, by OpenBuddy team. GitHub: https://github.com/OpenBuddy/OpenBuddy
-Buddy cannot access the Internet.
-Buddy can fluently speak the user's language (e.g. English, Chinese).
-Buddy can generate poems, stories, code, essays, songs, parodies, and more.
-Buddy possesses vast knowledge about the world, history, and culture.
-Buddy's responses are always safe, creative, high-quality, human-like, and interesting.
-Buddy strictly refuses to discuss political, NSFW, or other unsafe topics.
+ _system_message = """You are a helpful, respectful and honest INTP-T AI Assistant named Buddy. You are talking to a human User.
+Always answer as helpfully and logically as possible, while being safe. Your answers should not include any harmful, political, religious, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
+You can speak fluently in many languages, for example: English, Chinese.
+You cannot access the internet, but you have vast knowledge, cutoff: 2021-09.
+You are trained by OpenBuddy team, (https://openbuddy.ai, https://github.com/OpenBuddy/OpenBuddy), you are based on LLaMA and Falcon transformers model, not related to GPT or OpenAI.
-User: Hi.
-Assistant: Hi, I'm Buddy, your AI assistant. How can I help you today?"""
+"""
_roles = dict(user="User", assistant="Assistant")
_sep = "\n"
system_message = _system_message
@@ -914,9 +920,17 @@ def format_mistral_instruct(
stop = eos
prompt = bos
for message in messages:
- if message["role"] == "user" and message["content"] is not None and isinstance(message["content"], str):
+ if (
+ message["role"] == "user"
+ and message["content"] is not None
+ and isinstance(message["content"], str)
+ ):
prompt += "[INST] " + message["content"]
- elif message["role"] == "assistant" and message["content"] is not None and isinstance(message["content"], str):
+ elif (
+ message["role"] == "assistant"
+ and message["content"] is not None
+ and isinstance(message["content"], str)
+ ):
prompt += " [/INST]" + message["content"] + eos
prompt += " [/INST]"
return ChatFormatterResponse(prompt=prompt, stop=stop)
@@ -956,6 +970,7 @@ def format_openchat(
_prompt = _format_chatml(system_message, _messages, _sep)
return ChatFormatterResponse(prompt=_prompt, stop=_sep)
+
# Chat format for Saiga models, see more details and available models:
# https://huggingface.co/collections/IlyaGusev/saiga2-saigamistral-6505d4ccc3d1e53166b636cd
@register_chat_format("saiga")
@@ -977,8 +992,10 @@ def format_saiga(
_prompt += "bot"
return ChatFormatterResponse(prompt=_prompt.strip())
+
# Tricky chat formats that require custom chat handlers
+
@register_chat_completion_handler("functionary")
def functionary_chat_handler(
llama: llama.Llama,
@@ -1251,7 +1268,8 @@ def functionary_chat_handler(
json.dumps(function_body)
)
grammar = llama_grammar.LlamaGrammar.from_string(
- llama_grammar.json_schema_to_gbnf(json.dumps(function_body))
+ llama_grammar.json_schema_to_gbnf(json.dumps(function_body)),
+ verbose=llama.verbose,
)
print(grammar_text)
except Exception as e:
@@ -1262,11 +1280,14 @@ def functionary_chat_handler(
print(e)
with suppress_stdout_stderr(disable=llama.verbose):
grammar = llama_grammar.LlamaGrammar.from_string(
- llama_grammar.JSON_GBNF
+ llama_grammar.JSON_GBNF,
+ verbose=llama.verbose,
)
else:
with suppress_stdout_stderr(disable=llama.verbose):
- grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
+ grammar = llama_grammar.LlamaGrammar.from_string(
+ llama_grammar.JSON_GBNF, verbose=llama.verbose
+ )
completion: llama_types.Completion = llama.create_completion(
prompt=new_prompt,
@@ -1332,6 +1353,466 @@ def functionary_chat_handler(
)
+@register_chat_completion_handler("functionary-v1")
+@register_chat_completion_handler("functionary-v2")
+def functionary_v1_v2_chat_handler(
+ llama: llama.Llama,
+ messages: List[llama_types.ChatCompletionRequestMessage],
+ functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
+ function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
+ tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+ tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
+ temperature: float = 0.2,
+ top_p: float = 0.95,
+ top_k: int = 40,
+ min_p: float = 0.05,
+ typical_p: float = 1.0,
+ stream: bool = False,
+ stop: Optional[Union[str, List[str]]] = [],
+ response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
+ max_tokens: Optional[int] = None,
+ presence_penalty: float = 0.0,
+ frequency_penalty: float = 0.0,
+ repeat_penalty: float = 1.1,
+ tfs_z: float = 1.0,
+ mirostat_mode: int = 0,
+ mirostat_tau: float = 5.0,
+ mirostat_eta: float = 0.1,
+ model: Optional[str] = None,
+ logits_processor: Optional[llama.LogitsProcessorList] = None,
+ grammar: Optional[llama.LlamaGrammar] = None,
+ **kwargs, # type: ignore
+) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
+ SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
+
+ tokenizer = llama.tokenizer_
+ assert hasattr(
+ tokenizer, "hf_tokenizer"
+ ), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class"
+ from transformers import AutoTokenizer
+
+ if "<|START_OF_FUNCTION_CALL|>" in tokenizer.hf_tokenizer.additional_special_tokens:
+ version = "v1"
+ END_SYSTEM_TOKEN = "<|END_OF_SYSTEM|>"
+ END_USER_TOKEN = "<|END_OF_USER|>"
+ END_ASSISTANT_TOKEN = "<|END_OF_ASSISTANT|>"
+ END_FUNCTION_RESULT_TOKEN = "<|END_OF_FUNCTION_RESULT|>"
+ START_FUNCTION_CALL_TOKEN = "<|START_OF_FUNCTION_CALL|>"
+ END_FUNCTION_CALL_TOKEN = "<|END_OF_FUNCTION_CALL|>"
+ else:
+ version = "v2"
+ RECIPIENT_TOKEN = "<|recipient|>"
+ FROM_TOKEN = "<|from|>"
+ STOP_TOKEN = "<|stop|>"
+ CONTENT_TOKEN = "<|content|>"
+
+ def generate_type_definition(
+ param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs
+ ) -> str:
+ indent = " " * indent_level
+ if "$ref" in param:
+ # Reference to a shared definition
+ ref_name = param["$ref"].split("/")[
+ -1
+ ] # Extract the type name from the reference
+ return ref_name
+ elif param.get("type") == "array":
+ items = param.get("items", {})
+ item_type = generate_type_definition(items, indent_level + 1, shared_defs)
+ return f"Array<{item_type}>"
+ elif param.get("type") == "object":
+ properties = param.get("properties", {})
+ nested_schema = "{\n"
+ for nested_param_name, nested_param in properties.items():
+ nested_param_type = generate_type_definition(
+ nested_param, indent_level + 1, shared_defs
+ )
+ nested_schema += (
+ f"{indent} {nested_param_name}: {nested_param_type},\n"
+ )
+ nested_schema += indent + "}"
+ return nested_schema
+ elif "enum" in param:
+ # Enum type
+ return " | ".join([f'"{enum_value}"' for enum_value in param["enum"]])
+ else:
+ # Simple type
+ return param.get("type", "any")
+
+ def generate_shared_definitions(shared_defs, indent_level: int) -> str:
+ indent = " " * indent_level
+ shared_definitions = ""
+ for def_name, def_properties in shared_defs.items():
+ shared_definitions += f"{indent}type {def_name} = "
+ if def_properties.get("type") == "object":
+ shared_definitions += generate_type_definition(
+ def_properties, indent_level, shared_defs
+ )
+ elif "enum" in def_properties:
+ # Enum type
+ shared_definitions += " | ".join(
+ [f'"{enum_value}"' for enum_value in def_properties["enum"]]
+ )
+ shared_definitions += ";\n"
+ return shared_definitions
+
+ def generate_schema_from_functions(functions, namespace="functions") -> str:
+ schema = (
+ "// Supported function definitions that should be called when necessary.\n"
+ )
+ schema += f"namespace {namespace} {{\n\n"
+
+ # Generate shared definitions
+ shared_definitions = {}
+ for function in functions:
+ parameters = function.get("parameters", {})
+ shared_definitions.update(parameters.get("$defs", {}))
+
+ schema += generate_shared_definitions(shared_definitions, 1)
+
+ for function in functions:
+ function_name = function["name"]
+ description = function.get("description", "")
+ parameters = function.get("parameters", {})
+ required_params = parameters.get("required", [])
+
+ schema += f"// {description}\n"
+ schema += f"type {function_name} = (_: {{\n"
+
+ for param_name, param in parameters.get("properties", {}).items():
+ param_description = param.get("description", "")
+ param_type = generate_type_definition(param, 2, shared_definitions)
+ optional_indicator = "" if param_name in required_params else "?"
+ schema += f"// {param_description}\n"
+ schema += f"{param_name}{optional_indicator}: {param_type},\n"
+ schema += "}) => any;\n\n"
+
+ schema += "}} // namespace {}".format(namespace)
+ return schema
+
+ def prepare_messages_for_inference(
+ messages: List[llama_types.ChatCompletionRequestMessage],
+ tokenizer: AutoTokenizer,
+ version: Literal["v1", "v2"],
+ functions: Optional[List[llama_types.ChatCompletionFunctions]] = None,
+ tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+ ):
+ all_messages: List[llama_types.ChatCompletionRequestMessage] = []
+ if functions is not None:
+ all_messages.append(
+ llama_types.ChatCompletionRequestSystemMessage(
+ role="system", content=generate_schema_from_functions(functions)
+ )
+ )
+ elif tools is not None:
+ all_messages.append(
+ llama_types.ChatCompletionRequestSystemMessage(
+ role="system",
+ content=generate_schema_from_functions(
+ [
+ tool["function"]
+ for tool in tools
+ if tool["type"] == "function"
+ ]
+ ),
+ )
+ )
+
+ all_messages.append(
+ llama_types.ChatCompletionRequestSystemMessage(
+ role="system", content=SYSTEM_MESSAGE
+ )
+ )
+
+ for message in messages:
+ # Function call responses
+ if message["role"] == "function" and "name" in message:
+ message["name"] = f"functions.{message['name']}"
+ # Function call requests by assistant
+ if "function_call" in message:
+ message["function_call"][
+ "name"
+ ] = f"functions.{message['function_call']['name']}"
+ all_messages.append(message)
+
+ if version == "v1":
+ suffix = "assistant:\n"
+ else:
+ suffix = "<|from|>assistant\n<|recipient|>"
+
+ return (
+ tokenizer.hf_tokenizer.apply_chat_template(all_messages, tokenize=False)
+ + suffix
+ )
+
+ if tools is not None:
+ functions = [tool["function"] for tool in tools if tool["type"] == "function"]
+
+ if tool_choice is not None:
+ function_call = (
+ tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
+ )
+
+ prompt = prepare_messages_for_inference(
+ messages, tokenizer, version, functions, tools
+ )
+
+ # If no tools/functions are provided
+ if function_call is None and (functions is None or len(functions) == 0):
+ if version == "v1":
+ stop = END_ASSISTANT_TOKEN
+ else:
+ stop = STOP_TOKEN
+ prompt += "all\n<|content|>"
+
+ completion_or_completion_chunks = llama.create_completion(
+ prompt=prompt,
+ temperature=temperature,
+ top_p=top_p,
+ top_k=top_k,
+ min_p=min_p,
+ typical_p=typical_p,
+ stream=stream,
+ stop=stop,
+ max_tokens=max_tokens,
+ presence_penalty=presence_penalty,
+ frequency_penalty=frequency_penalty,
+ repeat_penalty=repeat_penalty,
+ tfs_z=tfs_z,
+ mirostat_mode=mirostat_mode,
+ mirostat_tau=mirostat_tau,
+ mirostat_eta=mirostat_eta,
+ model=model,
+ logits_processor=logits_processor,
+ grammar=grammar,
+ )
+ return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore
+
+ assert stream is False # TODO: support stream mode
+
+ def get_grammar(function_call):
+ function_body = None
+ for function in functions or []:
+ if function["name"] == function_call:
+ function_body = function["parameters"]
+ break
+ for tool in tools or []:
+ if tool["type"] == "function" and tool["function"]["name"] == function_call:
+ function_body = tool["function"]["parameters"]
+ break
+
+ try:
+ with suppress_stdout_stderr(disable=llama.verbose):
+ grammar_text = llama_grammar.json_schema_to_gbnf(
+ json.dumps(function_body)
+ )
+ grammar = llama_grammar.LlamaGrammar.from_string(
+ llama_grammar.json_schema_to_gbnf(json.dumps(function_body))
+ )
+ print(grammar_text)
+ except Exception as e:
+ if llama.verbose:
+ print(
+ "Failed to parse function body as JSON schema, falling back to default grammar"
+ )
+ print(e)
+ with suppress_stdout_stderr(disable=llama.verbose):
+ grammar = llama_grammar.LlamaGrammar.from_string(
+ llama_grammar.JSON_GBNF, verbose=llama.verbose
+ )
+
+ return grammar
+
+ def create_completion(stop):
+ completion: llama_types.Completion = llama.create_completion(
+ prompt=prompt,
+ temperature=temperature,
+ top_p=top_p,
+ top_k=top_k,
+ min_p=min_p,
+ typical_p=typical_p,
+ stream=stream,
+ stop=stop,
+ max_tokens=max_tokens,
+ presence_penalty=presence_penalty,
+ frequency_penalty=frequency_penalty,
+ repeat_penalty=repeat_penalty,
+ tfs_z=tfs_z,
+ mirostat_mode=mirostat_mode,
+ mirostat_tau=mirostat_tau,
+ mirostat_eta=mirostat_eta,
+ model=model,
+ logits_processor=logits_processor,
+ grammar=grammar,
+ )
+
+ return completion
+
+ function_calls, function_bodies = [], []
+
+ if version == "v1":
+ # If no or "auto" tool_choice/function_call
+ if function_call is None or (
+ isinstance(function_call, str) and function_call == "auto"
+ ):
+ stops = ["\n", END_ASSISTANT_TOKEN]
+ # If tool_choice/function_call is "none"
+ elif isinstance(function_call, str) and function_call == "none":
+ prompt = prepare_messages_for_inference(
+ messages, tokenizer, version, [], []
+ )
+ stops = END_ASSISTANT_TOKEN
+ # If tool_choice/function_call is provided
+ elif isinstance(function_call, dict):
+ prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n"
+ stops = END_FUNCTION_CALL_TOKEN
+ function_call = function_call["name"]
+ function_calls.append(function_call)
+ grammar = get_grammar(function_call)
+ else:
+ prompt = prompt
+ stops = ["\n", END_ASSISTANT_TOKEN]
+
+ completion = create_completion(stop=stops)
+ completion_text = completion["choices"][0]["text"]
+
+ # If the generation does not involve a function call
+ if (
+ START_FUNCTION_CALL_TOKEN not in prompt
+ and START_FUNCTION_CALL_TOKEN not in completion_text
+ ):
+ return _convert_completion_to_chat(completion, stream=stream) # type: ignore
+ # If the generation involves a function call in completion, generate the parameters
+ elif (
+ START_FUNCTION_CALL_TOKEN not in prompt
+ and START_FUNCTION_CALL_TOKEN in completion_text
+ ):
+ prompt += (
+ completion_text.replace(
+ f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN
+ )
+ + "\n"
+ )
+ function_calls.append(
+ completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()
+ )
+ grammar = get_grammar(function_calls[-1])
+ completion = create_completion(stop=END_FUNCTION_CALL_TOKEN)
+ function_bodies.append(completion["choices"][0]["text"].strip())
+ # If the prompt involves a function call, just append generated parameters to function_bodies
+ else:
+ function_bodies.append(completion_text.strip())
+ else:
+ # Loop until all parallel function calls are generated
+ while True:
+ # If no or "auto" tool_choice/function_call
+ if function_call is None or (
+ isinstance(function_call, str) and function_call == "auto"
+ ):
+ grammar = None
+ stops = CONTENT_TOKEN
+ # If tool_choice/function_call is "none"
+ elif isinstance(function_call, str) and function_call == "none":
+ prompt = (
+ prepare_messages_for_inference(messages, tokenizer, version, [], [])
+ + "all\n<|content|>"
+ )
+ stops = STOP_TOKEN
+ # If tool_choice/function_call is provided
+ elif isinstance(function_call, dict):
+ prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
+ stops = STOP_TOKEN
+ function_call = function_call["name"]
+ function_calls.append(function_call)
+ grammar = get_grammar(function_call)
+ else:
+ prompt = prompt
+ stops = STOP_TOKEN
+
+ completion = create_completion(stop=stops)
+ completion_text = completion["choices"][0]["text"]
+
+ # If the generation does not involve a function call
+ if prompt.endswith("all\n<|content|>") and not completion_text.startswith(
+ "all"
+ ):
+ return _convert_completion_to_chat(completion, stream=stream) # type: ignore
+ # Generate model response if the model decides not to call any function
+ elif prompt.endswith(RECIPIENT_TOKEN) and completion_text.startswith("all"):
+ prompt += completion_text + CONTENT_TOKEN
+ completion = create_completion(stop=STOP_TOKEN)
+ return _convert_completion_to_chat(completion, stream=stream) # type: ignore
+ # Generate parameters if model decides to call a function
+ elif prompt.endswith(RECIPIENT_TOKEN):
+ function_calls.append(completion_text[:-1])
+ grammar = get_grammar(function_calls[-1])
+ completion = create_completion(stop=[STOP_TOKEN, "\n"])
+ function_bodies.append(completion["choices"][0]["text"].strip())
+ prompt += f"{function_calls[-1]}\n{CONTENT_TOKEN}{function_bodies[-1]}"
+ grammar = None
+
+ # Try to generate the beginning of next turn
+ # If empty completion, break from loop
+ next_turn_completion_text = create_completion(
+ stop=[STOP_TOKEN, RECIPIENT_TOKEN]
+ )["choices"][0]["text"]
+ if len(next_turn_completion_text) > 0:
+ prompt += f"\n{FROM_TOKEN}assistant\n{RECIPIENT_TOKEN}"
+ else:
+ break
+ # Break from loop if tool_choice/function_call is provided as a dict
+ else:
+ function_bodies.append(completion_text.strip())
+ break
+
+ assert "usage" in completion
+ assert len(function_calls) > 0
+ assert len(function_calls) == len(function_bodies)
+
+ tool_calls = []
+ for function_call, function_body in zip(function_calls, function_bodies):
+ tool_calls.append(
+ {
+ "id": "call_"
+ + "".join(
+ [
+ random.choice(string.ascii_letters + string.digits)
+ for _ in range(24)
+ ]
+ ),
+ "type": "function",
+ "function": {
+ "name": function_call,
+ "arguments": function_body,
+ },
+ }
+ )
+
+ # TODO: support stream mode
+ return llama_types.CreateChatCompletionResponse(
+ id="chat" + completion["id"],
+ object="chat.completion",
+ created=completion["created"],
+ model=completion["model"],
+ choices=[
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": None,
+ "function_call": {
+ "name": tool_calls[0]["function"]["name"],
+ "arguments": tool_calls[0]["function"]["arguments"],
+ },
+ "tool_calls": tool_calls,
+ },
+ "finish_reason": "tool_calls",
+ }
+ ],
+ usage=completion["usage"],
+ )
+
+
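As the assertion at the top of this handler indicates, `functionary-v1`/`functionary-v2` require an HF tokenizer so the chat template can be applied. A hedged setup sketch; the file name and repo id are placeholders (the meetkai org is the one referenced in the assertion message), and `tools`/`tool_choice` are then passed to `create_chat_completion` as usual:

```python
from llama_cpp import Llama
from llama_cpp.llama_tokenizer import LlamaHFTokenizer

# Placeholder file name and repo id for illustration.
llm = Llama(
    model_path="./functionary-small-v2.2.q4_0.gguf",
    tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF"),
    chat_format="functionary-v2",
)
```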
class Llava15ChatHandler:
_clip_free = None
@@ -1493,7 +1974,9 @@ class Llava15ChatHandler:
json.dumps(response_format["schema"])
)
except Exception as e:
- grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
+ grammar = llama_grammar.LlamaGrammar.from_string(
+ llama_grammar.JSON_GBNF
+ )
return _convert_completion_to_chat(
llama.create_completion(
@@ -1519,3 +2002,609 @@ class Llava15ChatHandler:
),
stream=stream,
)
+
+
+@register_chat_completion_handler("chatml-function-calling")
+def chatml_function_calling(
+ llama: llama.Llama,
+ messages: List[llama_types.ChatCompletionRequestMessage],
+ functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
+ function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
+ tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+ tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
+ temperature: float = 0.2,
+ top_p: float = 0.95,
+ top_k: int = 40,
+ min_p: float = 0.05,
+ typical_p: float = 1.0,
+ stream: bool = False,
+ stop: Optional[Union[str, List[str]]] = [],
+ response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
+ max_tokens: Optional[int] = None,
+ presence_penalty: float = 0.0,
+ frequency_penalty: float = 0.0,
+ repeat_penalty: float = 1.1,
+ tfs_z: float = 1.0,
+ mirostat_mode: int = 0,
+ mirostat_tau: float = 5.0,
+ mirostat_eta: float = 0.1,
+ model: Optional[str] = None,
+ logits_processor: Optional[llama.LogitsProcessorList] = None,
+ grammar: Optional[llama.LlamaGrammar] = None,
+ **kwargs, # type: ignore
+) -> Union[
+ llama_types.CreateChatCompletionResponse,
+ Iterator[llama_types.CreateChatCompletionStreamResponse],
+]:
+ function_calling_template = (
+ "{% for message in messages %}"
+ "<|im_start|>{{ message.role }}\n"
+ # System message
+ "{% if message.role == 'system' %}"
+ "{{ message.content }}"
+ "{% if tool_calls %}"
+ "\n\nYou have access to the following functions:\n"
+ "{% for tool in tools %}"
+ "\nfunctions.{{ tool.function.name }}:\n"
+ "{{ tool.function.parameters | tojson }}"
+ "\n{% endfor %}"
+ "\n\nYou can respond to users messages with either a single message or one or more function calls."
+ "\n\nTo respond with a message begin the message with 'message:', use the following format:"
+ "\n\nmessage:"
+ "\n"
+ "\n\nTo respond with one or more function calls begin the message with 'functions.:', use the following format:"
+ "\n\nfunctions.:"
+ '\n{ "arg1": "value1", "arg2": "value2" }'
+ "\nfunctions.:"
+ '\n{ "arg1": "value1", "arg2": "value2" }'
+ "{% endif %}"
+ "<|im_end|>\n"
+ "{% endif %}"
+ # User message
+ "{% if message.role == 'user' %}"
+ "{{ message.content }}"
+ "<|im_end|>\n"
+ "{% endif %}"
+ # Assistant message
+ "{% if message.role == 'assistant' %}"
+ ## Regular message
+ "{% if message.content and message.content | length > 0 %}"
+ "{% if tool_calls %}"
+ "message:\n"
+ "{% endif %}"
+ "{{ message.content }}"
+ "<|im_end|>\n"
+ "{% endif %}"
+ ## Function calls
+ "{% if 'tool_calls' in message %}"
+ "{% for tool_call in message.tool_calls %}"
+ "functions.{{ tool_call.function.name }}:\n"
+ "{{ tool_call.function.arguments }}"
+ "{% endfor %}"
+ "<|im_end|>\n"
+ "{% endif %}"
+ "{% endif %}"
+ "{% endfor %}"
+ "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+ )
+ template_renderer = jinja2.Environment(
+ loader=jinja2.BaseLoader(),
+ autoescape=jinja2.select_autoescape(["html", "xml"]),
+ undefined=jinja2.StrictUndefined,
+ ).from_string(function_calling_template)
+
+ # Convert legacy functions to tools
+ if functions is not None:
+ tools = [
+ {
+ "type": "function",
+ "function": function,
+ }
+ for function in functions
+ ]
+
+ # Convert legacy function_call to tool_choice
+ if function_call is not None:
+ if isinstance(function_call, str) and (
+ function_call == "none" or function_call == "auto"
+ ):
+ tool_choice = function_call
+ if isinstance(function_call, dict) and "name" in function_call:
+ tool_choice = {
+ "type": "function",
+ "function": {
+ "name": function_call["name"],
+ },
+ }
+
+ stop = [stop, "<|im_end|>"] if isinstance(stop, str) else stop + ["<|im_end|>"] if stop else ["<|im_end|>"]
+
+ # Case 1: No tool choice by user
+ if (
+ tool_choice is None
+ or (isinstance(tool_choice, str) and tool_choice == "none")
+ or tools is None
+ or len(tools) == 0
+ ):
+ prompt = template_renderer.render(
+ messages=messages,
+ tools=[],
+ tool_calls=None,
+ add_generation_prompt=True,
+ )
+ if response_format is not None and response_format["type"] == "json_object":
+ try:
+ grammar = (
+ llama_grammar.LlamaGrammar.from_json_schema(
+ json.dumps(response_format["schema"])
+ )
+ if "schema" in response_format
+ else None
+ )
+ except Exception as e:
+ if llama.verbose:
+ print(
+ "Failed to parse response format as JSON schema, falling back to default grammar"
+ )
+ print(e)
+ grammar = (
+ llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
+ if grammar is None
+ else grammar
+ )
+ return _convert_completion_to_chat(
+ llama.create_completion(
+ prompt=prompt,
+ temperature=temperature,
+ top_p=top_p,
+ top_k=top_k,
+ min_p=min_p,
+ typical_p=typical_p,
+ stream=stream,
+ stop=stop,
+ max_tokens=max_tokens,
+ presence_penalty=presence_penalty,
+ frequency_penalty=frequency_penalty,
+ repeat_penalty=repeat_penalty,
+ tfs_z=tfs_z,
+ mirostat_mode=mirostat_mode,
+ mirostat_tau=mirostat_tau,
+ mirostat_eta=mirostat_eta,
+ model=model,
+ logits_processor=logits_processor,
+ grammar=grammar,
+ ),
+ stream=stream,
+ )
+
+ def _convert_completion_to_chat_function(
+ tool_name: str,
+ completion_or_chunks: Union[
+ llama_types.CreateCompletionResponse,
+ Iterator[llama_types.CreateCompletionStreamResponse],
+ ],
+ stream: bool,
+ ):
+ if not stream:
+ completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore
+ assert "usage" in completion
+ tool_id = "call_" + "_0_" + tool_name + "_" + completion["id"]
+ # TODO: Fix for legacy function calls
+ chat_completion: llama_types.CreateChatCompletionResponse = {
+ "id": "chat" + completion["id"],
+ "object": "chat.completion",
+ "created": completion["created"],
+ "model": completion["model"],
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": None,
+ "function_call": {
+ "name": tool_name,
+ "arguments": completion["choices"][0]["text"],
+ },
+ "tool_calls": [
+ {
+ "id": tool_id,
+ "type": "function",
+ "function": {
+ "name": tool_name,
+ "arguments": completion["choices"][0]["text"],
+ },
+ }
+ ],
+ },
+ "finish_reason": "tool_calls",
+ }
+ ],
+ "usage": completion["usage"],
+ }
+ return chat_completion
+ else:
+ chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore
+
+ def _stream_response_to_function_stream(
+ chunks: Iterator[llama_types.CreateCompletionStreamResponse],
+ ) -> Iterator[llama_types.CreateChatCompletionStreamResponse]:
+ # blank first message
+ first = True
+ id_ = None
+ created = None
+ model = None
+ tool_id = None
+ for chunk in chunks:
+ if first:
+ id_ = "chat" + chunk["id"]
+ created = chunk["created"]
+ model = chunk["model"]
+ tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"]
+ yield {
+ "id": id_,
+ "object": "chat.completion.chunk",
+ "created": created,
+ "model": model,
+ "choices": [
+ {
+ "index": 0,
+ "finish_reason": None,
+ "logprobs": None,
+ "delta": {
+ "role": "assistant",
+ "content": None,
+ "function_call": None,
+ "tool_calls": None,
+ },
+ }
+ ],
+ }
+ yield {
+ "id": "chat" + chunk["id"],
+ "object": "chat.completion.chunk",
+ "created": chunk["created"],
+ "model": chunk["model"],
+ "choices": [
+ {
+ "index": 0,
+ "finish_reason": None,
+ "logprobs": None,
+ "delta": {
+ "role": None,
+ "content": None,
+ "function_call": {
+ "name": tool_name,
+ "arguments": chunk["choices"][0]["text"],
+ },
+ "tool_calls": [
+ {
+ "index": 0,
+ "id": tool_id,
+ "type": "function",
+ "function": {
+ "name": tool_name,
+ "arguments": "",
+ },
+ }
+ ],
+ },
+ }
+ ],
+ }
+ first = False
+ continue
+ assert tool_id is not None
+ yield {
+ "id": "chat" + chunk["id"],
+ "object": "chat.completion.chunk",
+ "created": chunk["created"],
+ "model": chunk["model"],
+ "choices": [
+ {
+ "index": 0,
+ "finish_reason": None,
+ "logprobs": None,
+ "delta": {
+ "role": None,
+ "content": None,
+ "function_call": {
+ "name": tool_name,
+ "arguments": chunk["choices"][0]["text"],
+ },
+ "tool_calls": [
+ {
+ "index": 0,
+ "id": tool_id,
+ "type": "function",
+ "function": {
+ "name": tool_name,
+ "arguments": chunk["choices"][0][
+ "text"
+ ],
+ },
+ }
+ ],
+ },
+ }
+ ],
+ }
+
+ if id_ is not None and created is not None and model is not None:
+ yield {
+ "id": id_,
+ "object": "chat.completion.chunk",
+ "created": created,
+ "model": model,
+ "choices": [
+ {
+ "index": 0,
+ "finish_reason": "tool_calls",
+ "logprobs": None,
+ "delta": {
+ "role": None,
+ "content": None,
+ "function_call": None,
+ "tool_calls": None,
+ },
+ }
+ ],
+ }
+
+ return _stream_response_to_function_stream(chunks)
+
+ # Case 2: Tool choice by user
+ if isinstance(tool_choice, dict):
+ tool_name = tool_choice["function"]["name"]
+ tool = next(
+ (tool for tool in tools if tool["function"]["name"] == tool_name), None
+ )
+ if tool is None:
+ raise ValueError(f"Tool with name '{tool_name}' not found in tools")
+ prompt = template_renderer.render(
+ messages=messages,
+ tools=tools,
+ tool_calls=True,
+ add_generation_prompt=True,
+ )
+ prompt += f"functions.{tool_name}:\n"
+ try:
+ grammar = llama_grammar.LlamaGrammar.from_json_schema(
+ json.dumps(tool["function"]["parameters"]), verbose=llama.verbose
+ )
+ except Exception as e:
+ grammar = llama_grammar.LlamaGrammar.from_string(
+ llama_grammar.JSON_GBNF, verbose=llama.verbose
+ )
+ if llama.verbose:
+ print(
+ "Failed to parse function body as JSON schema, falling back to default grammar"
+ )
+ print(e)
+ completion_or_chunks = llama.create_completion(
+ prompt=prompt,
+ temperature=temperature,
+ top_p=top_p,
+ top_k=top_k,
+ min_p=min_p,
+ typical_p=typical_p,
+ stream=stream,
+ stop=stop,
+ max_tokens=max_tokens,
+ presence_penalty=presence_penalty,
+ frequency_penalty=frequency_penalty,
+ repeat_penalty=repeat_penalty,
+ tfs_z=tfs_z,
+ mirostat_mode=mirostat_mode,
+ mirostat_tau=mirostat_tau,
+ mirostat_eta=mirostat_eta,
+ model=model,
+ logits_processor=logits_processor,
+ grammar=grammar,
+ )
+ return _convert_completion_to_chat_function(
+ tool_name, completion_or_chunks, stream
+ )
+
+ # Case 3: Automatic tool choice
+ assert isinstance(tool_choice, str) and tool_choice == "auto"
+ function_names = " | ".join(
+ [f'''"functions.{tool['function']['name']}:"''' for tool in tools]
+ )
+ initial_gbnf_tool_grammar = (
+ """root ::= functions | "message:"\n"""
+ f"""functions ::= {function_names}\n"""
+ )
+ follow_up_gbnf_tool_grammar = (
+ """root ::= functions | "<|im_end|>"\n"""
+ f"""functions ::= {function_names}\n"""
+ )
+ prompt = template_renderer.render(
+ messages=messages,
+ tools=tools,
+ tool_calls=True,
+ add_generation_prompt=True,
+ )
+ completion_or_chunks = llama.create_completion(
+ prompt=prompt,
+ temperature=0,
+ top_p=top_p,
+ top_k=top_k,
+ min_p=min_p,
+ typical_p=typical_p,
+ stream=False,
+ stop=[":"],
+ max_tokens=None,
+ presence_penalty=presence_penalty,
+ frequency_penalty=frequency_penalty,
+ repeat_penalty=repeat_penalty,
+ tfs_z=tfs_z,
+ mirostat_mode=mirostat_mode,
+ mirostat_tau=mirostat_tau,
+ mirostat_eta=mirostat_eta,
+ model=model,
+ logits_processor=logits_processor,
+ grammar=llama_grammar.LlamaGrammar.from_string(
+ initial_gbnf_tool_grammar, verbose=llama.verbose
+ ),
+ )
+ completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore
+ text = completion["choices"][0]["text"]
+ if "message" in text:
+ return _convert_completion_to_chat(
+ llama.create_completion(
+ prompt=prompt + "message:\n",
+ temperature=temperature,
+ top_p=top_p,
+ top_k=top_k,
+ min_p=min_p,
+ typical_p=typical_p,
+ stream=stream,
+ stop=["<|im_end|>"],
+ max_tokens=None,
+ presence_penalty=presence_penalty,
+ frequency_penalty=frequency_penalty,
+ repeat_penalty=repeat_penalty,
+ tfs_z=tfs_z,
+ mirostat_mode=mirostat_mode,
+ mirostat_tau=mirostat_tau,
+ mirostat_eta=mirostat_eta,
+ model=model,
+ logits_processor=logits_processor,
+ grammar=llama_grammar.LlamaGrammar.from_string(
+ follow_up_gbnf_tool_grammar, verbose=llama.verbose
+ ),
+ ),
+ stream=stream,
+ )
+
+ # One or more function calls
+ tool_name = text[len("functions.") :]
+ tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None)
+ if not stream:
+ completions = []
+ completions_tool_name = []
+ while tool is not None:
+ prompt += f"functions.{tool_name}:\n"
+ try:
+ grammar = llama_grammar.LlamaGrammar.from_json_schema(
+ json.dumps(tool["function"]["parameters"]), verbose=llama.verbose
+ )
+ except Exception as e:
+ grammar = llama_grammar.LlamaGrammar.from_string(
+ llama_grammar.JSON_GBNF, verbose=llama.verbose
+ )
+ if llama.verbose:
+ print(
+ "Failed to parse function body as JSON schema, falling back to default grammar"
+ )
+ print(e)
+ completion_or_chunks = llama.create_completion(
+ prompt=prompt,
+ temperature=temperature,
+ top_p=top_p,
+ top_k=top_k,
+ min_p=min_p,
+ typical_p=typical_p,
+ stream=False,
+ stop=stop,
+ max_tokens=None,
+ presence_penalty=presence_penalty,
+ frequency_penalty=frequency_penalty,
+ repeat_penalty=repeat_penalty,
+ tfs_z=tfs_z,
+ mirostat_mode=mirostat_mode,
+ mirostat_tau=mirostat_tau,
+ mirostat_eta=mirostat_eta,
+ model=model,
+ logits_processor=logits_processor,
+ grammar=grammar,
+ )
+ completions.append(completion_or_chunks)
+ completions_tool_name.append(tool_name)
+ prompt += completion_or_chunks["choices"][0]["text"]
+ prompt += "\n"
+
+ response = llama.create_completion(
+ prompt=prompt,
+ temperature=temperature,
+ top_p=top_p,
+ top_k=top_k,
+ min_p=min_p,
+ typical_p=typical_p,
+ stream=False,
+ stop=stop,
+ max_tokens=None,
+ presence_penalty=presence_penalty,
+ frequency_penalty=frequency_penalty,
+ repeat_penalty=repeat_penalty,
+ tfs_z=tfs_z,
+ mirostat_mode=mirostat_mode,
+ mirostat_tau=mirostat_tau,
+ mirostat_eta=mirostat_eta,
+ model=model,
+ logits_processor=logits_processor,
+ grammar=llama_grammar.LlamaGrammar.from_string(
+ follow_up_gbnf_tool_grammar, verbose=llama.verbose
+ ),
+ )
+
+ tool_name = response["choices"][0]["text"][len("functions.") :]
+ tool = next(
+ (tool for tool in tools if tool["function"]["name"] == tool_name), None
+ )
+
+ # Merge completions
+ function_call = {
+ "function_call": {
+ "name": tool_name,
+ "arguments": completions[0]["choices"][0]["text"],
+ }
+ } if len(completions) == 1 else {}
+ return {
+ "id": "chat" + completion["id"],
+ "object": "chat.completion",
+ "created": completion["created"],
+ "model": completion["model"],
+ "choices": [
+ {
+ "finish_reason": "tool_calls",
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": None,
+ "tool_calls": [
+ {
+ "id": "call_"
+ + f"_{i}_"
+ + tool_name
+ + "_"
+ + completion["id"],
+ "type": "function",
+ "function": {
+ "name": tool_name,
+ "arguments": completion["choices"][0]["text"],
+ },
+ }
+ for i, (tool_name, completion) in enumerate(
+ zip(completions_tool_name, completions)
+ )
+ ],
+ **function_call
+ },
+ }
+ ],
+ "usage": {
+ "completion_tokens": sum(
+ completion["usage"]["completion_tokens"]
+ for completion in completions
+ ),
+ "prompt_tokens": sum(
+ completion["usage"]["prompt_tokens"] for completion in completions
+ ),
+ "total_tokens": sum(
+ completion["usage"]["total_tokens"] for completion in completions
+ ),
+ },
+ }
+
+ raise ValueError("Automatic streaming tool choice is not supported")
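A hedged end-to-end sketch of the new `chatml-function-calling` handler; the model path and the tool schema are illustrative, and streaming with automatic tool choice is rejected as shown above:

```python
from llama_cpp import Llama

llm = Llama(model_path="./chatml-model.gguf", chat_format="chatml-function-calling")  # placeholder path

response = llm.create_chat_completion(
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ],
    tool_choice="auto",
)
print(response["choices"][0]["message"].get("tool_calls"))
```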
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index da2a7f3..9979a67 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -139,9 +139,11 @@ llama_seq_id = c_int32
# enum llama_vocab_type {
# LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
# LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+# LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
# };
LLAMA_VOCAB_TYPE_SPM = 0
LLAMA_VOCAB_TYPE_BPE = 1
+LLAMA_VOCAB_TYPE_WPM = 2
# enum llama_token_type {
@@ -468,6 +470,7 @@ class llama_model_params(Structure):
# bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
# bool embedding; // embedding mode only
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+# bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
# };
class llama_context_params(Structure):
"""Parameters for llama_context
@@ -494,6 +497,7 @@ class llama_context_params(Structure):
logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
embedding (bool): embedding mode only
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
+ do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
"""
_fields_ = [
@@ -518,6 +522,7 @@ class llama_context_params(Structure):
("logits_all", c_bool),
("embedding", c_bool),
("offload_kqv", c_bool),
+ ("do_pooling", c_bool),
]
@@ -1697,6 +1702,21 @@ _lib.llama_get_embeddings.argtypes = [llama_context_p]
_lib.llama_get_embeddings.restype = c_float_p
+# // Get the embeddings for the ith sequence
+# // llama_get_embeddings(ctx) + i*n_embd
+# LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
+def llama_get_embeddings_ith(
+ ctx: llama_context_p, i: Union[c_int32, int]
+): # type: (...) -> Array[float] # type: ignore
+ """Get the embeddings for the ith sequence
+ llama_get_embeddings(ctx) + i*n_embd"""
+ return _lib.llama_get_embeddings_ith(ctx, i)
+
+
+_lib.llama_get_embeddings_ith.argtypes = [llama_context_p, c_int32]
+_lib.llama_get_embeddings_ith.restype = c_float_p
+
+
# //
# // Vocab
# //
diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py
index d8ef563..3eb3b96 100644
--- a/llama_cpp/llama_grammar.py
+++ b/llama_cpp/llama_grammar.py
@@ -1471,12 +1471,15 @@ class SchemaConverter:
if schema_type == "object" and "properties" in schema:
# TODO: `required` keyword
- prop_order = self._prop_order
- prop_pairs = sorted(
- schema["properties"].items(),
- # sort by position in prop_order (if specified) then by key
- key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
- )
+ if self._prop_order:
+ prop_order = self._prop_order
+ prop_pairs = sorted(
+ schema["properties"].items(),
+ # sort by position in prop_order (if specified) then by key
+ key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
+ )
+ else:
+ prop_pairs = schema["properties"].items()
rule = '"{" space'
for i, (prop_name, prop_schema) in enumerate(prop_pairs):
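The effect of this change is that, when no explicit property order is configured, object properties keep their order from the schema instead of being re-sorted alphabetically. A small sketch using the public `from_json_schema` entry point:

```python
import json
from llama_cpp.llama_grammar import LlamaGrammar

schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "number"},
    },
}
# "name" and "age" now appear in schema order in the generated GBNF,
# since no prop_order was supplied.
grammar = LlamaGrammar.from_json_schema(json.dumps(schema))
```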
diff --git a/llama_cpp/llama_tokenizer.py b/llama_cpp/llama_tokenizer.py
new file mode 100644
index 0000000..c2aad47
--- /dev/null
+++ b/llama_cpp/llama_tokenizer.py
@@ -0,0 +1,95 @@
+from __future__ import annotations
+
+import abc
+from typing import (
+ List,
+ Optional,
+ Any,
+)
+
+import llama_cpp
+from llama_cpp.llama_types import List
+
+
+class BaseLlamaTokenizer(abc.ABC):
+ @abc.abstractmethod
+ def tokenize(
+ self, text: bytes, add_bos: bool = True, special: bool = True
+ ) -> List[int]:
+ raise NotImplementedError
+
+ @abc.abstractmethod
+ def detokenize(
+ self, tokens: List[int], prev_tokens: Optional[List[int]] = None
+ ) -> bytes:
+ raise NotImplementedError
+
+
+class LlamaTokenizer(BaseLlamaTokenizer):
+ def __init__(self, llama: llama_cpp.Llama):
+ self._model = llama._model # type: ignore
+
+ def tokenize(
+ self, text: bytes, add_bos: bool = True, special: bool = True
+ ) -> List[int]:
+ return self._model.tokenize(text, add_bos=add_bos, special=special)
+
+ def detokenize(
+ self, tokens: List[int], prev_tokens: Optional[List[int]] = None
+ ) -> bytes:
+ if prev_tokens is not None:
+ return self._model.detokenize(tokens[len(prev_tokens) :])
+ else:
+ return self._model.detokenize(tokens)
+
+ def encode(
+ self, text: str, add_bos: bool = True, special: bool = True
+ ) -> List[int]:
+ return self.tokenize(
+ text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special
+ )
+
+ def decode(self, tokens: List[int]) -> str:
+ return self.detokenize(tokens).decode("utf-8", errors="ignore")
+
+ @classmethod
+ def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
+ return cls(llama_cpp.Llama(model_path=path, vocab_only=True))
+
+
+class LlamaHFTokenizer(BaseLlamaTokenizer):
+ def __init__(self, hf_tokenizer: Any):
+ self.hf_tokenizer = hf_tokenizer
+
+ def tokenize(
+ self, text: bytes, add_bos: bool = True, special: bool = True
+ ) -> List[int]:
+ return self.hf_tokenizer.encode(
+ text.decode("utf-8", errors="ignore"), add_special_tokens=special
+ )
+
+ def detokenize(
+ self, tokens: List[int], prev_tokens: Optional[List[int]] = None
+ ) -> bytes:
+ if prev_tokens is not None:
+ text = self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore")
+ prev_text = self.hf_tokenizer.decode(prev_tokens).encode(
+ "utf-8", errors="ignore"
+ )
+ return text[len(prev_text) :]
+ else:
+ return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore")
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer":
+ try:
+ from transformers import AutoTokenizer
+ except ImportError:
+ raise ImportError(
+ "The `transformers` library is required to use the `HFTokenizer`."
+ "You can install it with `pip install transformers`."
+ )
+ hf_tokenizer = AutoTokenizer.from_pretrained(
+ pretrained_model_name_or_path=pretrained_model_name_or_path
+ )
+ return cls(hf_tokenizer)
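Because `BaseLlamaTokenizer` is a small ABC, tokenizers other than the two built-in classes can also be plugged into `Llama(tokenizer=...)`. A deliberately toy subclass, purely to illustrate the interface (not a usable tokenizer):

```python
from typing import List, Optional

from llama_cpp.llama_tokenizer import BaseLlamaTokenizer


class WhitespaceToyTokenizer(BaseLlamaTokenizer):
    """Toy example: maps whitespace-separated chunks to fake ids. Illustration only."""

    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        # Fake ids: the byte length of each chunk.
        return [len(chunk) for chunk in text.split()]

    def detokenize(
        self, tokens: List[int], prev_tokens: Optional[List[int]] = None
    ) -> bytes:
        # Lossy by construction; a real tokenizer must invert tokenize().
        start = len(prev_tokens) if prev_tokens is not None else 0
        return b" ".join(b"x" * t for t in tokens[start:])
```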
diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py
index c3deba8..1b1befe 100644
--- a/llama_cpp/llama_types.py
+++ b/llama_cpp/llama_types.py
@@ -97,7 +97,7 @@ class CreateChatCompletionResponse(TypedDict):
class ChatCompletionMessageToolCallChunkFunction(TypedDict):
- name: str
+ name: Optional[str]
arguments: str
@@ -118,12 +118,12 @@ class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict):
class ChatCompletionStreamResponseDelta(TypedDict):
- content: NotRequired[str]
+ content: NotRequired[Optional[str]]
function_call: NotRequired[
- ChatCompletionStreamResponseDeltaFunctionCall
+ Optional[ChatCompletionStreamResponseDeltaFunctionCall]
] # DEPRECATED
- tool_calls: NotRequired[List[ChatCompletionMessageToolCallChunk]]
- role: NotRequired[Literal["system", "user", "assistant", "tool"]]
+ tool_calls: NotRequired[Optional[List[ChatCompletionMessageToolCallChunk]]]
+ role: NotRequired[Optional[Literal["system", "user", "assistant", "tool"]]]
class ChatCompletionStreamResponseChoice(TypedDict):
@@ -132,6 +132,7 @@ class ChatCompletionStreamResponseChoice(TypedDict):
ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty
]
finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]]
+ logprobs: NotRequired[Optional[CompletionLogprobs]]
class CreateChatCompletionStreamResponse(TypedDict):
diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py
index b1f90b9..8195bd4 100644
--- a/llama_cpp/llava_cpp.py
+++ b/llama_cpp/llava_cpp.py
@@ -146,30 +146,8 @@ _libllava.llava_eval_image_embed.restype = c_bool
################################################
-# struct clip_vision_hparams {
-# int32_t image_size;
-# int32_t patch_size;
-# int32_t hidden_size;
-# int32_t n_intermediate;
-# int32_t projection_dim;
-# int32_t n_head;
-# int32_t n_layer;
-# float eps;
-# };
-class clip_vision_hparams(Structure):
- _fields_ = [
- ("image_size", c_int32),
- ("patch_size", c_int32),
- ("hidden_size", c_int32),
- ("n_intermediate", c_int32),
- ("projection_dim", c_int32),
- ("n_head", c_int32),
- ("n_layer", c_int32),
- ("eps", c_float),
- ]
-
# /** load mmproj model */
-# CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
+# CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
def clip_model_load(fname: bytes, verbosity: Union[c_int, int]) -> clip_ctx_p:
return _libllava.clip_model_load(fname, verbosity)
@@ -183,50 +161,3 @@ def clip_free(ctx: clip_ctx_p):
_libllava.clip_free.argtypes = [clip_ctx_p]
_libllava.clip_free.restype = None
-
-# size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-# int clip_n_patches(const struct clip_ctx * ctx);
-# int clip_n_mmproj_embd(const struct clip_ctx * ctx);
-
-# // RGB uint8 image
-# struct clip_image_u8 {
-# int nx;
-# int ny;
-# uint8_t * data = NULL;
-# size_t size;
-# };
-
-# // RGB float32 image (NHWC)
-# // Memory layout: RGBRGBRGB...
-# struct clip_image_f32 {
-# int nx;
-# int ny;
-# float * data = NULL;
-# size_t size;
-# };
-
-# struct clip_image_u8_batch {
-# struct clip_image_u8 * data;
-# size_t size;
-# };
-
-# struct clip_image_f32_batch {
-# struct clip_image_f32 * data;
-# size_t size;
-# };
-
-# struct clip_image_u8 * make_clip_image_u8();
-# struct clip_image_f32 * make_clip_image_f32();
-# CLIP_API void clip_image_u8_free(clip_image_u8 * img);
-# CLIP_API void clip_image_f32_free(clip_image_f32 * img);
-# CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
-# /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
-# CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
-
-# bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
-# bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
-
-# bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
-# float * vec);
-
-# bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
\ No newline at end of file
diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index 925ab99..5308dc2 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -6,6 +6,7 @@ from typing import Dict, Optional, Union, List
import llama_cpp
import llama_cpp.llama_speculative as llama_speculative
+import llama_cpp.llama_tokenizer as llama_tokenizer
from llama_cpp.server.settings import ModelSettings
@@ -93,6 +94,10 @@ class LlamaProxy:
)
)
+ tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None
+ if settings.hf_pretrained_model_name_or_path is not None:
+ tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained(settings.hf_pretrained_model_name_or_path)
+
draft_model = None
if settings.draft_model is not None:
draft_model = llama_speculative.LlamaPromptLookupDecoding(
@@ -156,6 +161,8 @@ class LlamaProxy:
chat_handler=chat_handler,
# Speculative Decoding
draft_model=draft_model,
+ # Tokenizer
+ tokenizer=tokenizer,
# Misc
verbose=settings.verbose,
)
diff --git a/tests/test_grammar.py b/tests/test_llama_grammar.py
similarity index 100%
rename from tests/test_grammar.py
rename to tests/test_llama_grammar.py
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 098f6d7..8084d55 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 098f6d737b65134cf220d12b9b706e8cfc5e4610
+Subproject commit 8084d554406b767d36b3250b3b787462d5dd626f