This commit is contained in:
commit
21ac214a38
22 changed files with 2414 additions and 188 deletions
4
.github/workflows/build-and-release.yaml
vendored
4
.github/workflows/build-and-release.yaml
vendored
|
@ -16,7 +16,7 @@ jobs:
|
|||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: "true"
|
||||
submodules: "recursive"
|
||||
|
||||
# Used to host cibuildwheel
|
||||
- uses: actions/setup-python@v3
|
||||
|
@ -48,7 +48,7 @@ jobs:
|
|||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: "true"
|
||||
submodules: "recursive"
|
||||
- uses: actions/setup-python@v3
|
||||
with:
|
||||
python-version: "3.8"
|
||||
|
|
2
.github/workflows/build-docker.yaml
vendored
2
.github/workflows/build-docker.yaml
vendored
|
@ -14,7 +14,7 @@ jobs:
|
|||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: "true"
|
||||
submodules: "recursive"
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
|
|
2
.github/workflows/publish-to-test.yaml
vendored
2
.github/workflows/publish-to-test.yaml
vendored
|
@ -18,7 +18,7 @@ jobs:
|
|||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: "true"
|
||||
submodules: "recursive"
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
|
|
2
.github/workflows/publish.yaml
vendored
2
.github/workflows/publish.yaml
vendored
|
@ -12,7 +12,7 @@ jobs:
|
|||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: "true"
|
||||
submodules: "recursive"
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
|
|
10
.github/workflows/test.yaml
vendored
10
.github/workflows/test.yaml
vendored
|
@ -19,7 +19,7 @@ jobs:
|
|||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: "true"
|
||||
submodules: "recursive"
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
|
@ -42,7 +42,7 @@ jobs:
|
|||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: "true"
|
||||
submodules: "recursive"
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
|
@ -65,7 +65,7 @@ jobs:
|
|||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: "true"
|
||||
submodules: "recursive"
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
|
@ -85,7 +85,7 @@ jobs:
|
|||
# steps:
|
||||
# - uses: actions/checkout@v3
|
||||
# with:
|
||||
# submodules: "true"
|
||||
# submodules: "recursive"
|
||||
# - name: Set up Python 3.8
|
||||
# uses: actions/setup-python@v4
|
||||
# with:
|
||||
|
@ -112,7 +112,7 @@ jobs:
|
|||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: "true"
|
||||
submodules: "recursive"
|
||||
- name: Set up Python 3.8
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
|
|
33
CHANGELOG.md
33
CHANGELOG.md
|
@ -7,11 +7,42 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
## [0.2.43]
|
||||
|
||||
- feat: Update llama.cpp to ggerganov/llama.cpp@8084d554406b767d36b3250b3b787462d5dd626f
|
||||
- feat: Support batch embeddings by @iamlemec in #1186
|
||||
- fix: submodule kompute is not included in sdist by @abetlen in 7dbbfdecadebe7750be650d9409959640ff9a460
|
||||
- fix: Update openbuddy prompt format by @abetlen in 07a783779a62a4aac0b11161c7e0eb983ff215f8
|
||||
|
||||
## [0.2.42]
|
||||
|
||||
- feat: Update llama.cpp to ggerganov/llama.cpp@ea9c8e11436ad50719987fa23a289c74b7b40d40
|
||||
- fix: sample idx off-by-one error for logit_processors by @lapp0 in #1179
|
||||
- fix: chat formatting bugs in `chatml-function-calling` by @abetlen in 4b0e3320bd8c2c209e29978d0b21e2e471cc9ee3 and 68fb71b6a26a1e57331868f959b47ab4b87851e1
|
||||
|
||||
## [0.2.41]
|
||||
|
||||
- feat: Update llama.cpp to ggerganov/llama.cpp@895407f31b358e3d9335e847d13f033491ec8a5b
|
||||
- fix: Don't change order of json schema object properties in generated grammar unless prop_order is passed by @abetlen in d1822fed6b706f38bd1ff0de4dec5baaa3cf84fa
|
||||
|
||||
## [0.2.40]
|
||||
|
||||
- feat: Update llama.cpp to ggerganov/llama.cpp@3bdc4cd0f595a6096cca4a64aa75ffa8a3503465
|
||||
- feat: Generic chatml Function Calling using `chat_format="chatml-function-calling"` by @abetlen in #957
|
||||
- fix: Circular dependency preventing early Llama object free by @notwa in #1176
|
||||
- docs: Set the correct command for compiling with SYCL support by @akarshanbiswas in #1172
|
||||
- feat: use gpu backend for clip if available by @iamlemec in #1175
|
||||
|
||||
## [0.2.39]
|
||||
|
||||
- feat: Update llama.cpp to ggerganov/llama.cpp@b08f22c882a1443e6b97081f3ce718a4d1a741f8
|
||||
- fix: Fix destructor logging bugs by using llama_log_callback to avoid suppress_stdout_stderr by @abetlen in 59760c85eddc72dfcc1839f43760ef72c23d6874
|
||||
|
||||
## [0.2.38]
|
||||
|
||||
- feat: Update llama.cpp to ggerganov/llama.cpp@1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915
|
||||
- feat: Add speculative decoding by @abetlen in #1120
|
||||
- fix: Pass raise_exception and add_generation_prompt to jinja2 chat template 078cca0361bf5a94d2cf52ed04980d20e32d6f95
|
||||
- fix: Pass raise_exception and add_generation_prompt to jinja2 chat template by @abetlen in 078cca0361bf5a94d2cf52ed04980d20e32d6f95
|
||||
|
||||
## [0.2.37]
|
||||
|
||||
|
|
|
@ -46,6 +46,14 @@ if (LLAMA_BUILD)
|
|||
)
|
||||
|
||||
if (LLAVA_BUILD)
|
||||
if (LLAMA_CUBLAS)
|
||||
add_compile_definitions(GGML_USE_CUBLAS)
|
||||
endif()
|
||||
|
||||
if (LLAMA_METAL)
|
||||
add_compile_definitions(GGML_USE_METAL)
|
||||
endif()
|
||||
|
||||
# Building llava
|
||||
add_subdirectory(vendor/llama.cpp/examples/llava)
|
||||
set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
|
||||
|
|
4
Makefile
4
Makefile
|
@ -19,10 +19,10 @@ build.opencl:
|
|||
CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .
|
||||
|
||||
build.openblas:
|
||||
CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .
|
||||
CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" python3 -m pip install --verbose -e .
|
||||
|
||||
build.blis:
|
||||
CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" python3 -m pip install --verbose -e .
|
||||
CMAKE_ARGS="-DLLAMA_BLAS=on -DLLAMA_BLAS_VENDOR=FLAME" python3 -m pip install --verbose -e .
|
||||
|
||||
build.metal:
|
||||
CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install --verbose -e .
|
||||
|
|
10
README.md
10
README.md
|
@ -118,7 +118,8 @@ CMAKE_ARGS="-DLLAMA_KOMPUTE=on" pip install llama-cpp-python
|
|||
To install with SYCL support, set the `LLAMA_SYCL=on` environment variable before installing:
|
||||
|
||||
```bash
|
||||
CMAKE_ARGS="-DLLAMA_SYCL=on" pip install llama-cpp-python
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pip install llama-cpp-python
|
||||
```
|
||||
|
||||
### Windows Notes
|
||||
|
@ -291,14 +292,15 @@ To constrain the response to a specific JSON Schema, you can use the `schema` pr
|
|||
|
||||
### Function Calling
|
||||
|
||||
The high-level API also provides a simple interface for function calling.
|
||||
The high-level API also provides a simple interface for function calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format.
|
||||
|
||||
Note that the only model that supports full function calling at this time is "functionary".
|
||||
The gguf-converted files for this model can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF)
|
||||
The gguf-converted files for functionary can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF)
|
||||
|
||||
```python
|
||||
>>> from llama_cpp import Llama
|
||||
>>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", chat_format="functionary")
|
||||
>>> # or
|
||||
>>> llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling")
|
||||
>>> llm.create_chat_completion(
|
||||
messages = [
|
||||
{
|
||||
|
|
910
examples/notebooks/OpenHermesFunctionCalling.ipynb
Normal file
910
examples/notebooks/OpenHermesFunctionCalling.ipynb
Normal file
|
@ -0,0 +1,910 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{\n",
|
||||
" \"name\": \"get_article_details\",\n",
|
||||
" \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"title\": {\n",
|
||||
" \"type\": \"str\"\n",
|
||||
" },\n",
|
||||
" \"authors\": {\n",
|
||||
" \"type\": \"list[str]\"\n",
|
||||
" },\n",
|
||||
" \"short_summary\": {\n",
|
||||
" \"type\": \"str\"\n",
|
||||
" },\n",
|
||||
" \"date_published\": {\n",
|
||||
" \"type\": \"str\"\n",
|
||||
" },\n",
|
||||
" \"tags\": {\n",
|
||||
" \"type\": \"list[str]\"\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"returns\": \"Article\"\n",
|
||||
"}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import inspect\n",
|
||||
"from typing import get_type_hints\n",
|
||||
"\n",
|
||||
"class Article:\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
"class Weather:\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
"class Directions:\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
"def calculate_mortgage_payment(loan_amount: int, interest_rate: float, loan_term: int) -> float:\n",
|
||||
" \"\"\"Get the monthly mortgage payment given an interest rate percentage.\"\"\"\n",
|
||||
" \n",
|
||||
" # TODO: you must implement this to actually call it later\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
"def get_article_details(title: str, authors: list[str], short_summary: str, date_published: str, tags: list[str]) -> Article:\n",
|
||||
" '''Get article details from unstructured article text.\n",
|
||||
"date_published: formatted as \"MM/DD/YYYY\"'''\n",
|
||||
" \n",
|
||||
" # TODO: you must implement this to actually call it later\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
"def get_weather(zip_code: str) -> Weather:\n",
|
||||
" \"\"\"Get the current weather given a zip code.\"\"\"\n",
|
||||
" \n",
|
||||
" # TODO: you must implement this to actually call it later\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
"def get_directions(start: str, destination: str) -> Directions:\n",
|
||||
" \"\"\"Get directions from Google Directions API.\n",
|
||||
"start: start address as a string including zipcode (if any)\n",
|
||||
"destination: end address as a string including zipcode (if any)\"\"\"\n",
|
||||
" \n",
|
||||
" # TODO: you must implement this to actually call it later\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
"def get_type_name(t):\n",
|
||||
" name = str(t)\n",
|
||||
" if \"list\" in name or \"dict\" in name:\n",
|
||||
" return name\n",
|
||||
" else:\n",
|
||||
" return t.__name__\n",
|
||||
"\n",
|
||||
"def serialize_function_to_json(func):\n",
|
||||
" signature = inspect.signature(func)\n",
|
||||
" type_hints = get_type_hints(func)\n",
|
||||
"\n",
|
||||
" function_info = {\n",
|
||||
" \"name\": func.__name__,\n",
|
||||
" \"description\": func.__doc__,\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {}\n",
|
||||
" },\n",
|
||||
" \"returns\": type_hints.get('return', 'void').__name__\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" for name, _ in signature.parameters.items():\n",
|
||||
" param_type = get_type_name(type_hints.get(name, type(None)))\n",
|
||||
" function_info[\"parameters\"][\"properties\"][name] = {\"type\": param_type}\n",
|
||||
"\n",
|
||||
" return json.dumps(function_info, indent=2)\n",
|
||||
"\n",
|
||||
"print(serialize_function_to_json(get_article_details))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import xml.etree.ElementTree as ET\n",
|
||||
"import re\n",
|
||||
"\n",
|
||||
"def extract_function_calls(completion):\n",
|
||||
" completion = completion.strip()\n",
|
||||
" pattern = r\"(<multiplefunctions>(.*?)</multiplefunctions>)\"\n",
|
||||
" match = re.search(pattern, completion, re.DOTALL)\n",
|
||||
" if not match:\n",
|
||||
" return None\n",
|
||||
" \n",
|
||||
" multiplefn = match.group(1)\n",
|
||||
" root = ET.fromstring(multiplefn)\n",
|
||||
" functions = root.findall(\"functioncall\")\n",
|
||||
" return [json.loads(fn.text) for fn in functions]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def generate_hermes_prompt(prompt, functions):\n",
|
||||
" functions = \"\\n\\n\".join([serialize_function_to_json(fn) for fn in functions])\n",
|
||||
" prompt = f\"\"\"<|im_start|>system\n",
|
||||
"You are a helpful assistant with access to the following functions:\n",
|
||||
"\n",
|
||||
"{functions}\n",
|
||||
"\n",
|
||||
"To use these functions respond with:\n",
|
||||
"<multiplefunctions>\n",
|
||||
" <functioncall> {{\"name\": \"function_name\", \"arguments\": {{\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}}}} </functioncall>\n",
|
||||
" <functioncall> {{\"name\": \"function_name\", \"arguments\": {{\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}}}} </functioncall>\n",
|
||||
" ...\n",
|
||||
"</multiplefunctions>\n",
|
||||
"\n",
|
||||
"Edge cases you must handle:\n",
|
||||
"- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
|
||||
"<|im_start|>user\n",
|
||||
"{prompt}<|im_end|>\n",
|
||||
"<|im_start|>assistant\"\"\"\n",
|
||||
" return prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<|im_start|>system\n",
|
||||
"You are a helpful assistant with access to the following functions:\n",
|
||||
"\n",
|
||||
"{\n",
|
||||
" \"name\": \"get_weather\",\n",
|
||||
" \"description\": \"Get the current weather given a zip code.\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"zip_code\": {\n",
|
||||
" \"type\": \"str\"\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"returns\": \"Weather\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"{\n",
|
||||
" \"name\": \"calculate_mortgage_payment\",\n",
|
||||
" \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"loan_amount\": {\n",
|
||||
" \"type\": \"int\"\n",
|
||||
" },\n",
|
||||
" \"interest_rate\": {\n",
|
||||
" \"type\": \"float\"\n",
|
||||
" },\n",
|
||||
" \"loan_term\": {\n",
|
||||
" \"type\": \"int\"\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"returns\": \"float\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"{\n",
|
||||
" \"name\": \"get_article_details\",\n",
|
||||
" \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"title\": {\n",
|
||||
" \"type\": \"str\"\n",
|
||||
" },\n",
|
||||
" \"authors\": {\n",
|
||||
" \"type\": \"list[str]\"\n",
|
||||
" },\n",
|
||||
" \"short_summary\": {\n",
|
||||
" \"type\": \"str\"\n",
|
||||
" },\n",
|
||||
" \"date_published\": {\n",
|
||||
" \"type\": \"str\"\n",
|
||||
" },\n",
|
||||
" \"tags\": {\n",
|
||||
" \"type\": \"list[str]\"\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"returns\": \"Article\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"To use these functions respond with:\n",
|
||||
"<multiplefunctions>\n",
|
||||
" <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
|
||||
" <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
|
||||
" ...\n",
|
||||
"</multiplefunctions>\n",
|
||||
"\n",
|
||||
"Edge cases you must handle:\n",
|
||||
"- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
|
||||
"<|im_start|>user\n",
|
||||
"What's the weather in 10001?<|im_end|>\n",
|
||||
"<|im_start|>assistant\n",
|
||||
"<|im_start|>system\n",
|
||||
"You are a helpful assistant with access to the following functions:\n",
|
||||
"\n",
|
||||
"{\n",
|
||||
" \"name\": \"get_weather\",\n",
|
||||
" \"description\": \"Get the current weather given a zip code.\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"zip_code\": {\n",
|
||||
" \"type\": \"str\"\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"returns\": \"Weather\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"{\n",
|
||||
" \"name\": \"calculate_mortgage_payment\",\n",
|
||||
" \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"loan_amount\": {\n",
|
||||
" \"type\": \"int\"\n",
|
||||
" },\n",
|
||||
" \"interest_rate\": {\n",
|
||||
" \"type\": \"float\"\n",
|
||||
" },\n",
|
||||
" \"loan_term\": {\n",
|
||||
" \"type\": \"int\"\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"returns\": \"float\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"{\n",
|
||||
" \"name\": \"get_article_details\",\n",
|
||||
" \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"title\": {\n",
|
||||
" \"type\": \"str\"\n",
|
||||
" },\n",
|
||||
" \"authors\": {\n",
|
||||
" \"type\": \"list[str]\"\n",
|
||||
" },\n",
|
||||
" \"short_summary\": {\n",
|
||||
" \"type\": \"str\"\n",
|
||||
" },\n",
|
||||
" \"date_published\": {\n",
|
||||
" \"type\": \"str\"\n",
|
||||
" },\n",
|
||||
" \"tags\": {\n",
|
||||
" \"type\": \"list[str]\"\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"returns\": \"Article\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"To use these functions respond with:\n",
|
||||
"<multiplefunctions>\n",
|
||||
" <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
|
||||
" <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
|
||||
" ...\n",
|
||||
"</multiplefunctions>\n",
|
||||
"\n",
|
||||
"Edge cases you must handle:\n",
|
||||
"- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
|
||||
"<|im_start|>user\n",
|
||||
"Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.<|im_end|>\n",
|
||||
"<|im_start|>assistant\n",
|
||||
"<|im_start|>system\n",
|
||||
"You are a helpful assistant with access to the following functions:\n",
|
||||
"\n",
|
||||
"{\n",
|
||||
" \"name\": \"get_weather\",\n",
|
||||
" \"description\": \"Get the current weather given a zip code.\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"zip_code\": {\n",
|
||||
" \"type\": \"str\"\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"returns\": \"Weather\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"{\n",
|
||||
" \"name\": \"calculate_mortgage_payment\",\n",
|
||||
" \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"loan_amount\": {\n",
|
||||
" \"type\": \"int\"\n",
|
||||
" },\n",
|
||||
" \"interest_rate\": {\n",
|
||||
" \"type\": \"float\"\n",
|
||||
" },\n",
|
||||
" \"loan_term\": {\n",
|
||||
" \"type\": \"int\"\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"returns\": \"float\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"{\n",
|
||||
" \"name\": \"get_article_details\",\n",
|
||||
" \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"title\": {\n",
|
||||
" \"type\": \"str\"\n",
|
||||
" },\n",
|
||||
" \"authors\": {\n",
|
||||
" \"type\": \"list[str]\"\n",
|
||||
" },\n",
|
||||
" \"short_summary\": {\n",
|
||||
" \"type\": \"str\"\n",
|
||||
" },\n",
|
||||
" \"date_published\": {\n",
|
||||
" \"type\": \"str\"\n",
|
||||
" },\n",
|
||||
" \"tags\": {\n",
|
||||
" \"type\": \"list[str]\"\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"returns\": \"Article\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"To use these functions respond with:\n",
|
||||
"<multiplefunctions>\n",
|
||||
" <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
|
||||
" <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
|
||||
" ...\n",
|
||||
"</multiplefunctions>\n",
|
||||
"\n",
|
||||
"Edge cases you must handle:\n",
|
||||
"- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
|
||||
"<|im_start|>user\n",
|
||||
"What's the current exchange rate for USD to EUR?<|im_end|>\n",
|
||||
"<|im_start|>assistant\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"prompts = [\n",
|
||||
" \"What's the weather in 10001?\",\n",
|
||||
" \"Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.\",\n",
|
||||
" \"What's the current exchange rate for USD to EUR?\"\n",
|
||||
"]\n",
|
||||
"functions = [get_weather, calculate_mortgage_payment, get_article_details]\n",
|
||||
"\n",
|
||||
"for prompt in prompts:\n",
|
||||
" print(generate_hermes_prompt(prompt, functions))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ggml_init_cublas: GGML_CUDA_FORCE_MMQ: no\n",
|
||||
"ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes\n",
|
||||
"ggml_init_cublas: found 1 CUDA devices:\n",
|
||||
" Device 0: NVIDIA GeForce RTX 2060, compute capability 7.5\n",
|
||||
"llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from ../../models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf (version GGUF V3 (latest))\n",
|
||||
"llama_model_loader: - tensor 0: token_embd.weight q4_K [ 4096, 32002, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 1: blk.0.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 2: blk.0.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 3: blk.0.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 4: blk.0.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 5: blk.0.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 6: blk.0.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 7: blk.0.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 8: blk.0.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 9: blk.0.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 10: blk.1.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 11: blk.1.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 12: blk.1.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 13: blk.1.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 14: blk.1.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 15: blk.1.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 16: blk.1.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 17: blk.1.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 18: blk.1.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 19: blk.2.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 20: blk.2.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 21: blk.2.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 22: blk.2.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 23: blk.2.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 24: blk.2.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 25: blk.2.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 26: blk.2.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 27: blk.2.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 28: blk.3.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 29: blk.3.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 30: blk.3.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 31: blk.3.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 32: blk.3.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 33: blk.3.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 34: blk.3.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 35: blk.3.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 36: blk.3.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 37: blk.4.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 38: blk.4.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 39: blk.4.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 40: blk.4.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 41: blk.4.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 42: blk.4.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 43: blk.4.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 44: blk.4.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 45: blk.4.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 46: blk.5.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 47: blk.5.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 48: blk.5.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 49: blk.5.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 50: blk.5.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 51: blk.5.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 52: blk.5.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 53: blk.5.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 54: blk.5.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 55: blk.6.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 56: blk.6.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 57: blk.6.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 58: blk.6.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 59: blk.6.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 60: blk.6.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 61: blk.6.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 62: blk.6.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 63: blk.6.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 64: blk.7.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 65: blk.7.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 66: blk.7.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 67: blk.7.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 68: blk.7.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 69: blk.7.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 70: blk.7.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 71: blk.7.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 72: blk.7.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 73: blk.8.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 74: blk.8.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 75: blk.8.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 76: blk.8.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 77: blk.8.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 78: blk.8.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 79: blk.8.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 80: blk.8.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 81: blk.8.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 82: blk.9.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 83: blk.9.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 84: blk.9.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 85: blk.9.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 86: blk.9.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 87: blk.9.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 88: blk.9.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 89: blk.9.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 90: blk.9.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 91: blk.10.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 92: blk.10.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 93: blk.10.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 94: blk.10.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 95: blk.10.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 96: blk.10.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 97: blk.10.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 98: blk.10.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 99: blk.10.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 100: blk.11.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 101: blk.11.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 102: blk.11.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 103: blk.11.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 104: blk.11.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 105: blk.11.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 106: blk.11.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 107: blk.11.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 108: blk.11.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 109: blk.12.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 110: blk.12.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 111: blk.12.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 112: blk.12.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 113: blk.12.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 114: blk.12.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 115: blk.12.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 116: blk.12.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 117: blk.12.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 118: blk.13.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 119: blk.13.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 120: blk.13.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 121: blk.13.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 122: blk.13.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 123: blk.13.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 124: blk.13.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 125: blk.13.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 126: blk.13.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 127: blk.14.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 128: blk.14.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 129: blk.14.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 130: blk.14.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 131: blk.14.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 132: blk.14.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 133: blk.14.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 134: blk.14.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 135: blk.14.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 136: blk.15.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 137: blk.15.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 138: blk.15.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 139: blk.15.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 140: blk.15.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 141: blk.15.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 142: blk.15.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 143: blk.15.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 144: blk.15.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 145: blk.16.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 146: blk.16.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 147: blk.16.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 148: blk.16.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 149: blk.16.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 150: blk.16.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 151: blk.16.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 152: blk.16.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 153: blk.16.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 154: blk.17.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 155: blk.17.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 156: blk.17.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 157: blk.17.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 158: blk.17.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 159: blk.17.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 160: blk.17.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 161: blk.17.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 162: blk.17.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 163: blk.18.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 164: blk.18.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 165: blk.18.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 166: blk.18.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 167: blk.18.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 168: blk.18.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 169: blk.18.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 170: blk.18.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 171: blk.18.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 172: blk.19.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 173: blk.19.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 174: blk.19.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 175: blk.19.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 176: blk.19.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 177: blk.19.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 178: blk.19.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 179: blk.19.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 180: blk.19.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 181: blk.20.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 182: blk.20.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 183: blk.20.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 184: blk.20.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 185: blk.20.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 186: blk.20.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 187: blk.20.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 188: blk.20.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 189: blk.20.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 190: blk.21.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 191: blk.21.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 192: blk.21.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 193: blk.21.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 194: blk.21.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 195: blk.21.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 196: blk.21.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 197: blk.21.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 198: blk.21.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 199: blk.22.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 200: blk.22.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 201: blk.22.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 202: blk.22.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 203: blk.22.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 204: blk.22.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 205: blk.22.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 206: blk.22.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 207: blk.22.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 208: blk.23.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 209: blk.23.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 210: blk.23.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 211: blk.23.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 212: blk.23.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 213: blk.23.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 214: blk.23.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 215: blk.23.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 216: blk.23.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 217: blk.24.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 218: blk.24.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 219: blk.24.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 220: blk.24.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 221: blk.24.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 222: blk.24.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 223: blk.24.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 224: blk.24.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 225: blk.24.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 226: blk.25.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 227: blk.25.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 228: blk.25.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 229: blk.25.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 230: blk.25.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 231: blk.25.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 232: blk.25.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 233: blk.25.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 234: blk.25.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 235: blk.26.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 236: blk.26.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 237: blk.26.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 238: blk.26.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 239: blk.26.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 240: blk.26.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 241: blk.26.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 242: blk.26.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 243: blk.26.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 244: blk.27.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 245: blk.27.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 246: blk.27.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 247: blk.27.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 248: blk.27.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 249: blk.27.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 250: blk.27.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 251: blk.27.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 252: blk.27.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 253: blk.28.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 254: blk.28.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 255: blk.28.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 256: blk.28.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 257: blk.28.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 258: blk.28.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 259: blk.28.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 260: blk.28.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 261: blk.28.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 262: blk.29.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 263: blk.29.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 264: blk.29.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 265: blk.29.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 266: blk.29.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 267: blk.29.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 268: blk.29.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 269: blk.29.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 270: blk.29.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 271: blk.30.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 272: blk.30.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 273: blk.30.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 274: blk.30.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 275: blk.30.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 276: blk.30.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 277: blk.30.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 278: blk.30.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 279: blk.30.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 280: blk.31.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 281: blk.31.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 282: blk.31.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 283: blk.31.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 284: blk.31.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 285: blk.31.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 286: blk.31.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 287: blk.31.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 288: blk.31.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 289: output_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
|
||||
"llama_model_loader: - tensor 290: output.weight q6_K [ 4096, 32002, 1, 1 ]\n",
|
||||
"llama_model_loader: - kv 0: general.architecture str = llama\n",
|
||||
"llama_model_loader: - kv 1: general.name str = teknium_openhermes-2.5-mistral-7b\n",
|
||||
"llama_model_loader: - kv 2: llama.context_length u32 = 32768\n",
|
||||
"llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n",
|
||||
"llama_model_loader: - kv 4: llama.block_count u32 = 32\n",
|
||||
"llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336\n",
|
||||
"llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n",
|
||||
"llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n",
|
||||
"llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8\n",
|
||||
"llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n",
|
||||
"llama_model_loader: - kv 10: llama.rope.freq_base f32 = 10000.000000\n",
|
||||
"llama_model_loader: - kv 11: general.file_type u32 = 15\n",
|
||||
"llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n",
|
||||
"llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32002] = [\"<unk>\", \"<s>\", \"</s>\", \"<0x00>\", \"<...\n",
|
||||
"llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32002] = [0.000000, 0.000000, 0.000000, 0.0000...\n",
|
||||
"llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32002] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n",
|
||||
"llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n",
|
||||
"llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 32000\n",
|
||||
"llama_model_loader: - kv 18: tokenizer.ggml.padding_token_id u32 = 0\n",
|
||||
"llama_model_loader: - kv 19: general.quantization_version u32 = 2\n",
|
||||
"llama_model_loader: - type f32: 65 tensors\n",
|
||||
"llama_model_loader: - type q4_K: 193 tensors\n",
|
||||
"llama_model_loader: - type q6_K: 33 tensors\n",
|
||||
"llm_load_vocab: special tokens definition check successful ( 261/32002 ).\n",
|
||||
"llm_load_print_meta: format = GGUF V3 (latest)\n",
|
||||
"llm_load_print_meta: arch = llama\n",
|
||||
"llm_load_print_meta: vocab type = SPM\n",
|
||||
"llm_load_print_meta: n_vocab = 32002\n",
|
||||
"llm_load_print_meta: n_merges = 0\n",
|
||||
"llm_load_print_meta: n_ctx_train = 32768\n",
|
||||
"llm_load_print_meta: n_embd = 4096\n",
|
||||
"llm_load_print_meta: n_head = 32\n",
|
||||
"llm_load_print_meta: n_head_kv = 8\n",
|
||||
"llm_load_print_meta: n_layer = 32\n",
|
||||
"llm_load_print_meta: n_rot = 128\n",
|
||||
"llm_load_print_meta: n_gqa = 4\n",
|
||||
"llm_load_print_meta: f_norm_eps = 0.0e+00\n",
|
||||
"llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n",
|
||||
"llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
|
||||
"llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
|
||||
"llm_load_print_meta: n_ff = 14336\n",
|
||||
"llm_load_print_meta: rope scaling = linear\n",
|
||||
"llm_load_print_meta: freq_base_train = 10000.0\n",
|
||||
"llm_load_print_meta: freq_scale_train = 1\n",
|
||||
"llm_load_print_meta: n_yarn_orig_ctx = 32768\n",
|
||||
"llm_load_print_meta: rope_finetuned = unknown\n",
|
||||
"llm_load_print_meta: model type = 7B\n",
|
||||
"llm_load_print_meta: model ftype = mostly Q4_K - Medium\n",
|
||||
"llm_load_print_meta: model params = 7.24 B\n",
|
||||
"llm_load_print_meta: model size = 4.07 GiB (4.83 BPW) \n",
|
||||
"llm_load_print_meta: general.name = teknium_openhermes-2.5-mistral-7b\n",
|
||||
"llm_load_print_meta: BOS token = 1 '<s>'\n",
|
||||
"llm_load_print_meta: EOS token = 32000 '<|im_end|>'\n",
|
||||
"llm_load_print_meta: UNK token = 0 '<unk>'\n",
|
||||
"llm_load_print_meta: PAD token = 0 '<unk>'\n",
|
||||
"llm_load_print_meta: LF token = 13 '<0x0A>'\n",
|
||||
"llm_load_tensors: ggml ctx size = 0.11 MiB\n",
|
||||
"llm_load_tensors: using CUDA for GPU acceleration\n",
|
||||
"llm_load_tensors: mem required = 70.42 MiB\n",
|
||||
"llm_load_tensors: offloading 32 repeating layers to GPU\n",
|
||||
"llm_load_tensors: offloading non-repeating layers to GPU\n",
|
||||
"llm_load_tensors: offloaded 35/35 layers to GPU\n",
|
||||
"llm_load_tensors: VRAM used: 4095.06 MiB\n",
|
||||
"...............................................................................................\n",
|
||||
"llama_new_context_with_model: n_ctx = 2048\n",
|
||||
"llama_new_context_with_model: freq_base = 10000.0\n",
|
||||
"llama_new_context_with_model: freq_scale = 1\n",
|
||||
"llama_kv_cache_init: offloading v cache to GPU\n",
|
||||
"llama_kv_cache_init: offloading k cache to GPU\n",
|
||||
"llama_kv_cache_init: VRAM kv self = 256.00 MiB\n",
|
||||
"llama_new_context_with_model: kv self size = 256.00 MiB\n",
|
||||
"llama_build_graph: non-view tensors processed: 740/740\n",
|
||||
"llama_new_context_with_model: compute buffer total size = 159.07 MiB\n",
|
||||
"llama_new_context_with_model: VRAM scratch buffer: 156.00 MiB\n",
|
||||
"llama_new_context_with_model: total VRAM used: 4507.07 MiB (model: 4095.06 MiB, context: 412.00 MiB)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import llama_cpp\n",
|
||||
"\n",
|
||||
"llama = llama_cpp.Llama(model_path=\"../../models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf\", n_gpu_layers=-1, n_ctx=2048, verbose=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[{'name': 'get_weather', 'arguments': {'zip_code': '10001'}}]\n",
|
||||
"====================================================================================================\n",
|
||||
"[{'name': 'calculate_mortgage_payment', 'arguments': {'loan_amount': 200000, 'interest_rate': 0.04, 'loan_term': 30}}]\n",
|
||||
"====================================================================================================\n",
|
||||
"Unfortunately, I do not have a built-in function to check currency exchange rates. However, you can use third-party APIs or websites like Google Finance or XE to get this information.\n",
|
||||
"====================================================================================================\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"prompts = [\n",
|
||||
" \"What's the weather in 10001?\",\n",
|
||||
" \"Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.\",\n",
|
||||
" \"What's the current exchange rate for USD to EUR?\"\n",
|
||||
"]\n",
|
||||
"functions = [get_weather, calculate_mortgage_payment, get_article_details]\n",
|
||||
"\n",
|
||||
"for prompt in prompts:\n",
|
||||
" prompt = generate_hermes_prompt(prompt, functions)\n",
|
||||
" completion = llama.create_completion(prompt, max_tokens=-1)[\"choices\"][0][\"text\"]\n",
|
||||
" function_calls = extract_function_calls(completion)\n",
|
||||
" if function_calls:\n",
|
||||
" print(function_calls)\n",
|
||||
" else:\n",
|
||||
" print(completion.strip())\n",
|
||||
" print(\"=\"*100)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"get_weather\n",
|
||||
"{'zip_code': '05751'}\n",
|
||||
"====================================================================================================\n",
|
||||
"get_weather\n",
|
||||
"{'zip_code': '05751'}\n",
|
||||
"get_weather\n",
|
||||
"{'zip_code': '07030'}\n",
|
||||
"calculate_mortgage_payment\n",
|
||||
"{'loan_amount': 250000, 'interest_rate': 4.18, 'loan_term': 30}\n",
|
||||
"====================================================================================================\n",
|
||||
"I don't have a function to get exchange rates, but I can provide some resources where you can find this information. You can check websites like Google Finance, XE.com, or Yahoo Finance for up-to-date currency exchange rates.\n",
|
||||
"====================================================================================================\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"prompts = [\n",
|
||||
" \"What's the weather in 05751?\",\n",
|
||||
" \"I'm planning a trip to Killington, Vermont (05751) from Hoboken, NJ (07030). Can you get me weather for both locations and directions?\",\n",
|
||||
" \"What's the current exchange rate for USD to EUR?\"\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"for prompt in prompts:\n",
|
||||
" completion = llama.create_completion(generate_hermes_prompt(prompt, functions), max_tokens=-1)[\"choices\"][0][\"text\"]\n",
|
||||
" function_calls = extract_function_calls(completion)\n",
|
||||
"\n",
|
||||
" if function_calls:\n",
|
||||
" for function in function_calls:\n",
|
||||
" print(function[\"name\"])\n",
|
||||
" print(function[\"arguments\"])\n",
|
||||
" else:\n",
|
||||
" print(completion.strip())\n",
|
||||
"\n",
|
||||
" print(\"=\"*100)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.5+"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
from .llama_cpp import *
|
||||
from .llama import *
|
||||
|
||||
__version__ = "0.2.38"
|
||||
__version__ = "0.2.43"
|
|
@ -42,6 +42,8 @@ class _LlamaModel:
|
|||
|
||||
self._llama_free_model = llama_cpp._lib.llama_free_model # type: ignore
|
||||
|
||||
self.model = None
|
||||
|
||||
if not os.path.exists(path_model):
|
||||
raise ValueError(f"Model path does not exist: {path_model}")
|
||||
|
||||
|
@ -248,6 +250,7 @@ class _LlamaContext:
|
|||
self.verbose = verbose
|
||||
|
||||
self._llama_free = llama_cpp._lib.llama_free # type: ignore
|
||||
self.ctx = None
|
||||
|
||||
assert self.model.model is not None
|
||||
|
||||
|
@ -497,6 +500,7 @@ class _LlamaBatch:
|
|||
|
||||
self._llama_batch_free = llama_cpp._lib.llama_batch_free # type: ignore
|
||||
|
||||
self.batch = None
|
||||
self.batch = llama_cpp.llama_batch_init(
|
||||
self.n_tokens, self.embd, self.n_seq_max
|
||||
)
|
||||
|
@ -506,6 +510,14 @@ class _LlamaBatch:
|
|||
self._llama_batch_free(self.batch)
|
||||
self.batch = None
|
||||
|
||||
def n_tokens(self) -> int:
    """Return the token count currently recorded on the underlying batch.

    Returns:
        The value of the ``n_tokens`` field of ``self.batch``.

    NOTE(review): the constructor appears to read ``self.n_tokens`` as a plain
    value when calling ``llama_batch_init``; if an instance attribute of that
    name is assigned there, it would shadow this method on instances -- confirm.
    """
    native_batch = self.batch
    assert native_batch is not None
    return native_batch.n_tokens
|
||||
|
||||
def reset(self):
    """Mark the batch as empty by setting its token count back to zero.

    Only the ``n_tokens`` field of ``self.batch`` is written; nothing else on
    the batch is modified here.
    """
    native_batch = self.batch
    assert native_batch is not None
    native_batch.n_tokens = 0
|
||||
|
||||
def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool):
|
||||
assert self.batch is not None
|
||||
n_tokens = len(batch)
|
||||
|
@ -518,6 +530,20 @@ class _LlamaBatch:
|
|||
self.batch.logits[i] = logits_all
|
||||
self.batch.logits[n_tokens - 1] = True
|
||||
|
||||
def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool):
    """Append one token sequence to the tokens already stored in the batch.

    Args:
        batch: Token ids of the sequence to append.
        seq_id: Sequence id recorded for every appended token.
        logits_all: Value written to the logits flag of every appended token;
            the last appended token is always flagged regardless.

    Positions restart at 0 for the appended sequence. Appending an empty
    sequence leaves the batch unchanged.
    """
    assert self.batch is not None
    n_tokens = len(batch)
    if n_tokens == 0:
        # Nothing to append. Also avoids writing logits[-1], which would land
        # outside the appended range (and, if logits is a ctypes pointer as
        # returned via llama_batch_init, presumably outside the buffer).
        return
    n_tokens0 = self.batch.n_tokens
    self.batch.n_tokens += n_tokens
    for i, token in enumerate(batch):
        j = n_tokens0 + i
        self.batch.token[j] = token
        self.batch.pos[j] = i
        self.batch.seq_id[j][0] = seq_id
        self.batch.n_seq_id[j] = 1
        self.batch.logits[j] = logits_all
    # Flag the last token of *this* sequence: its slot is offset by the number
    # of tokens that were already in the batch before the append.
    self.batch.logits[n_tokens0 + n_tokens - 1] = True
|
||||
|
||||
|
||||
class _LlamaTokenDataArray:
|
||||
def __init__(self, *, n_vocab: int):
|
||||
|
|
|
@ -19,6 +19,8 @@ from collections import deque
|
|||
|
||||
import ctypes
|
||||
|
||||
from llama_cpp.llama_types import List
|
||||
|
||||
from .llama_types import *
|
||||
from .llama_grammar import LlamaGrammar
|
||||
from .llama_cache import (
|
||||
|
@ -27,6 +29,10 @@ from .llama_cache import (
|
|||
LlamaDiskCache, # type: ignore
|
||||
LlamaRAMCache, # type: ignore
|
||||
)
|
||||
from .llama_tokenizer import (
|
||||
BaseLlamaTokenizer,
|
||||
LlamaTokenizer
|
||||
)
|
||||
import llama_cpp.llama_cpp as llama_cpp
|
||||
import llama_cpp.llama_chat_format as llama_chat_format
|
||||
|
||||
|
@ -44,6 +50,9 @@ from ._internals import (
|
|||
_LlamaSamplingContext, # type: ignore
|
||||
)
|
||||
from ._logger import set_verbose
|
||||
from ._utils import (
|
||||
suppress_stdout_stderr
|
||||
)
|
||||
|
||||
|
||||
class Llama:
|
||||
|
@ -95,6 +104,8 @@ class Llama:
|
|||
chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
|
||||
# Speculative Decoding
|
||||
draft_model: Optional[LlamaDraftModel] = None,
|
||||
# Tokenizer Override
|
||||
tokenizer: Optional[BaseLlamaTokenizer] = None,
|
||||
# Misc
|
||||
verbose: bool = True,
|
||||
# Extra Params
|
||||
|
@ -159,6 +170,7 @@ class Llama:
|
|||
chat_format: String specifying the chat format to use when calling create_chat_completion.
|
||||
chat_handler: Optional chat handler to use when calling create_chat_completion.
|
||||
draft_model: Optional draft model to use for speculative decoding.
|
||||
tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp.
|
||||
verbose: Print verbose output to stderr.
|
||||
|
||||
Raises:
|
||||
|
@ -173,7 +185,8 @@ class Llama:
|
|||
|
||||
self.numa = numa
|
||||
if not Llama.__backend_initialized:
|
||||
llama_cpp.llama_backend_init(self.numa)
|
||||
with suppress_stdout_stderr(disable=verbose):
|
||||
llama_cpp.llama_backend_init(self.numa)
|
||||
Llama.__backend_initialized = True
|
||||
|
||||
self.model_path = model_path
|
||||
|
@ -235,6 +248,7 @@ class Llama:
|
|||
self.n_threads_batch = n_threads_batch or max(
|
||||
multiprocessing.cpu_count() // 2, 1
|
||||
)
|
||||
|
||||
# Context Params
|
||||
self.context_params = llama_cpp.llama_context_default_params()
|
||||
self.context_params.seed = seed
|
||||
|
@ -267,7 +281,7 @@ class Llama:
|
|||
)
|
||||
self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
|
||||
self.context_params.mul_mat_q = mul_mat_q
|
||||
self.context_params.logits_all = logits_all
|
||||
self.context_params.logits_all = logits_all if draft_model is None else True # Must be set to True for speculative decoding
|
||||
self.context_params.embedding = embedding
|
||||
self.context_params.offload_kqv = offload_kqv
|
||||
|
||||
|
@ -286,6 +300,10 @@ class Llama:
|
|||
self._model = _LlamaModel(
|
||||
path_model=self.model_path, params=self.model_params, verbose=self.verbose
|
||||
)
|
||||
|
||||
# Override tokenizer
|
||||
self.tokenizer_ = tokenizer or LlamaTokenizer(self)
|
||||
|
||||
# Set the default value for the context and correct the batch
|
||||
if n_ctx == 0:
|
||||
n_ctx = self._model.n_ctx_train()
|
||||
|
@ -431,18 +449,19 @@ class Llama:
|
|||
Returns:
|
||||
A list of tokens.
|
||||
"""
|
||||
return self._model.tokenize(text, add_bos, special)
|
||||
return self.tokenizer_.tokenize(text, add_bos, special)
|
||||
|
||||
def detokenize(self, tokens: List[int]) -> bytes:
|
||||
def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes:
|
||||
"""Detokenize a list of tokens.
|
||||
|
||||
Args:
|
||||
tokens: The list of tokens to detokenize.
|
||||
prev_tokens: The list of previous tokens. Offset mapping will be performed if provided
|
||||
|
||||
Returns:
|
||||
The detokenized string.
|
||||
"""
|
||||
return self._model.detokenize(tokens)
|
||||
return self.tokenizer_.detokenize(tokens, prev_tokens)
|
||||
|
||||
def set_cache(self, cache: Optional[BaseLlamaCache]):
|
||||
"""Set the cache.
|
||||
|
@ -538,7 +557,7 @@ class Llama:
|
|||
logits[:] = (
|
||||
logits_processor(self._input_ids, logits)
|
||||
if idx is None
|
||||
else logits_processor(self._input_ids[:idx], logits)
|
||||
else logits_processor(self._input_ids[:idx + 1], logits)
|
||||
)
|
||||
|
||||
sampling_params = _LlamaSamplingParams(
|
||||
|
@ -698,10 +717,53 @@ class Llama:
|
|||
Returns:
|
||||
An embedding object.
|
||||
"""
|
||||
assert self._ctx.ctx is not None
|
||||
assert self._model.model is not None
|
||||
model_name: str = model if model is not None else self.model_path
|
||||
|
||||
# get numeric embeddings
|
||||
embeds: List[List[float]]
|
||||
total_tokens: int
|
||||
embeds, total_tokens = self.embed(input, return_count=True) # type: ignore
|
||||
|
||||
# convert to CreateEmbeddingResponse
|
||||
data: List[Embedding] = [
|
||||
{
|
||||
"object": "embedding",
|
||||
"embedding": emb,
|
||||
"index": idx,
|
||||
}
|
||||
for idx, emb in enumerate(embeds)
|
||||
]
|
||||
|
||||
return {
|
||||
"object": "list",
|
||||
"data": data,
|
||||
"model": model_name,
|
||||
"usage": {
|
||||
"prompt_tokens": total_tokens,
|
||||
"total_tokens": total_tokens,
|
||||
},
|
||||
}
|
||||
|
||||
def embed(
|
||||
self,
|
||||
input: Union[str, List[str]],
|
||||
normalize: bool = True,
|
||||
truncate: bool = True,
|
||||
return_count: bool = False,
|
||||
):
|
||||
"""Embed a string.
|
||||
|
||||
Args:
|
||||
input: The utf-8 encoded string to embed.
|
||||
|
||||
Returns:
|
||||
A list of embeddings
|
||||
"""
|
||||
assert self._ctx.ctx is not None
|
||||
n_embd = self.n_embd()
|
||||
n_ctx = self.n_ctx()
|
||||
|
||||
if self.context_params.embedding == False:
|
||||
raise RuntimeError(
|
||||
"Llama model must be created with embedding=True to call this method"
|
||||
|
@ -715,48 +777,72 @@ class Llama:
|
|||
else:
|
||||
inputs = input
|
||||
|
||||
data: List[Embedding] = []
|
||||
# reset batch
|
||||
self._batch.reset()
|
||||
|
||||
# decode and fetch embeddings
|
||||
data: List[List[float]] = []
|
||||
def decode_batch(sizes: List[int]):
|
||||
assert self._ctx.ctx is not None
|
||||
llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
|
||||
self._ctx.decode(self._batch)
|
||||
self._batch.reset()
|
||||
|
||||
# store embeddings
|
||||
for i, s in enumerate(sizes):
|
||||
embedding = llama_cpp.llama_get_embeddings_ith(self._ctx.ctx, i)[
|
||||
:n_embd
|
||||
]
|
||||
norm = np.linalg.norm(embedding) if normalize else s
|
||||
embedding: List[float] = [v / float(norm) for v in embedding]
|
||||
data.append(embedding)
|
||||
|
||||
# init state
|
||||
total_tokens = 0
|
||||
for index, input in enumerate(inputs):
|
||||
tokens = self.tokenize(input.encode("utf-8"), special=True)
|
||||
self.reset()
|
||||
self.eval(tokens)
|
||||
t_batch = 0
|
||||
s_sizes: List[int] = []
|
||||
|
||||
# accumulate batches and encode
|
||||
for text in inputs:
|
||||
tokens = self.tokenize(text.encode("utf-8"))
|
||||
if truncate:
|
||||
tokens = tokens[:n_ctx]
|
||||
|
||||
n_tokens = len(tokens)
|
||||
total_tokens += n_tokens
|
||||
embedding = llama_cpp.llama_get_embeddings(self._ctx.ctx)[
|
||||
: llama_cpp.llama_n_embd(self._model.model)
|
||||
]
|
||||
|
||||
data.append(
|
||||
{
|
||||
"object": "embedding",
|
||||
"embedding": embedding,
|
||||
"index": index,
|
||||
}
|
||||
)
|
||||
# check for overrun
|
||||
if n_tokens > n_ctx:
|
||||
raise ValueError(
|
||||
f"Requested tokens ({n_tokens}) exceed context window of {n_ctx}"
|
||||
)
|
||||
|
||||
# time to eval batch
|
||||
if t_batch + n_tokens > self._n_ctx:
|
||||
decode_batch(s_sizes)
|
||||
t_batch = 0
|
||||
s_sizes = []
|
||||
|
||||
# add to batch
|
||||
self._batch.add_sequence(tokens, len(s_sizes), False)
|
||||
t_batch += n_tokens
|
||||
s_sizes.append(n_tokens)
|
||||
|
||||
# handle last batch
|
||||
decode_batch(s_sizes)
|
||||
|
||||
if self.verbose:
|
||||
llama_cpp.llama_print_timings(self._ctx.ctx)
|
||||
|
||||
return {
|
||||
"object": "list",
|
||||
"data": data,
|
||||
"model": model_name,
|
||||
"usage": {
|
||||
"prompt_tokens": total_tokens,
|
||||
"total_tokens": total_tokens,
|
||||
},
|
||||
}
|
||||
output = data[0] if isinstance(input, str) else data
|
||||
|
||||
def embed(self, input: str) -> List[float]:
|
||||
"""Embed a string.
|
||||
llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
|
||||
self.reset()
|
||||
|
||||
Args:
|
||||
input: The utf-8 encoded string to embed.
|
||||
|
||||
Returns:
|
||||
A list of embeddings
|
||||
"""
|
||||
return list(map(float, self.create_embedding(input)["data"][0]["embedding"]))
|
||||
if return_count:
|
||||
return output, total_tokens
|
||||
else:
|
||||
return output
|
||||
|
||||
def _create_completion(
|
||||
self,
|
||||
|
@ -1552,6 +1638,38 @@ class Llama:
|
|||
logit_bias=logit_bias,
|
||||
)
|
||||
|
||||
def create_chat_completion_openai_v1(
|
||||
self,
|
||||
*args: Any,
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""Generate a chat completion with return type based on the OpenAI v1 API.
|
||||
|
||||
OpenAI python package is required to use this method.
|
||||
|
||||
You can install it with `pip install openai`.
|
||||
|
||||
Args:
|
||||
*args: Positional arguments to pass to create_chat_completion.
|
||||
**kwargs: Keyword arguments to pass to create_chat_completion.
|
||||
|
||||
Returns:
|
||||
Generated chat completion or a stream of chat completion chunks.
|
||||
"""
|
||||
try:
|
||||
from openai.types.chat import ChatCompletion, ChatCompletionChunk
|
||||
stream = kwargs.get("stream", False) # type: ignore
|
||||
assert isinstance(stream, bool)
|
||||
if stream:
|
||||
return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore
|
||||
else:
|
||||
return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"To use create_chat_completion_openai_v1, you must install the openai package."
|
||||
"You can install it with `pip install openai`."
|
||||
)
|
||||
|
||||
def __getstate__(self):
|
||||
return dict(
|
||||
model_path=self.model_path,
|
||||
|
@ -1693,8 +1811,8 @@ class Llama:
|
|||
"""Return the vocabulary size."""
|
||||
return self._model.n_vocab()
|
||||
|
||||
def tokenizer(self) -> "LlamaTokenizer":
|
||||
"""Return the tokenizer for this model."""
|
||||
def tokenizer(self) -> LlamaTokenizer:
|
||||
"""Return the llama tokenizer for this model."""
|
||||
return LlamaTokenizer(self)
|
||||
|
||||
def token_eos(self) -> int:
|
||||
|
@ -1738,21 +1856,6 @@ class Llama:
|
|||
return longest_prefix
|
||||
|
||||
|
||||
class LlamaTokenizer:
|
||||
def __init__(self, llama: Llama):
|
||||
self.llama = llama
|
||||
|
||||
def encode(self, text: str, add_bos: bool = True) -> List[int]:
|
||||
return self.llama.tokenize(
|
||||
text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=True
|
||||
)
|
||||
|
||||
def decode(self, tokens: List[int]) -> str:
|
||||
return self.llama.detokenize(tokens).decode("utf-8", errors="ignore")
|
||||
|
||||
@classmethod
|
||||
def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
|
||||
return cls(Llama(model_path=path, vocab_only=True))
|
||||
|
||||
|
||||
class LlamaState:
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -139,9 +139,11 @@ llama_seq_id = c_int32
|
|||
# enum llama_vocab_type {
|
||||
# LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
|
||||
# LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
|
||||
# LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
|
||||
# };
|
||||
LLAMA_VOCAB_TYPE_SPM = 0
|
||||
LLAMA_VOCAB_TYPE_BPE = 1
|
||||
LLAMA_VOCAB_TYPE_WPM = 2
|
||||
|
||||
|
||||
# enum llama_token_type {
|
||||
|
@ -468,6 +470,7 @@ class llama_model_params(Structure):
|
|||
# bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
||||
# bool embedding; // embedding mode only
|
||||
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
# bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
|
||||
# };
|
||||
class llama_context_params(Structure):
|
||||
"""Parameters for llama_context
|
||||
|
@ -494,6 +497,7 @@ class llama_context_params(Structure):
|
|||
logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
||||
embedding (bool): embedding mode only
|
||||
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
|
||||
do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
|
||||
"""
|
||||
|
||||
_fields_ = [
|
||||
|
@ -518,6 +522,7 @@ class llama_context_params(Structure):
|
|||
("logits_all", c_bool),
|
||||
("embedding", c_bool),
|
||||
("offload_kqv", c_bool),
|
||||
("do_pooling", c_bool),
|
||||
]
|
||||
|
||||
|
||||
|
@ -1697,6 +1702,21 @@ _lib.llama_get_embeddings.argtypes = [llama_context_p]
|
|||
_lib.llama_get_embeddings.restype = c_float_p
|
||||
|
||||
|
||||
# // Get the embeddings for the ith sequence
|
||||
# // llama_get_embeddings(ctx) + i*n_embd
|
||||
# LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
|
||||
def llama_get_embeddings_ith(
|
||||
ctx: llama_context_p, i: Union[c_int32, int]
|
||||
): # type: (...) -> Array[float] # type: ignore
|
||||
"""Get the embeddings for the ith sequence
|
||||
llama_get_embeddings(ctx) + i*n_embd"""
|
||||
return _lib.llama_get_embeddings_ith(ctx, i)
|
||||
|
||||
|
||||
_lib.llama_get_embeddings_ith.argtypes = [llama_context_p, c_int32]
|
||||
_lib.llama_get_embeddings_ith.restype = c_float_p
|
||||
|
||||
|
||||
# //
|
||||
# // Vocab
|
||||
# //
|
||||
|
|
|
@ -1471,12 +1471,15 @@ class SchemaConverter:
|
|||
|
||||
if schema_type == "object" and "properties" in schema:
|
||||
# TODO: `required` keyword
|
||||
prop_order = self._prop_order
|
||||
prop_pairs = sorted(
|
||||
schema["properties"].items(),
|
||||
# sort by position in prop_order (if specified) then by key
|
||||
key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
|
||||
)
|
||||
if self._prop_order:
|
||||
prop_order = self._prop_order
|
||||
prop_pairs = sorted(
|
||||
schema["properties"].items(),
|
||||
# sort by position in prop_order (if specified) then by key
|
||||
key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
|
||||
)
|
||||
else:
|
||||
prop_pairs = schema["properties"].items()
|
||||
|
||||
rule = '"{" space'
|
||||
for i, (prop_name, prop_schema) in enumerate(prop_pairs):
|
||||
|
|
95
llama_cpp/llama_tokenizer.py
Normal file
95
llama_cpp/llama_tokenizer.py
Normal file
|
@ -0,0 +1,95 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import abc
|
||||
from typing import (
|
||||
List,
|
||||
Optional,
|
||||
Any,
|
||||
)
|
||||
|
||||
import llama_cpp
|
||||
from llama_cpp.llama_types import List
|
||||
|
||||
|
||||
class BaseLlamaTokenizer(abc.ABC):
|
||||
@abc.abstractmethod
|
||||
def tokenize(
|
||||
self, text: bytes, add_bos: bool = True, special: bool = True
|
||||
) -> List[int]:
|
||||
raise NotImplementedError
|
||||
|
||||
@abc.abstractmethod
|
||||
def detokenize(
|
||||
self, tokens: List[int], prev_tokens: Optional[List[int]] = None
|
||||
) -> bytes:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class LlamaTokenizer(BaseLlamaTokenizer):
|
||||
def __init__(self, llama: llama_cpp.Llama):
|
||||
self._model = llama._model # type: ignore
|
||||
|
||||
def tokenize(
|
||||
self, text: bytes, add_bos: bool = True, special: bool = True
|
||||
) -> List[int]:
|
||||
return self._model.tokenize(text, add_bos=add_bos, special=special)
|
||||
|
||||
def detokenize(
|
||||
self, tokens: List[int], prev_tokens: Optional[List[int]] = None
|
||||
) -> bytes:
|
||||
if prev_tokens is not None:
|
||||
return self._model.detokenize(tokens[len(prev_tokens) :])
|
||||
else:
|
||||
return self._model.detokenize(tokens)
|
||||
|
||||
def encode(
|
||||
self, text: str, add_bos: bool = True, special: bool = True
|
||||
) -> List[int]:
|
||||
return self.tokenize(
|
||||
text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special
|
||||
)
|
||||
|
||||
def decode(self, tokens: List[int]) -> str:
|
||||
return self.detokenize(tokens).decode("utf-8", errors="ignore")
|
||||
|
||||
@classmethod
|
||||
def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
|
||||
return cls(llama_cpp.Llama(model_path=path, vocab_only=True))
|
||||
|
||||
|
||||
class LlamaHFTokenizer(BaseLlamaTokenizer):
|
||||
def __init__(self, hf_tokenizer: Any):
|
||||
self.hf_tokenizer = hf_tokenizer
|
||||
|
||||
def tokenize(
|
||||
self, text: bytes, add_bos: bool = True, special: bool = True
|
||||
) -> List[int]:
|
||||
return self.hf_tokenizer.encode(
|
||||
text.decode("utf-8", errors="ignore"), add_special_tokens=special
|
||||
)
|
||||
|
||||
def detokenize(
|
||||
self, tokens: List[int], prev_tokens: Optional[List[int]] = None
|
||||
) -> bytes:
|
||||
if prev_tokens is not None:
|
||||
text = self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore")
|
||||
prev_text = self.hf_tokenizer.decode(prev_tokens).encode(
|
||||
"utf-8", errors="ignore"
|
||||
)
|
||||
return text[len(prev_text) :]
|
||||
else:
|
||||
return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore")
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer":
|
||||
try:
|
||||
from transformers import AutoTokenizer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"The `transformers` library is required to use the `HFTokenizer`."
|
||||
"You can install it with `pip install transformers`."
|
||||
)
|
||||
hf_tokenizer = AutoTokenizer.from_pretrained(
|
||||
pretrained_model_name_or_path=pretrained_model_name_or_path
|
||||
)
|
||||
return cls(hf_tokenizer)
|
|
@ -97,7 +97,7 @@ class CreateChatCompletionResponse(TypedDict):
|
|||
|
||||
|
||||
class ChatCompletionMessageToolCallChunkFunction(TypedDict):
|
||||
name: str
|
||||
name: Optional[str]
|
||||
arguments: str
|
||||
|
||||
|
||||
|
@ -118,12 +118,12 @@ class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict):
|
|||
|
||||
|
||||
class ChatCompletionStreamResponseDelta(TypedDict):
|
||||
content: NotRequired[str]
|
||||
content: NotRequired[Optional[str]]
|
||||
function_call: NotRequired[
|
||||
ChatCompletionStreamResponseDeltaFunctionCall
|
||||
Optional[ChatCompletionStreamResponseDeltaFunctionCall]
|
||||
] # DEPRECATED
|
||||
tool_calls: NotRequired[List[ChatCompletionMessageToolCallChunk]]
|
||||
role: NotRequired[Literal["system", "user", "assistant", "tool"]]
|
||||
tool_calls: NotRequired[Optional[List[ChatCompletionMessageToolCallChunk]]]
|
||||
role: NotRequired[Optional[Literal["system", "user", "assistant", "tool"]]]
|
||||
|
||||
|
||||
class ChatCompletionStreamResponseChoice(TypedDict):
|
||||
|
@ -132,6 +132,7 @@ class ChatCompletionStreamResponseChoice(TypedDict):
|
|||
ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty
|
||||
]
|
||||
finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]]
|
||||
logprobs: NotRequired[Optional[CompletionLogprobs]]
|
||||
|
||||
|
||||
class CreateChatCompletionStreamResponse(TypedDict):
|
||||
|
|
|
@ -146,30 +146,8 @@ _libllava.llava_eval_image_embed.restype = c_bool
|
|||
################################################
|
||||
|
||||
|
||||
# struct clip_vision_hparams {
|
||||
# int32_t image_size;
|
||||
# int32_t patch_size;
|
||||
# int32_t hidden_size;
|
||||
# int32_t n_intermediate;
|
||||
# int32_t projection_dim;
|
||||
# int32_t n_head;
|
||||
# int32_t n_layer;
|
||||
# float eps;
|
||||
# };
|
||||
class clip_vision_hparams(Structure):
|
||||
_fields_ = [
|
||||
("image_size", c_int32),
|
||||
("patch_size", c_int32),
|
||||
("hidden_size", c_int32),
|
||||
("n_intermediate", c_int32),
|
||||
("projection_dim", c_int32),
|
||||
("n_head", c_int32),
|
||||
("n_layer", c_int32),
|
||||
("eps", c_float),
|
||||
]
|
||||
|
||||
# /** load mmproj model */
|
||||
# CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
|
||||
# CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
|
||||
def clip_model_load(fname: bytes, verbosity: Union[c_int, int]) -> clip_ctx_p:
|
||||
return _libllava.clip_model_load(fname, verbosity)
|
||||
|
||||
|
@ -183,50 +161,3 @@ def clip_free(ctx: clip_ctx_p):
|
|||
|
||||
_libllava.clip_free.argtypes = [clip_ctx_p]
|
||||
_libllava.clip_free.restype = None
|
||||
|
||||
# size_t clip_embd_nbytes(const struct clip_ctx * ctx);
|
||||
# int clip_n_patches(const struct clip_ctx * ctx);
|
||||
# int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
||||
|
||||
# // RGB uint8 image
|
||||
# struct clip_image_u8 {
|
||||
# int nx;
|
||||
# int ny;
|
||||
# uint8_t * data = NULL;
|
||||
# size_t size;
|
||||
# };
|
||||
|
||||
# // RGB float32 image (NHWC)
|
||||
# // Memory layout: RGBRGBRGB...
|
||||
# struct clip_image_f32 {
|
||||
# int nx;
|
||||
# int ny;
|
||||
# float * data = NULL;
|
||||
# size_t size;
|
||||
# };
|
||||
|
||||
# struct clip_image_u8_batch {
|
||||
# struct clip_image_u8 * data;
|
||||
# size_t size;
|
||||
# };
|
||||
|
||||
# struct clip_image_f32_batch {
|
||||
# struct clip_image_f32 * data;
|
||||
# size_t size;
|
||||
# };
|
||||
|
||||
# struct clip_image_u8 * make_clip_image_u8();
|
||||
# struct clip_image_f32 * make_clip_image_f32();
|
||||
# CLIP_API void clip_image_u8_free(clip_image_u8 * img);
|
||||
# CLIP_API void clip_image_f32_free(clip_image_f32 * img);
|
||||
# CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
||||
# /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
|
||||
# CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
|
||||
|
||||
# bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
|
||||
# bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
|
||||
|
||||
# bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
|
||||
# float * vec);
|
||||
|
||||
# bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
|
|
@ -6,6 +6,7 @@ from typing import Dict, Optional, Union, List
|
|||
|
||||
import llama_cpp
|
||||
import llama_cpp.llama_speculative as llama_speculative
|
||||
import llama_cpp.llama_tokenizer as llama_tokenizer
|
||||
|
||||
from llama_cpp.server.settings import ModelSettings
|
||||
|
||||
|
@ -93,6 +94,10 @@ class LlamaProxy:
|
|||
)
|
||||
)
|
||||
|
||||
tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None
|
||||
if settings.hf_pretrained_model_name_or_path is not None:
|
||||
tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained(settings.hf_pretrained_model_name_or_path)
|
||||
|
||||
draft_model = None
|
||||
if settings.draft_model is not None:
|
||||
draft_model = llama_speculative.LlamaPromptLookupDecoding(
|
||||
|
@ -156,6 +161,8 @@ class LlamaProxy:
|
|||
chat_handler=chat_handler,
|
||||
# Speculative Decoding
|
||||
draft_model=draft_model,
|
||||
# Tokenizer
|
||||
tokenizer=tokenizer,
|
||||
# Misc
|
||||
verbose=settings.verbose,
|
||||
)
|
||||
|
|
2
vendor/llama.cpp
vendored
2
vendor/llama.cpp
vendored
|
@ -1 +1 @@
|
|||
Subproject commit 098f6d737b65134cf220d12b9b706e8cfc5e4610
|
||||
Subproject commit 8084d554406b767d36b3250b3b787462d5dd626f
|
Loading…
Reference in a new issue