This commit is contained in:
baalajimaestro 2024-02-14 17:50:11 +05:30
commit 21ac214a38
Signed by: baalajimaestro
GPG key ID: F93C394FE9BBAFD5
22 changed files with 2414 additions and 188 deletions

View file

@ -16,7 +16,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
submodules: "true"
submodules: "recursive"
# Used to host cibuildwheel
- uses: actions/setup-python@v3
@ -48,7 +48,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
submodules: "true"
submodules: "recursive"
- uses: actions/setup-python@v3
with:
python-version: "3.8"

View file

@ -14,7 +14,7 @@ jobs:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: "true"
submodules: "recursive"
- name: Set up QEMU
uses: docker/setup-qemu-action@v2

View file

@ -18,7 +18,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
submodules: "true"
submodules: "recursive"
- name: Set up Python
uses: actions/setup-python@v4
with:

View file

@ -12,7 +12,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
submodules: "true"
submodules: "recursive"
- name: Set up Python
uses: actions/setup-python@v4
with:

View file

@ -19,7 +19,7 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
submodules: "true"
submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
@ -42,7 +42,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
submodules: "true"
submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
@ -65,7 +65,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
submodules: "true"
submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
@ -85,7 +85,7 @@ jobs:
# steps:
# - uses: actions/checkout@v3
# with:
# submodules: "true"
# submodules: "recursive"
# - name: Set up Python 3.8
# uses: actions/setup-python@v4
# with:
@ -112,7 +112,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
submodules: "true"
submodules: "recursive"
- name: Set up Python 3.8
uses: actions/setup-python@v4
with:

View file

@ -7,11 +7,42 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
## [0.2.43]
- feat: Update llama.cpp to ggerganov/llama.cpp@8084d554406b767d36b3250b3b787462d5dd626f
- feat: Support batch embeddings by @iamlemec in #1186
- fix: submodule kompute is not included in sdist by @abetlen in 7dbbfdecadebe7750be650d9409959640ff9a460
- fix: Update openbuddy prompt format by @abetlen in 07a783779a62a4aac0b11161c7e0eb983ff215f8
## [0.2.42]
- feat: Update llama.cpp to ggerganov/llama.cpp@ea9c8e11436ad50719987fa23a289c74b7b40d40
- fix: sample idx off-by-one error for logit_processors by @lapp0 in #1179
- fix: chat formatting bugs in `chatml-function-calling` by @abetlen in 4b0e3320bd8c2c209e29978d0b21e2e471cc9ee3 and 68fb71b6a26a1e57331868f959b47ab4b87851e1
## [0.2.41]
- feat: Update llama.cpp to ggerganov/llama.cpp@895407f31b358e3d9335e847d13f033491ec8a5b
- fix: Don't change order of json schema object properties in generated grammar unless prop_order is passed by @abetlen in d1822fed6b706f38bd1ff0de4dec5baaa3cf84fa
## [0.2.40]
- feat: Update llama.cpp to ggerganov/llama.cpp@3bdc4cd0f595a6096cca4a64aa75ffa8a3503465
- feat: Generic chatml Function Calling using `chat_format="chatml-function-calling"` by @abetlen in #957
- fix: Circular dependency preventing early Llama object free by @notwa in #1176
- docs: Set the correct command for compiling with SYCL support by @akarshanbiswas in #1172
- feat: use gpu backend for clip if available by @iamlemec in #1175
## [0.2.39]
- feat: Update llama.cpp to ggerganov/llama.cpp@b08f22c882a1443e6b97081f3ce718a4d1a741f8
- fix: Fix destructor logging bugs by using llama_log_callback to avoid suppress_stdout_stderr by @abetlen in 59760c85eddc72dfcc1839f43760ef72c23d6874
## [0.2.38]
- feat: Update llama.cpp to ggerganov/llama.cpp@1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915
- feat: Add speculative decoding by @abetlen in #1120
- fix: Pass raise_exception and add_generation_prompt to jinja2 chat template 078cca0361bf5a94d2cf52ed04980d20e32d6f95
- fix: Pass raise_exception and add_generation_prompt to jinja2 chat template by @abetlen in 078cca0361bf5a94d2cf52ed04980d20e32d6f95
## [0.2.37]

View file

@ -46,6 +46,14 @@ if (LLAMA_BUILD)
)
if (LLAVA_BUILD)
if (LLAMA_CUBLAS)
add_compile_definitions(GGML_USE_CUBLAS)
endif()
if (LLAMA_METAL)
add_compile_definitions(GGML_USE_METAL)
endif()
# Building llava
add_subdirectory(vendor/llama.cpp/examples/llava)
set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")

View file

@ -19,10 +19,10 @@ build.opencl:
CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .
build.openblas:
CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .
CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" python3 -m pip install --verbose -e .
build.blis:
CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" python3 -m pip install --verbose -e .
CMAKE_ARGS="-DLLAMA_BLAS=on -DLLAMA_BLAS_VENDOR=FLAME" python3 -m pip install --verbose -e .
build.metal:
CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install --verbose -e .

View file

@ -118,7 +118,8 @@ CMAKE_ARGS="-DLLAMA_KOMPUTE=on" pip install llama-cpp-python
To install with SYCL support, set the `LLAMA_SYCL=on` environment variable before installing:
```bash
CMAKE_ARGS="-DLLAMA_SYCL=on" pip install llama-cpp-python
source /opt/intel/oneapi/setvars.sh
CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pip install llama-cpp-python
```
### Windows Notes
@ -291,14 +292,15 @@ To constrain the response to a specific JSON Schema, you can use the `schema` pr
### Function Calling
The high-level API also provides a simple interface for function calling.
The high-level API also provides a simple interface for function calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format.
Note that the only model that supports full function calling at this time is "functionary".
The gguf-converted files for this model can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF)
The gguf-converted files for functionary can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF)
```python
>>> from llama_cpp import Llama
>>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", chat_format="functionary")
>>> # or
>>> llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling")
>>> llm.create_chat_completion(
messages = [
{

View file

@ -0,0 +1,910 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"name\": \"get_article_details\",\n",
" \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"title\": {\n",
" \"type\": \"str\"\n",
" },\n",
" \"authors\": {\n",
" \"type\": \"list[str]\"\n",
" },\n",
" \"short_summary\": {\n",
" \"type\": \"str\"\n",
" },\n",
" \"date_published\": {\n",
" \"type\": \"str\"\n",
" },\n",
" \"tags\": {\n",
" \"type\": \"list[str]\"\n",
" }\n",
" }\n",
" },\n",
" \"returns\": \"Article\"\n",
"}\n"
]
}
],
"source": [
"import json\n",
"import inspect\n",
"from typing import get_type_hints\n",
"\n",
"class Article:\n",
" pass\n",
"\n",
"class Weather:\n",
" pass\n",
"\n",
"class Directions:\n",
" pass\n",
"\n",
"def calculate_mortgage_payment(loan_amount: int, interest_rate: float, loan_term: int) -> float:\n",
" \"\"\"Get the monthly mortgage payment given an interest rate percentage.\"\"\"\n",
" \n",
" # TODO: you must implement this to actually call it later\n",
" pass\n",
"\n",
"def get_article_details(title: str, authors: list[str], short_summary: str, date_published: str, tags: list[str]) -> Article:\n",
" '''Get article details from unstructured article text.\n",
"date_published: formatted as \"MM/DD/YYYY\"'''\n",
" \n",
" # TODO: you must implement this to actually call it later\n",
" pass\n",
"\n",
"def get_weather(zip_code: str) -> Weather:\n",
" \"\"\"Get the current weather given a zip code.\"\"\"\n",
" \n",
" # TODO: you must implement this to actually call it later\n",
" pass\n",
"\n",
"def get_directions(start: str, destination: str) -> Directions:\n",
" \"\"\"Get directions from Google Directions API.\n",
"start: start address as a string including zipcode (if any)\n",
"destination: end address as a string including zipcode (if any)\"\"\"\n",
" \n",
" # TODO: you must implement this to actually call it later\n",
" pass\n",
"\n",
"def get_type_name(t):\n",
" name = str(t)\n",
" if \"list\" in name or \"dict\" in name:\n",
" return name\n",
" else:\n",
" return t.__name__\n",
"\n",
"def serialize_function_to_json(func):\n",
" signature = inspect.signature(func)\n",
" type_hints = get_type_hints(func)\n",
"\n",
" function_info = {\n",
" \"name\": func.__name__,\n",
" \"description\": func.__doc__,\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {}\n",
" },\n",
" \"returns\": type_hints.get('return', 'void').__name__\n",
" }\n",
"\n",
" for name, _ in signature.parameters.items():\n",
" param_type = get_type_name(type_hints.get(name, type(None)))\n",
" function_info[\"parameters\"][\"properties\"][name] = {\"type\": param_type}\n",
"\n",
" return json.dumps(function_info, indent=2)\n",
"\n",
"print(serialize_function_to_json(get_article_details))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import xml.etree.ElementTree as ET\n",
"import re\n",
"\n",
"def extract_function_calls(completion):\n",
" completion = completion.strip()\n",
" pattern = r\"(<multiplefunctions>(.*?)</multiplefunctions>)\"\n",
" match = re.search(pattern, completion, re.DOTALL)\n",
" if not match:\n",
" return None\n",
" \n",
" multiplefn = match.group(1)\n",
" root = ET.fromstring(multiplefn)\n",
" functions = root.findall(\"functioncall\")\n",
" return [json.loads(fn.text) for fn in functions]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def generate_hermes_prompt(prompt, functions):\n",
" functions = \"\\n\\n\".join([serialize_function_to_json(fn) for fn in functions])\n",
" prompt = f\"\"\"<|im_start|>system\n",
"You are a helpful assistant with access to the following functions:\n",
"\n",
"{functions}\n",
"\n",
"To use these functions respond with:\n",
"<multiplefunctions>\n",
" <functioncall> {{\"name\": \"function_name\", \"arguments\": {{\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}}}} </functioncall>\n",
" <functioncall> {{\"name\": \"function_name\", \"arguments\": {{\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}}}} </functioncall>\n",
" ...\n",
"</multiplefunctions>\n",
"\n",
"Edge cases you must handle:\n",
"- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
"<|im_start|>user\n",
"{prompt}<|im_end|>\n",
"<|im_start|>assistant\"\"\"\n",
" return prompt"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<|im_start|>system\n",
"You are a helpful assistant with access to the following functions:\n",
"\n",
"{\n",
" \"name\": \"get_weather\",\n",
" \"description\": \"Get the current weather given a zip code.\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"zip_code\": {\n",
" \"type\": \"str\"\n",
" }\n",
" }\n",
" },\n",
" \"returns\": \"Weather\"\n",
"}\n",
"\n",
"{\n",
" \"name\": \"calculate_mortgage_payment\",\n",
" \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"loan_amount\": {\n",
" \"type\": \"int\"\n",
" },\n",
" \"interest_rate\": {\n",
" \"type\": \"float\"\n",
" },\n",
" \"loan_term\": {\n",
" \"type\": \"int\"\n",
" }\n",
" }\n",
" },\n",
" \"returns\": \"float\"\n",
"}\n",
"\n",
"{\n",
" \"name\": \"get_article_details\",\n",
" \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"title\": {\n",
" \"type\": \"str\"\n",
" },\n",
" \"authors\": {\n",
" \"type\": \"list[str]\"\n",
" },\n",
" \"short_summary\": {\n",
" \"type\": \"str\"\n",
" },\n",
" \"date_published\": {\n",
" \"type\": \"str\"\n",
" },\n",
" \"tags\": {\n",
" \"type\": \"list[str]\"\n",
" }\n",
" }\n",
" },\n",
" \"returns\": \"Article\"\n",
"}\n",
"\n",
"To use these functions respond with:\n",
"<multiplefunctions>\n",
" <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
" <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
" ...\n",
"</multiplefunctions>\n",
"\n",
"Edge cases you must handle:\n",
"- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
"<|im_start|>user\n",
"What's the weather in 10001?<|im_end|>\n",
"<|im_start|>assistant\n",
"<|im_start|>system\n",
"You are a helpful assistant with access to the following functions:\n",
"\n",
"{\n",
" \"name\": \"get_weather\",\n",
" \"description\": \"Get the current weather given a zip code.\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"zip_code\": {\n",
" \"type\": \"str\"\n",
" }\n",
" }\n",
" },\n",
" \"returns\": \"Weather\"\n",
"}\n",
"\n",
"{\n",
" \"name\": \"calculate_mortgage_payment\",\n",
" \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"loan_amount\": {\n",
" \"type\": \"int\"\n",
" },\n",
" \"interest_rate\": {\n",
" \"type\": \"float\"\n",
" },\n",
" \"loan_term\": {\n",
" \"type\": \"int\"\n",
" }\n",
" }\n",
" },\n",
" \"returns\": \"float\"\n",
"}\n",
"\n",
"{\n",
" \"name\": \"get_article_details\",\n",
" \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"title\": {\n",
" \"type\": \"str\"\n",
" },\n",
" \"authors\": {\n",
" \"type\": \"list[str]\"\n",
" },\n",
" \"short_summary\": {\n",
" \"type\": \"str\"\n",
" },\n",
" \"date_published\": {\n",
" \"type\": \"str\"\n",
" },\n",
" \"tags\": {\n",
" \"type\": \"list[str]\"\n",
" }\n",
" }\n",
" },\n",
" \"returns\": \"Article\"\n",
"}\n",
"\n",
"To use these functions respond with:\n",
"<multiplefunctions>\n",
" <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
" <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
" ...\n",
"</multiplefunctions>\n",
"\n",
"Edge cases you must handle:\n",
"- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
"<|im_start|>user\n",
"Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.<|im_end|>\n",
"<|im_start|>assistant\n",
"<|im_start|>system\n",
"You are a helpful assistant with access to the following functions:\n",
"\n",
"{\n",
" \"name\": \"get_weather\",\n",
" \"description\": \"Get the current weather given a zip code.\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"zip_code\": {\n",
" \"type\": \"str\"\n",
" }\n",
" }\n",
" },\n",
" \"returns\": \"Weather\"\n",
"}\n",
"\n",
"{\n",
" \"name\": \"calculate_mortgage_payment\",\n",
" \"description\": \"Get the monthly mortgage payment given an interest rate percentage.\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"loan_amount\": {\n",
" \"type\": \"int\"\n",
" },\n",
" \"interest_rate\": {\n",
" \"type\": \"float\"\n",
" },\n",
" \"loan_term\": {\n",
" \"type\": \"int\"\n",
" }\n",
" }\n",
" },\n",
" \"returns\": \"float\"\n",
"}\n",
"\n",
"{\n",
" \"name\": \"get_article_details\",\n",
" \"description\": \"Get article details from unstructured article text.\\ndate_published: formatted as \\\"MM/DD/YYYY\\\"\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"title\": {\n",
" \"type\": \"str\"\n",
" },\n",
" \"authors\": {\n",
" \"type\": \"list[str]\"\n",
" },\n",
" \"short_summary\": {\n",
" \"type\": \"str\"\n",
" },\n",
" \"date_published\": {\n",
" \"type\": \"str\"\n",
" },\n",
" \"tags\": {\n",
" \"type\": \"list[str]\"\n",
" }\n",
" }\n",
" },\n",
" \"returns\": \"Article\"\n",
"}\n",
"\n",
"To use these functions respond with:\n",
"<multiplefunctions>\n",
" <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
" <functioncall> {\"name\": \"function_name\", \"arguments\": {\"arg_1\": \"value_1\", \"arg_2\": value_2, ...}} </functioncall>\n",
" ...\n",
"</multiplefunctions>\n",
"\n",
"Edge cases you must handle:\n",
"- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>\n",
"<|im_start|>user\n",
"What's the current exchange rate for USD to EUR?<|im_end|>\n",
"<|im_start|>assistant\n"
]
}
],
"source": [
"prompts = [\n",
" \"What's the weather in 10001?\",\n",
" \"Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.\",\n",
" \"What's the current exchange rate for USD to EUR?\"\n",
"]\n",
"functions = [get_weather, calculate_mortgage_payment, get_article_details]\n",
"\n",
"for prompt in prompts:\n",
" print(generate_hermes_prompt(prompt, functions))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"ggml_init_cublas: GGML_CUDA_FORCE_MMQ: no\n",
"ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes\n",
"ggml_init_cublas: found 1 CUDA devices:\n",
" Device 0: NVIDIA GeForce RTX 2060, compute capability 7.5\n",
"llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from ../../models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf (version GGUF V3 (latest))\n",
"llama_model_loader: - tensor 0: token_embd.weight q4_K [ 4096, 32002, 1, 1 ]\n",
"llama_model_loader: - tensor 1: blk.0.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 2: blk.0.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 3: blk.0.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 4: blk.0.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 5: blk.0.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 6: blk.0.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 7: blk.0.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 8: blk.0.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 9: blk.0.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 10: blk.1.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 11: blk.1.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 12: blk.1.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 13: blk.1.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 14: blk.1.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 15: blk.1.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 16: blk.1.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 17: blk.1.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 18: blk.1.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 19: blk.2.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 20: blk.2.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 21: blk.2.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 22: blk.2.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 23: blk.2.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 24: blk.2.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 25: blk.2.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 26: blk.2.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 27: blk.2.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 28: blk.3.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 29: blk.3.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 30: blk.3.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 31: blk.3.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 32: blk.3.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 33: blk.3.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 34: blk.3.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 35: blk.3.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 36: blk.3.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 37: blk.4.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 38: blk.4.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 39: blk.4.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 40: blk.4.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 41: blk.4.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 42: blk.4.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 43: blk.4.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 44: blk.4.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 45: blk.4.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 46: blk.5.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 47: blk.5.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 48: blk.5.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 49: blk.5.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 50: blk.5.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 51: blk.5.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 52: blk.5.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 53: blk.5.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 54: blk.5.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 55: blk.6.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 56: blk.6.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 57: blk.6.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 58: blk.6.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 59: blk.6.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 60: blk.6.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 61: blk.6.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 62: blk.6.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 63: blk.6.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 64: blk.7.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 65: blk.7.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 66: blk.7.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 67: blk.7.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 68: blk.7.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 69: blk.7.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 70: blk.7.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 71: blk.7.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 72: blk.7.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 73: blk.8.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 74: blk.8.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 75: blk.8.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 76: blk.8.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 77: blk.8.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 78: blk.8.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 79: blk.8.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 80: blk.8.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 81: blk.8.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 82: blk.9.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 83: blk.9.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 84: blk.9.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 85: blk.9.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 86: blk.9.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 87: blk.9.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 88: blk.9.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 89: blk.9.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 90: blk.9.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 91: blk.10.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 92: blk.10.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 93: blk.10.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 94: blk.10.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 95: blk.10.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 96: blk.10.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 97: blk.10.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 98: blk.10.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 99: blk.10.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 100: blk.11.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 101: blk.11.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 102: blk.11.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 103: blk.11.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 104: blk.11.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 105: blk.11.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 106: blk.11.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 107: blk.11.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 108: blk.11.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 109: blk.12.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 110: blk.12.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 111: blk.12.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 112: blk.12.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 113: blk.12.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 114: blk.12.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 115: blk.12.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 116: blk.12.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 117: blk.12.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 118: blk.13.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 119: blk.13.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 120: blk.13.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 121: blk.13.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 122: blk.13.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 123: blk.13.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 124: blk.13.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 125: blk.13.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 126: blk.13.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 127: blk.14.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 128: blk.14.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 129: blk.14.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 130: blk.14.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 131: blk.14.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 132: blk.14.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 133: blk.14.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 134: blk.14.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 135: blk.14.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 136: blk.15.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 137: blk.15.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 138: blk.15.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 139: blk.15.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 140: blk.15.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 141: blk.15.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 142: blk.15.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 143: blk.15.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 144: blk.15.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 145: blk.16.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 146: blk.16.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 147: blk.16.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 148: blk.16.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 149: blk.16.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 150: blk.16.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 151: blk.16.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 152: blk.16.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 153: blk.16.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 154: blk.17.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 155: blk.17.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 156: blk.17.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 157: blk.17.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 158: blk.17.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 159: blk.17.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 160: blk.17.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 161: blk.17.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 162: blk.17.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 163: blk.18.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 164: blk.18.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 165: blk.18.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 166: blk.18.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 167: blk.18.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 168: blk.18.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 169: blk.18.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 170: blk.18.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 171: blk.18.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 172: blk.19.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 173: blk.19.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 174: blk.19.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 175: blk.19.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 176: blk.19.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 177: blk.19.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 178: blk.19.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 179: blk.19.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 180: blk.19.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 181: blk.20.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 182: blk.20.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 183: blk.20.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 184: blk.20.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 185: blk.20.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 186: blk.20.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 187: blk.20.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 188: blk.20.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 189: blk.20.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 190: blk.21.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 191: blk.21.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 192: blk.21.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 193: blk.21.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 194: blk.21.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 195: blk.21.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 196: blk.21.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 197: blk.21.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 198: blk.21.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 199: blk.22.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 200: blk.22.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 201: blk.22.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 202: blk.22.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 203: blk.22.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 204: blk.22.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 205: blk.22.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 206: blk.22.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 207: blk.22.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 208: blk.23.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 209: blk.23.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 210: blk.23.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 211: blk.23.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 212: blk.23.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 213: blk.23.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 214: blk.23.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 215: blk.23.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 216: blk.23.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 217: blk.24.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 218: blk.24.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 219: blk.24.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 220: blk.24.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 221: blk.24.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 222: blk.24.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 223: blk.24.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 224: blk.24.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 225: blk.24.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 226: blk.25.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 227: blk.25.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 228: blk.25.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 229: blk.25.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 230: blk.25.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 231: blk.25.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 232: blk.25.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 233: blk.25.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 234: blk.25.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 235: blk.26.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 236: blk.26.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 237: blk.26.attn_v.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 238: blk.26.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 239: blk.26.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 240: blk.26.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 241: blk.26.ffn_down.weight q4_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 242: blk.26.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 243: blk.26.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 244: blk.27.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 245: blk.27.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 246: blk.27.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 247: blk.27.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 248: blk.27.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 249: blk.27.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 250: blk.27.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 251: blk.27.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 252: blk.27.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 253: blk.28.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 254: blk.28.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 255: blk.28.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 256: blk.28.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 257: blk.28.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 258: blk.28.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 259: blk.28.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 260: blk.28.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 261: blk.28.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 262: blk.29.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 263: blk.29.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 264: blk.29.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 265: blk.29.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 266: blk.29.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 267: blk.29.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 268: blk.29.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 269: blk.29.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 270: blk.29.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 271: blk.30.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 272: blk.30.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 273: blk.30.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 274: blk.30.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 275: blk.30.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 276: blk.30.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 277: blk.30.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 278: blk.30.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 279: blk.30.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 280: blk.31.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 281: blk.31.attn_k.weight q4_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 282: blk.31.attn_v.weight q6_K [ 4096, 1024, 1, 1 ]\n",
"llama_model_loader: - tensor 283: blk.31.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 284: blk.31.ffn_gate.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 285: blk.31.ffn_up.weight q4_K [ 4096, 14336, 1, 1 ]\n",
"llama_model_loader: - tensor 286: blk.31.ffn_down.weight q6_K [ 14336, 4096, 1, 1 ]\n",
"llama_model_loader: - tensor 287: blk.31.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 288: blk.31.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 289: output_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
"llama_model_loader: - tensor 290: output.weight q6_K [ 4096, 32002, 1, 1 ]\n",
"llama_model_loader: - kv 0: general.architecture str = llama\n",
"llama_model_loader: - kv 1: general.name str = teknium_openhermes-2.5-mistral-7b\n",
"llama_model_loader: - kv 2: llama.context_length u32 = 32768\n",
"llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n",
"llama_model_loader: - kv 4: llama.block_count u32 = 32\n",
"llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336\n",
"llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n",
"llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n",
"llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8\n",
"llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n",
"llama_model_loader: - kv 10: llama.rope.freq_base f32 = 10000.000000\n",
"llama_model_loader: - kv 11: general.file_type u32 = 15\n",
"llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n",
"llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32002] = [\"<unk>\", \"<s>\", \"</s>\", \"<0x00>\", \"<...\n",
"llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32002] = [0.000000, 0.000000, 0.000000, 0.0000...\n",
"llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32002] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n",
"llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n",
"llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 32000\n",
"llama_model_loader: - kv 18: tokenizer.ggml.padding_token_id u32 = 0\n",
"llama_model_loader: - kv 19: general.quantization_version u32 = 2\n",
"llama_model_loader: - type f32: 65 tensors\n",
"llama_model_loader: - type q4_K: 193 tensors\n",
"llama_model_loader: - type q6_K: 33 tensors\n",
"llm_load_vocab: special tokens definition check successful ( 261/32002 ).\n",
"llm_load_print_meta: format = GGUF V3 (latest)\n",
"llm_load_print_meta: arch = llama\n",
"llm_load_print_meta: vocab type = SPM\n",
"llm_load_print_meta: n_vocab = 32002\n",
"llm_load_print_meta: n_merges = 0\n",
"llm_load_print_meta: n_ctx_train = 32768\n",
"llm_load_print_meta: n_embd = 4096\n",
"llm_load_print_meta: n_head = 32\n",
"llm_load_print_meta: n_head_kv = 8\n",
"llm_load_print_meta: n_layer = 32\n",
"llm_load_print_meta: n_rot = 128\n",
"llm_load_print_meta: n_gqa = 4\n",
"llm_load_print_meta: f_norm_eps = 0.0e+00\n",
"llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n",
"llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
"llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
"llm_load_print_meta: n_ff = 14336\n",
"llm_load_print_meta: rope scaling = linear\n",
"llm_load_print_meta: freq_base_train = 10000.0\n",
"llm_load_print_meta: freq_scale_train = 1\n",
"llm_load_print_meta: n_yarn_orig_ctx = 32768\n",
"llm_load_print_meta: rope_finetuned = unknown\n",
"llm_load_print_meta: model type = 7B\n",
"llm_load_print_meta: model ftype = mostly Q4_K - Medium\n",
"llm_load_print_meta: model params = 7.24 B\n",
"llm_load_print_meta: model size = 4.07 GiB (4.83 BPW) \n",
"llm_load_print_meta: general.name = teknium_openhermes-2.5-mistral-7b\n",
"llm_load_print_meta: BOS token = 1 '<s>'\n",
"llm_load_print_meta: EOS token = 32000 '<|im_end|>'\n",
"llm_load_print_meta: UNK token = 0 '<unk>'\n",
"llm_load_print_meta: PAD token = 0 '<unk>'\n",
"llm_load_print_meta: LF token = 13 '<0x0A>'\n",
"llm_load_tensors: ggml ctx size = 0.11 MiB\n",
"llm_load_tensors: using CUDA for GPU acceleration\n",
"llm_load_tensors: mem required = 70.42 MiB\n",
"llm_load_tensors: offloading 32 repeating layers to GPU\n",
"llm_load_tensors: offloading non-repeating layers to GPU\n",
"llm_load_tensors: offloaded 35/35 layers to GPU\n",
"llm_load_tensors: VRAM used: 4095.06 MiB\n",
"...............................................................................................\n",
"llama_new_context_with_model: n_ctx = 2048\n",
"llama_new_context_with_model: freq_base = 10000.0\n",
"llama_new_context_with_model: freq_scale = 1\n",
"llama_kv_cache_init: offloading v cache to GPU\n",
"llama_kv_cache_init: offloading k cache to GPU\n",
"llama_kv_cache_init: VRAM kv self = 256.00 MiB\n",
"llama_new_context_with_model: kv self size = 256.00 MiB\n",
"llama_build_graph: non-view tensors processed: 740/740\n",
"llama_new_context_with_model: compute buffer total size = 159.07 MiB\n",
"llama_new_context_with_model: VRAM scratch buffer: 156.00 MiB\n",
"llama_new_context_with_model: total VRAM used: 4507.07 MiB (model: 4095.06 MiB, context: 412.00 MiB)\n"
]
}
],
"source": [
"import llama_cpp\n",
"\n",
"llama = llama_cpp.Llama(model_path=\"../../models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf\", n_gpu_layers=-1, n_ctx=2048, verbose=False)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{'name': 'get_weather', 'arguments': {'zip_code': '10001'}}]\n",
"====================================================================================================\n",
"[{'name': 'calculate_mortgage_payment', 'arguments': {'loan_amount': 200000, 'interest_rate': 0.04, 'loan_term': 30}}]\n",
"====================================================================================================\n",
"Unfortunately, I do not have a built-in function to check currency exchange rates. However, you can use third-party APIs or websites like Google Finance or XE to get this information.\n",
"====================================================================================================\n"
]
}
],
"source": [
"prompts = [\n",
" \"What's the weather in 10001?\",\n",
" \"Determine the monthly mortgage payment for a loan amount of $200,000, an interest rate of 4%, and a loan term of 30 years.\",\n",
" \"What's the current exchange rate for USD to EUR?\"\n",
"]\n",
"functions = [get_weather, calculate_mortgage_payment, get_article_details]\n",
"\n",
"for prompt in prompts:\n",
" prompt = generate_hermes_prompt(prompt, functions)\n",
" completion = llama.create_completion(prompt, max_tokens=-1)[\"choices\"][0][\"text\"]\n",
" function_calls = extract_function_calls(completion)\n",
" if function_calls:\n",
" print(function_calls)\n",
" else:\n",
" print(completion.strip())\n",
" print(\"=\"*100)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"get_weather\n",
"{'zip_code': '05751'}\n",
"====================================================================================================\n",
"get_weather\n",
"{'zip_code': '05751'}\n",
"get_weather\n",
"{'zip_code': '07030'}\n",
"calculate_mortgage_payment\n",
"{'loan_amount': 250000, 'interest_rate': 4.18, 'loan_term': 30}\n",
"====================================================================================================\n",
"I don't have a function to get exchange rates, but I can provide some resources where you can find this information. You can check websites like Google Finance, XE.com, or Yahoo Finance for up-to-date currency exchange rates.\n",
"====================================================================================================\n"
]
}
],
"source": [
"prompts = [\n",
" \"What's the weather in 05751?\",\n",
" \"I'm planning a trip to Killington, Vermont (05751) from Hoboken, NJ (07030). Can you get me weather for both locations and directions?\",\n",
" \"What's the current exchange rate for USD to EUR?\"\n",
"]\n",
"\n",
"for prompt in prompts:\n",
" completion = llama.create_completion(generate_hermes_prompt(prompt, functions), max_tokens=-1)[\"choices\"][0][\"text\"]\n",
" function_calls = extract_function_calls(completion)\n",
"\n",
" if function_calls:\n",
" for function in function_calls:\n",
" print(function[\"name\"])\n",
" print(function[\"arguments\"])\n",
" else:\n",
" print(completion.strip())\n",
"\n",
" print(\"=\"*100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5+"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *
__version__ = "0.2.38"
__version__ = "0.2.43"

View file

@ -42,6 +42,8 @@ class _LlamaModel:
self._llama_free_model = llama_cpp._lib.llama_free_model # type: ignore
self.model = None
if not os.path.exists(path_model):
raise ValueError(f"Model path does not exist: {path_model}")
@ -248,6 +250,7 @@ class _LlamaContext:
self.verbose = verbose
self._llama_free = llama_cpp._lib.llama_free # type: ignore
self.ctx = None
assert self.model.model is not None
@ -497,6 +500,7 @@ class _LlamaBatch:
self._llama_batch_free = llama_cpp._lib.llama_batch_free # type: ignore
self.batch = None
self.batch = llama_cpp.llama_batch_init(
self.n_tokens, self.embd, self.n_seq_max
)
@ -506,6 +510,14 @@ class _LlamaBatch:
self._llama_batch_free(self.batch)
self.batch = None
def n_tokens(self) -> int:
    """Return the token count stored on the underlying batch object."""
    # NOTE(review): the constructor appears to pass `self.n_tokens` to
    # llama_batch_init, so an instance attribute of the same name may shadow
    # this method -- confirm against __init__.
    current = self.batch
    assert current is not None
    return current.n_tokens
def reset(self):
    """Set the batch's token count back to zero.

    Only ``n_tokens`` is written; the per-token arrays are left untouched.
    """
    current = self.batch
    assert current is not None
    current.n_tokens = 0
def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool):
assert self.batch is not None
n_tokens = len(batch)
@ -518,6 +530,20 @@ class _LlamaBatch:
self.batch.logits[i] = logits_all
self.batch.logits[n_tokens - 1] = True
def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool):
    """Append one token sequence to the tokens already held in the batch.

    Args:
        batch: Token ids of the sequence to append.
        seq_id: Sequence id recorded for every appended token.
        logits_all: Value written to the ``logits`` flag of every appended
            token; the last appended token is always flagged ``True``.

    The appended tokens occupy slots ``[old n_tokens, old n_tokens + len(batch))``
    and their positions restart at 0 for this sequence.
    """
    assert self.batch is not None
    n_tokens = len(batch)
    n_tokens0 = self.batch.n_tokens
    if n_tokens == 0:
        # Nothing to append; also avoids writing to index -1 below.
        return
    self.batch.n_tokens += n_tokens
    for i, token in enumerate(batch):
        j = n_tokens0 + i
        self.batch.token[j] = token
        self.batch.pos[j] = i
        self.batch.seq_id[j][0] = seq_id
        self.batch.n_seq_id[j] = 1
        self.batch.logits[j] = logits_all
    # Flag the last token of *this* sequence. The index must be offset by the
    # tokens that were already in the batch, otherwise a slot belonging to an
    # earlier sequence is flagged instead.
    self.batch.logits[n_tokens0 + n_tokens - 1] = True
class _LlamaTokenDataArray:
def __init__(self, *, n_vocab: int):

View file

@ -19,6 +19,8 @@ from collections import deque
import ctypes
from llama_cpp.llama_types import List
from .llama_types import *
from .llama_grammar import LlamaGrammar
from .llama_cache import (
@ -27,6 +29,10 @@ from .llama_cache import (
LlamaDiskCache, # type: ignore
LlamaRAMCache, # type: ignore
)
from .llama_tokenizer import (
BaseLlamaTokenizer,
LlamaTokenizer
)
import llama_cpp.llama_cpp as llama_cpp
import llama_cpp.llama_chat_format as llama_chat_format
@ -44,6 +50,9 @@ from ._internals import (
_LlamaSamplingContext, # type: ignore
)
from ._logger import set_verbose
from ._utils import (
suppress_stdout_stderr
)
class Llama:
@ -95,6 +104,8 @@ class Llama:
chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
# Speculative Decoding
draft_model: Optional[LlamaDraftModel] = None,
# Tokenizer Override
tokenizer: Optional[BaseLlamaTokenizer] = None,
# Misc
verbose: bool = True,
# Extra Params
@ -159,6 +170,7 @@ class Llama:
chat_format: String specifying the chat format to use when calling create_chat_completion.
chat_handler: Optional chat handler to use when calling create_chat_completion.
draft_model: Optional draft model to use for speculative decoding.
tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp.
verbose: Print verbose output to stderr.
Raises:
@ -173,7 +185,8 @@ class Llama:
self.numa = numa
if not Llama.__backend_initialized:
llama_cpp.llama_backend_init(self.numa)
with suppress_stdout_stderr(disable=verbose):
llama_cpp.llama_backend_init(self.numa)
Llama.__backend_initialized = True
self.model_path = model_path
@ -235,6 +248,7 @@ class Llama:
self.n_threads_batch = n_threads_batch or max(
multiprocessing.cpu_count() // 2, 1
)
# Context Params
self.context_params = llama_cpp.llama_context_default_params()
self.context_params.seed = seed
@ -267,7 +281,7 @@ class Llama:
)
self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
self.context_params.mul_mat_q = mul_mat_q
self.context_params.logits_all = logits_all
self.context_params.logits_all = logits_all if draft_model is None else True # Must be set to True for speculative decoding
self.context_params.embedding = embedding
self.context_params.offload_kqv = offload_kqv
@ -286,6 +300,10 @@ class Llama:
self._model = _LlamaModel(
path_model=self.model_path, params=self.model_params, verbose=self.verbose
)
# Override tokenizer
self.tokenizer_ = tokenizer or LlamaTokenizer(self)
# Set the default value for the context and correct the batch
if n_ctx == 0:
n_ctx = self._model.n_ctx_train()
@ -431,18 +449,19 @@ class Llama:
Returns:
A list of tokens.
"""
return self._model.tokenize(text, add_bos, special)
return self.tokenizer_.tokenize(text, add_bos, special)
def detokenize(self, tokens: List[int]) -> bytes:
def detokenize(self, tokens: List[int], prev_tokens: Optional[List[int]] = None) -> bytes:
"""Detokenize a list of tokens.
Args:
tokens: The list of tokens to detokenize.
prev_tokens: The list of previous tokens. Offset mapping will be performed if provided
Returns:
The detokenized string.
"""
return self._model.detokenize(tokens)
return self.tokenizer_.detokenize(tokens, prev_tokens)
def set_cache(self, cache: Optional[BaseLlamaCache]):
"""Set the cache.
@ -538,7 +557,7 @@ class Llama:
logits[:] = (
logits_processor(self._input_ids, logits)
if idx is None
else logits_processor(self._input_ids[:idx], logits)
else logits_processor(self._input_ids[:idx + 1], logits)
)
sampling_params = _LlamaSamplingParams(
@ -698,10 +717,53 @@ class Llama:
Returns:
An embedding object.
"""
assert self._ctx.ctx is not None
assert self._model.model is not None
model_name: str = model if model is not None else self.model_path
# get numeric embeddings
embeds: List[List[float]]
total_tokens: int
embeds, total_tokens = self.embed(input, return_count=True) # type: ignore
# convert to CreateEmbeddingResponse
data: List[Embedding] = [
{
"object": "embedding",
"embedding": emb,
"index": idx,
}
for idx, emb in enumerate(embeds)
]
return {
"object": "list",
"data": data,
"model": model_name,
"usage": {
"prompt_tokens": total_tokens,
"total_tokens": total_tokens,
},
}
def embed(
self,
input: Union[str, List[str]],
normalize: bool = True,
truncate: bool = True,
return_count: bool = False,
):
"""Embed a string.
Args:
input: The utf-8 encoded string to embed.
Returns:
A list of embeddings
"""
assert self._ctx.ctx is not None
n_embd = self.n_embd()
n_ctx = self.n_ctx()
if self.context_params.embedding == False:
raise RuntimeError(
"Llama model must be created with embedding=True to call this method"
@ -715,48 +777,72 @@ class Llama:
else:
inputs = input
data: List[Embedding] = []
# reset batch
self._batch.reset()
# decode and fetch embeddings
data: List[List[float]] = []
def decode_batch(sizes: List[int]):
assert self._ctx.ctx is not None
llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
self._ctx.decode(self._batch)
self._batch.reset()
# store embeddings
for i, s in enumerate(sizes):
embedding = llama_cpp.llama_get_embeddings_ith(self._ctx.ctx, i)[
:n_embd
]
norm = np.linalg.norm(embedding) if normalize else s
embedding: List[float] = [v / float(norm) for v in embedding]
data.append(embedding)
# init state
total_tokens = 0
for index, input in enumerate(inputs):
tokens = self.tokenize(input.encode("utf-8"), special=True)
self.reset()
self.eval(tokens)
t_batch = 0
s_sizes: List[int] = []
# accumulate batches and encode
for text in inputs:
tokens = self.tokenize(text.encode("utf-8"))
if truncate:
tokens = tokens[:n_ctx]
n_tokens = len(tokens)
total_tokens += n_tokens
embedding = llama_cpp.llama_get_embeddings(self._ctx.ctx)[
: llama_cpp.llama_n_embd(self._model.model)
]
data.append(
{
"object": "embedding",
"embedding": embedding,
"index": index,
}
)
# check for overrun
if n_tokens > n_ctx:
raise ValueError(
f"Requested tokens ({n_tokens}) exceed context window of {n_ctx}"
)
# time to eval batch
if t_batch + n_tokens > self._n_ctx:
decode_batch(s_sizes)
t_batch = 0
s_sizes = []
# add to batch
self._batch.add_sequence(tokens, len(s_sizes), False)
t_batch += n_tokens
s_sizes.append(n_tokens)
# handle last batch
decode_batch(s_sizes)
if self.verbose:
llama_cpp.llama_print_timings(self._ctx.ctx)
return {
"object": "list",
"data": data,
"model": model_name,
"usage": {
"prompt_tokens": total_tokens,
"total_tokens": total_tokens,
},
}
output = data[0] if isinstance(input, str) else data
def embed(self, input: str) -> List[float]:
"""Embed a string.
llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
self.reset()
Args:
input: The utf-8 encoded string to embed.
Returns:
A list of embeddings
"""
return list(map(float, self.create_embedding(input)["data"][0]["embedding"]))
if return_count:
return output, total_tokens
else:
return output
def _create_completion(
self,
@ -1552,6 +1638,38 @@ class Llama:
logit_bias=logit_bias,
)
def create_chat_completion_openai_v1(
    self,
    *args: Any,
    **kwargs: Any,
):
    """Generate a chat completion with return type based on the OpenAI v1 API.

    OpenAI python package is required to use this method.

    You can install it with `pip install openai`.

    Args:
        *args: Positional arguments to pass to create_chat_completion.
        **kwargs: Keyword arguments to pass to create_chat_completion.

    Returns:
        Generated chat completion or a stream of chat completion chunks.

    Raises:
        ImportError: If `openai.types.chat` cannot be imported.
    """
    # Guard only the import itself, so that an ImportError raised while
    # running create_chat_completion is not misreported as a missing
    # openai package.
    try:
        from openai.types.chat import ChatCompletion, ChatCompletionChunk
    except ImportError as err:
        raise ImportError(
            "To use create_chat_completion_openai_v1, you must install the openai package. "
            "You can install it with `pip install openai`."
        ) from err

    stream = kwargs.get("stream", False)  # type: ignore
    assert isinstance(stream, bool)
    if stream:
        # The inner call runs immediately; chunks are wrapped lazily.
        return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs))  # type: ignore
    return ChatCompletion(**self.create_chat_completion(*args, **kwargs))  # type: ignore
def __getstate__(self):
return dict(
model_path=self.model_path,
@ -1693,8 +1811,8 @@ class Llama:
"""Return the vocabulary size."""
return self._model.n_vocab()
def tokenizer(self) -> "LlamaTokenizer":
"""Return the tokenizer for this model."""
def tokenizer(self) -> LlamaTokenizer:
"""Return the llama tokenizer for this model."""
return LlamaTokenizer(self)
def token_eos(self) -> int:
@ -1738,21 +1856,6 @@ class Llama:
return longest_prefix
class LlamaTokenizer:
    """String-level wrapper over the `tokenize`/`detokenize` methods of a Llama object."""

    def __init__(self, llama: Llama):
        # Keep a reference to the object whose tokenize/detokenize are used.
        self.llama = llama

    def encode(self, text: str, add_bos: bool = True) -> List[int]:
        """Encode `text` as UTF-8 (unencodable characters dropped) and tokenize it with special=True."""
        raw = text.encode("utf-8", errors="ignore")
        return self.llama.tokenize(raw, add_bos=add_bos, special=True)

    def decode(self, tokens: List[int]) -> str:
        """Detokenize `tokens` and decode the bytes as UTF-8, dropping undecodable bytes."""
        raw = self.llama.detokenize(tokens)
        return raw.decode("utf-8", errors="ignore")

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        """Build a tokenizer from a Llama created with vocab_only=True for `path`."""
        vocab_model = Llama(model_path=path, vocab_only=True)
        return cls(vocab_model)
class LlamaState:

File diff suppressed because it is too large Load diff

View file

@ -139,9 +139,11 @@ llama_seq_id = c_int32
# enum llama_vocab_type {
# LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
# LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
# LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
# };
LLAMA_VOCAB_TYPE_SPM = 0
LLAMA_VOCAB_TYPE_BPE = 1
LLAMA_VOCAB_TYPE_WPM = 2
# enum llama_token_type {
@ -468,6 +470,7 @@ class llama_model_params(Structure):
# bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
# bool embedding; // embedding mode only
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
# bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
# };
class llama_context_params(Structure):
"""Parameters for llama_context
@ -494,6 +497,7 @@ class llama_context_params(Structure):
logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
embedding (bool): embedding mode only
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
"""
_fields_ = [
@ -518,6 +522,7 @@ class llama_context_params(Structure):
("logits_all", c_bool),
("embedding", c_bool),
("offload_kqv", c_bool),
("do_pooling", c_bool),
]
@ -1697,6 +1702,21 @@ _lib.llama_get_embeddings.argtypes = [llama_context_p]
_lib.llama_get_embeddings.restype = c_float_p
# // Get the embeddings for the ith sequence
# // llama_get_embeddings(ctx) + i*n_embd
# LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
def llama_get_embeddings_ith(
    ctx: llama_context_p, i: Union[c_int32, int]
):  # type: (...) -> Array[float] # type: ignore
    """Return the embeddings for the ``i``-th sequence.

    Per the C header comment this is the pointer
    ``llama_get_embeddings(ctx) + i*n_embd``.
    """
    embeddings = _lib.llama_get_embeddings_ith(ctx, i)
    return embeddings


# ctypes signature: float * llama_get_embeddings_ith(struct llama_context *, int32_t)
_lib.llama_get_embeddings_ith.argtypes = [llama_context_p, c_int32]
_lib.llama_get_embeddings_ith.restype = c_float_p
# //
# // Vocab
# //

View file

@ -1471,12 +1471,15 @@ class SchemaConverter:
if schema_type == "object" and "properties" in schema:
# TODO: `required` keyword
prop_order = self._prop_order
prop_pairs = sorted(
schema["properties"].items(),
# sort by position in prop_order (if specified) then by key
key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
)
if self._prop_order:
prop_order = self._prop_order
prop_pairs = sorted(
schema["properties"].items(),
# sort by position in prop_order (if specified) then by key
key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
)
else:
prop_pairs = schema["properties"].items()
rule = '"{" space'
for i, (prop_name, prop_schema) in enumerate(prop_pairs):

View file

@ -0,0 +1,95 @@
from __future__ import annotations
import abc
from typing import (
List,
Optional,
Any,
)
import llama_cpp
from llama_cpp.llama_types import List
class BaseLlamaTokenizer(abc.ABC):
    """Abstract byte-level tokenizer interface.

    Subclasses must implement both :meth:`tokenize` and :meth:`detokenize`;
    the class cannot be instantiated until they do.
    """

    @abc.abstractmethod
    def tokenize(
        self,
        text: bytes,
        add_bos: bool = True,
        special: bool = True,
    ) -> List[int]:
        """Convert ``text`` (raw bytes) into a list of token ids.

        ``add_bos`` and ``special`` are tokenizer options whose exact
        handling is left to the concrete implementation.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def detokenize(
        self,
        tokens: List[int],
        prev_tokens: Optional[List[int]] = None,
    ) -> bytes:
        """Convert ``tokens`` back into raw bytes.

        ``prev_tokens`` is optional context; how it is used is left to the
        concrete implementation.
        """
        raise NotImplementedError
class LlamaTokenizer(BaseLlamaTokenizer):
    """Tokenizer that delegates to the internal model object of a ``Llama`` instance."""

    def __init__(self, llama: llama_cpp.Llama):
        # Only the underlying model object is kept, not the Llama wrapper itself.
        self._model = llama._model  # type: ignore

    def tokenize(
        self,
        text: bytes,
        add_bos: bool = True,
        special: bool = True,
    ) -> List[int]:
        """Tokenize raw bytes by forwarding all options to the model."""
        return self._model.tokenize(text, add_bos=add_bos, special=special)

    def detokenize(
        self,
        tokens: List[int],
        prev_tokens: Optional[List[int]] = None,
    ) -> bytes:
        """Detokenize ``tokens`` into raw bytes.

        When ``prev_tokens`` is given, the first ``len(prev_tokens)`` entries
        of ``tokens`` are skipped and only the remainder is detokenized.
        """
        if prev_tokens is None:
            return self._model.detokenize(tokens)
        return self._model.detokenize(tokens[len(prev_tokens) :])

    def encode(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = True,
    ) -> List[int]:
        """Tokenize a ``str``; characters that cannot be UTF-8 encoded are dropped."""
        raw = text.encode("utf-8", errors="ignore")
        return self.tokenize(raw, add_bos=add_bos, special=special)

    def decode(self, tokens: List[int]) -> str:
        """Detokenize to a ``str``; bytes that are not valid UTF-8 are dropped."""
        raw = self.detokenize(tokens)
        return raw.decode("utf-8", errors="ignore")

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        """Build a tokenizer from the model file at ``path`` (loaded with ``vocab_only=True``)."""
        model = llama_cpp.Llama(model_path=path, vocab_only=True)
        return cls(model)
class LlamaHFTokenizer(BaseLlamaTokenizer):
    """Tokenizer backed by a Hugging Face tokenizer object.

    The wrapped object is only required to provide
    ``encode(text, add_special_tokens=...)`` and ``decode(tokens)``; those
    are the only calls made on it here.
    """

    def __init__(self, hf_tokenizer: Any):
        self.hf_tokenizer = hf_tokenizer

    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        """Tokenize raw bytes with the wrapped tokenizer.

        Args:
            text: Input bytes; decoded as UTF-8 with invalid sequences dropped.
            add_bos: Accepted for interface compatibility but not forwarded
                to the wrapped tokenizer.
            special: Passed through as ``add_special_tokens``.

        Returns:
            Whatever the wrapped tokenizer's ``encode`` returns.
        """
        return self.hf_tokenizer.encode(
            text.decode("utf-8", errors="ignore"), add_special_tokens=special
        )

    def detokenize(
        self, tokens: List[int], prev_tokens: Optional[List[int]] = None
    ) -> bytes:
        """Decode ``tokens`` to UTF-8 bytes.

        When ``prev_tokens`` is given, both sequences are decoded in full and
        the leading ``len(decoded prev_tokens)`` bytes are cut from the decoded
        ``tokens``, so only the trailing part is returned.
        """
        text = self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore")
        if prev_tokens is None:
            return text
        prev_text = self.hf_tokenizer.decode(prev_tokens).encode(
            "utf-8", errors="ignore"
        )
        return text[len(prev_text) :]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer":
        """Create a tokenizer via ``transformers.AutoTokenizer.from_pretrained``.

        Raises:
            ImportError: If the optional ``transformers`` package is missing.
        """
        # Imported lazily so `transformers` stays an optional dependency.
        try:
            from transformers import AutoTokenizer
        except ImportError as exc:
            # Adjacent literals are concatenated verbatim, so the separating
            # space must be part of the first literal.
            raise ImportError(
                "The `transformers` library is required to use the `HFTokenizer`. "
                "You can install it with `pip install transformers`."
            ) from exc

        hf_tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path
        )
        return cls(hf_tokenizer)

View file

@ -97,7 +97,7 @@ class CreateChatCompletionResponse(TypedDict):
class ChatCompletionMessageToolCallChunkFunction(TypedDict):
name: str
name: Optional[str]
arguments: str
@ -118,12 +118,12 @@ class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict):
class ChatCompletionStreamResponseDelta(TypedDict):
content: NotRequired[str]
content: NotRequired[Optional[str]]
function_call: NotRequired[
ChatCompletionStreamResponseDeltaFunctionCall
Optional[ChatCompletionStreamResponseDeltaFunctionCall]
] # DEPRECATED
tool_calls: NotRequired[List[ChatCompletionMessageToolCallChunk]]
role: NotRequired[Literal["system", "user", "assistant", "tool"]]
tool_calls: NotRequired[Optional[List[ChatCompletionMessageToolCallChunk]]]
role: NotRequired[Optional[Literal["system", "user", "assistant", "tool"]]]
class ChatCompletionStreamResponseChoice(TypedDict):
@ -132,6 +132,7 @@ class ChatCompletionStreamResponseChoice(TypedDict):
ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty
]
finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]]
logprobs: NotRequired[Optional[CompletionLogprobs]]
class CreateChatCompletionStreamResponse(TypedDict):

View file

@ -146,30 +146,8 @@ _libllava.llava_eval_image_embed.restype = c_bool
################################################
# struct clip_vision_hparams {
# int32_t image_size;
# int32_t patch_size;
# int32_t hidden_size;
# int32_t n_intermediate;
# int32_t projection_dim;
# int32_t n_head;
# int32_t n_layer;
# float eps;
# };
class clip_vision_hparams(Structure):
    """ctypes mirror of the C ``struct clip_vision_hparams``.

    Field order follows the C declaration: seven ``int32_t`` members
    followed by a single ``float``.
    """

    _fields_ = [
        (field_name, c_int32)
        for field_name in (
            "image_size",
            "patch_size",
            "hidden_size",
            "n_intermediate",
            "projection_dim",
            "n_head",
            "n_layer",
        )
    ] + [("eps", c_float)]
# /** load mmproj model */
# CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
# CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
def clip_model_load(fname: bytes, verbosity: Union[c_int, int]) -> clip_ctx_p:
    """Load an mmproj model from ``fname`` via the llava shared library.

    Both arguments are forwarded unchanged to the C function
    ``clip_model_load`` and its result is returned as-is.
    """
    ctx = _libllava.clip_model_load(fname, verbosity)
    return ctx
@ -183,50 +161,3 @@ def clip_free(ctx: clip_ctx_p):
_libllava.clip_free.argtypes = [clip_ctx_p]
_libllava.clip_free.restype = None
# size_t clip_embd_nbytes(const struct clip_ctx * ctx);
# int clip_n_patches(const struct clip_ctx * ctx);
# int clip_n_mmproj_embd(const struct clip_ctx * ctx);
# // RGB uint8 image
# struct clip_image_u8 {
# int nx;
# int ny;
# uint8_t * data = NULL;
# size_t size;
# };
# // RGB float32 image (NHWC)
# // Memory layout: RGBRGBRGB...
# struct clip_image_f32 {
# int nx;
# int ny;
# float * data = NULL;
# size_t size;
# };
# struct clip_image_u8_batch {
# struct clip_image_u8 * data;
# size_t size;
# };
# struct clip_image_f32_batch {
# struct clip_image_f32 * data;
# size_t size;
# };
# struct clip_image_u8 * make_clip_image_u8();
# struct clip_image_f32 * make_clip_image_f32();
# CLIP_API void clip_image_u8_free(clip_image_u8 * img);
# CLIP_API void clip_image_f32_free(clip_image_f32 * img);
# CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
# /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
# CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
# bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
# bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
# bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
# float * vec);
# bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);

View file

@ -6,6 +6,7 @@ from typing import Dict, Optional, Union, List
import llama_cpp
import llama_cpp.llama_speculative as llama_speculative
import llama_cpp.llama_tokenizer as llama_tokenizer
from llama_cpp.server.settings import ModelSettings
@ -93,6 +94,10 @@ class LlamaProxy:
)
)
tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None
if settings.hf_pretrained_model_name_or_path is not None:
tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained(settings.hf_pretrained_model_name_or_path)
draft_model = None
if settings.draft_model is not None:
draft_model = llama_speculative.LlamaPromptLookupDecoding(
@ -156,6 +161,8 @@ class LlamaProxy:
chat_handler=chat_handler,
# Speculative Decoding
draft_model=draft_model,
# Tokenizer
tokenizer=tokenizer,
# Misc
verbose=settings.verbose,
)

2
vendor/llama.cpp vendored

@ -1 +1 @@
Subproject commit 098f6d737b65134cf220d12b9b706e8cfc5e4610
Subproject commit 8084d554406b767d36b3250b3b787462d5dd626f