From 35918873b4010a230a9aa478fd16f35127d7eb9a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 26 Jan 2024 11:45:48 -0500 Subject: [PATCH 01/25] Update llama.cpp --- llama_cpp/llama_cpp.py | 32 +++++++++++++++++++++++++++++--- vendor/llama.cpp | 2 +- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index d31a5da..c4256dd 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -93,9 +93,7 @@ c_size_t_p = POINTER(c_size_t) # from ggml-backend.h # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); -ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE( - c_bool, c_void_p, c_bool, c_void_p -) +ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE(c_bool, c_void_p, c_bool, c_void_p) # llama.h bindings @@ -2174,6 +2172,34 @@ _lib.llama_sample_typical.argtypes = [ _lib.llama_sample_typical.restype = None +# /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. +# LLAMA_API void llama_sample_entropy( +# struct llama_context * ctx, +# llama_token_data_array * candidates_p, +# float min_temp, +# float max_temp, +# float exponent_val); +def llama_sample_entropy( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + min_temp: Union[c_float, float], + max_temp: Union[c_float, float], + exponent_val: Union[c_float, float], +): + """Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.""" + return _lib.llama_sample_entropy(ctx, candidates, min_temp, max_temp, exponent_val) + + +_lib.llama_sample_entropy.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_float, + c_float, +] +_lib.llama_sample_entropy.restype = None + + # LLAMA_API void llama_sample_temp( # struct llama_context * ctx, # llama_token_data_array * candidates, diff --git a/vendor/llama.cpp b/vendor/llama.cpp index faa3526..5f1925a 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit faa3526a1eba458120987ed8269e5616385a76f4 +Subproject commit 5f1925a8cef81eb9b372faaae34b0dd76d5361d4 From c6d3bd62e8db86d7c9234d763f7b7f94e6aa8cc0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 27 Jan 2024 16:22:46 -0500 Subject: [PATCH 02/25] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 5f1925a..6db2b41 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 5f1925a8cef81eb9b372faaae34b0dd76d5361d4 +Subproject commit 6db2b41a76ee78d5efdd5c3cddd5d7ad3f646855 From d8f6914f459c1a8536ace5b379492a482fa0db16 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 27 Jan 2024 16:52:18 -0500 Subject: [PATCH 03/25] Add json schema mode (#1122) * Add json schema mode * Add llava chat format support --- llama_cpp/llama_chat_format.py | 21 ++++++++++++++++----- llama_cpp/llama_types.py | 1 + 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 6c274aa..e418d40 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -318,7 +318,14 @@ def chat_formatter_to_chat_completion_handler( stop = stop + rstop if response_format is not None and response_format["type"] == "json_object": - grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF) + try: + # create grammar from json schema + if "schema" in response_format: + grammar = 
llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(response_format["schema"]) + ) + except Exception as e: + grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF) completion_or_chunks = llama.create_completion( prompt=prompt, @@ -1434,10 +1441,14 @@ class Llava15ChatHandler: prompt = llama.input_ids[: llama.n_tokens].tolist() if response_format is not None and response_format["type"] == "json_object": - with suppress_stdout_stderr(disable=self.verbose): - grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF - ) + try: + # create grammar from json schema + if "schema" in response_format: + grammar = llama_grammar.LlamaGrammar.from_json_schema( + json.dumps(response_format["schema"]) + ) + except Exception as e: + grammar = llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF) return _convert_completion_to_chat( llama.create_completion( diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 5b51e98..c3deba8 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -154,6 +154,7 @@ class ChatCompletionFunctionCallOption(TypedDict): class ChatCompletionRequestResponseFormat(TypedDict): type: Literal["text", "json_object"] + schema: NotRequired[JsonType] # https://docs.endpoints.anyscale.com/guides/json_mode/ class ChatCompletionRequestMessageContentPartText(TypedDict): From c1d0fff8a990b1e3ebede2f2abf0f6350b2c30a3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 27 Jan 2024 18:36:56 -0500 Subject: [PATCH 04/25] Bump version --- CHANGELOG.md | 5 +++++ llama_cpp/__init__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a94ef5..ddf1d0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.34] + +- feat: Update llama.cpp to ggerganov/llama.cpp@6db2b41a76ee78d5efdd5c3cddd5d7ad3f646855 +- feat: Add json schema mode by @abetlen in #1122 + ## [0.2.33] - feat: Update llama.cpp to ggerganov/llama.cpp@faa3526a1eba458120987ed8269e5616385a76f4 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 55f695e..36eacab 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.33" \ No newline at end of file +__version__ = "0.2.34" \ No newline at end of file From 399fa1e03b3b966105669cb2b698ef7e440051f9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 27 Jan 2024 19:36:33 -0500 Subject: [PATCH 05/25] docs: Add JSON and JSON schema mode examples to README --- README.md | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/README.md b/README.md index 7813c96..f5cd50e 100644 --- a/README.md +++ b/README.md @@ -216,6 +216,59 @@ Note that `chat_format` option must be set for the particular model you are usin Chat completion is available through the [`create_chat_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion) method of the [`Llama`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama) class. +### JSON and JSON Schema Mode + +If you want to constrain chat responses to only valid JSON or a specific JSON Schema you can use the `response_format` argument to the `create_chat_completion` method. + +#### Json Mode + +The following example will constrain the response to be valid JSON. 
+ +```python +>>> from llama_cpp import Llama +>>> llm = Llama(model_path="path/to/model.gguf", chat_format="chatml") +>>> llm.create_chat_completion( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant that outputs in JSON.", + }, + {"role": "user", "content": "Who won the world series in 2020"}, + ], + response_format={ + "type": "json_object", + }, + temperature=0.7, +) +``` + +#### Json Mode + +To constrain the response to a specific JSON Schema, you can use the `schema` property of the `response_format` argument. + +```python +>>> from llama_cpp import Llama +>>> llm = Llama(model_path="path/to/model.gguf", chat_format="chatml") +>>> llm.create_chat_completion( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant that outputs in JSON.", + }, + {"role": "user", "content": "Who won the world series in 2020"}, + ], + response_format={ + "type": "json_object", + "schema": { + "type": "object", + "properties": {"team_name": {"type": "string"}}, + "required": ["team_name"], + }, + }, + temperature=0.7, +) +``` + ### Function Calling The high-level API also provides a simple interface for function calling. From 8c592100623ec08583c34190a15d4c1d07945b25 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 27 Jan 2024 19:37:59 -0500 Subject: [PATCH 06/25] docs: Fix typo --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f5cd50e..faf4a87 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ Chat completion is available through the [`create_chat_completion`](https://llam If you want to constrain chat responses to only valid JSON or a specific JSON Schema you can use the `response_format` argument to the `create_chat_completion` method. -#### Json Mode +#### JSON Mode The following example will constrain the response to be valid JSON. @@ -242,7 +242,7 @@ The following example will constrain the response to be valid JSON. ) ``` -#### Json Mode +#### JSON Schema Mode To constrain the response to a specific JSON Schema, you can use the `schema` property of the `response_format` argument. 
From ccf4908bfd8a3cefa0f554a11a3d16fe0d979980 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 28 Jan 2024 12:55:32 -0500 Subject: [PATCH 07/25] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 6db2b41..35dec26 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 6db2b41a76ee78d5efdd5c3cddd5d7ad3f646855 +Subproject commit 35dec26cc25a9ff7d8c3ed52326b94f772b911ce From 31e0288a410b054a0b40aaa2b54b635d9a261f8e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 28 Jan 2024 19:34:27 -0500 Subject: [PATCH 08/25] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 35dec26..d2f650c 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 35dec26cc25a9ff7d8c3ed52326b94f772b911ce +Subproject commit d2f650cb5b04ee2726663e79b47da5efe196ce00 From 52c4a84faf07d92565e0c4f77ba884a48fc24b52 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 28 Jan 2024 19:35:37 -0500 Subject: [PATCH 09/25] Bump version --- CHANGELOG.md | 4 ++++ llama_cpp/__init__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ddf1d0d..20671f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.35] + +- feat: Update llama.cpp to ggerganov/llama.cpp@d2f650cb5b04ee2726663e79b47da5efe196ce00 + ## [0.2.34] - feat: Update llama.cpp to ggerganov/llama.cpp@6db2b41a76ee78d5efdd5c3cddd5d7ad3f646855 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 36eacab..639a5a5 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.34" \ No newline at end of file +__version__ = "0.2.35" \ No newline at end of file From ce38dbdf0730aa2fb3a659e0a5bda2e8ad07da17 Mon Sep 17 00:00:00 2001 From: Rafaelblsilva Date: Mon, 29 Jan 2024 02:34:42 -0300 Subject: [PATCH 10/25] Add mistral instruct chat format as "mistral-instruct" (#799) * Added mistral instruct chat format as "mistral" * Fix stop sequence (merge issue) * Update chat format name to `mistral-instruct` --------- Co-authored-by: Andrei --- llama_cpp/llama_chat_format.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index e418d40..989275a 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -877,6 +877,22 @@ def format_chatml( return ChatFormatterResponse(prompt=_prompt, stop=_sep) +@register_chat_format("mistral-instruct") +def format_mistral( + messages: List[llama_types.ChatCompletionRequestMessage], + **kwargs: Any, +) -> ChatFormatterResponse: + _roles = dict(user="[INST] ", assistant="[/INST]") + _sep = " " + system_template = """{system_message}""" + system_message = _get_system_message(messages) + system_message = system_template.format(system_message=system_message) + _messages = _map_roles(messages, _roles) + _messages.append((_roles["assistant"], None)) + _prompt = _format_no_colon_single(system_message, _messages, _sep) + return ChatFormatterResponse(prompt=_prompt) + + @register_chat_format("chatglm3") def format_chatglm3( messages: List[llama_types.ChatCompletionRequestMessage], From 9ae5819ee4a3d892c70e6a2f52de76f1882edcc0 Mon Sep 17 00:00:00 2001 
From: Andrei Betlen Date: Mon, 29 Jan 2024 00:59:01 -0500 Subject: [PATCH 11/25] Add chat format test. --- llama_cpp/llama_chat_format.py | 22 ++++++++++++---------- tests/test_llama_chat_format.py | 23 +++++++++++++++++++++++ 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 989275a..5466de3 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -878,19 +878,21 @@ def format_chatml( @register_chat_format("mistral-instruct") -def format_mistral( +def format_mistral_instruct( messages: List[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: - _roles = dict(user="[INST] ", assistant="[/INST]") - _sep = " " - system_template = """{system_message}""" - system_message = _get_system_message(messages) - system_message = system_template.format(system_message=system_message) - _messages = _map_roles(messages, _roles) - _messages.append((_roles["assistant"], None)) - _prompt = _format_no_colon_single(system_message, _messages, _sep) - return ChatFormatterResponse(prompt=_prompt) + bos = "<s>" + eos = "</s>" + stop = eos + prompt = bos + for message in messages: + if message["role"] == "user" and message["content"] is not None and isinstance(message["content"], str): + prompt += "[INST] " + message["content"] + elif message["role"] == "assistant" and message["content"] is not None and isinstance(message["content"], str): + prompt += " [/INST]" + message["content"] + eos + prompt += " [/INST]" + return ChatFormatterResponse(prompt=prompt, stop=stop) @register_chat_format("chatglm3") diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py index 1ef18d9..c10aee4 100644 --- a/tests/test_llama_chat_format.py +++ b/tests/test_llama_chat_format.py @@ -1,10 +1,33 @@ import json +import jinja2 + from llama_cpp import ( ChatCompletionRequestUserMessage, ) +import llama_cpp.llama_types as llama_types +import llama_cpp.llama_chat_format as llama_chat_format + from llama_cpp.llama_chat_format import hf_tokenizer_config_to_chat_formatter +def test_mistral_instruct(): + chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + chat_formatter = jinja2.Template(chat_template) + messages = [ + llama_types.ChatCompletionRequestUserMessage(role="user", content="Instruction"), + llama_types.ChatCompletionRequestAssistantMessage(role="assistant", content="Model answer"), + llama_types.ChatCompletionRequestUserMessage(role="user", content="Follow-up instruction"), + ] + response = llama_chat_format.format_mistral_instruct( + messages=messages, + ) + reference = chat_formatter.render( + messages=messages, + bos_token="<s>", + eos_token="</s>", + ) + assert response.prompt == reference + mistral_7b_tokenizer_config = """{ "add_bos_token": true, From 85f8c4c06e33d5ea7df35e662412d36441a65456 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 29 Jan 2024 10:39:08 -0500 Subject: [PATCH 12/25] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 
d2f650c..2aed77e 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit d2f650cb5b04ee2726663e79b47da5efe196ce00 +Subproject commit 2aed77eb06a329f0d82bb1c467f4244904d4073f From 9f7852acfafee8aee9dd19baf3528ede7b45b881 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 29 Jan 2024 10:39:23 -0500 Subject: [PATCH 13/25] misc: Add vulkan target --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index 5ed3fa2..806b120 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,9 @@ build.blis: build.metal: CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install --verbose -e . +build.vulkan: + CMAKE_ARGS="-DLLAMA_VULKAN=on" python3 -m pip install --verbose -e . + build.sdist: python3 -m build --sdist From 464af5b39fea3cf1ba16e755a9df85f09bbb25ac Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 29 Jan 2024 10:46:04 -0500 Subject: [PATCH 14/25] Bump version --- CHANGELOG.md | 5 +++++ llama_cpp/__init__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 20671f8..e99dd17 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.36] + +- feat: Update llama.cpp to ggerganov/llama.cpp@2aed77eb06a329f0d82bb1c467f4244904d4073f +- feat: Add mistral instruct chat format as "mistral-instruct" by @Rafaelblsilva in #799 + ## [0.2.35] - feat: Update llama.cpp to ggerganov/llama.cpp@d2f650cb5b04ee2726663e79b47da5efe196ce00 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 639a5a5..f73f3d4 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.35" \ No newline at end of file +__version__ = "0.2.36" \ No newline at end of file From 843e77e3e2310b1cfdc6b41982aec9007312e142 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 29 Jan 2024 11:01:26 -0500 Subject: [PATCH 15/25] docs: Add Vulkan build instructions --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index faf4a87..05326ac 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,14 @@ To install with hipBLAS / ROCm support for AMD cards, set the `LLAMA_HIPBLAS=on` CMAKE_ARGS="-DLLAMA_HIPBLAS=on" pip install llama-cpp-python ``` +#### Vulkan + +To install with Vulkan support, set the `LLAMA_VULKAN=on` environment variable before installing: + +```bash +CMAKE_ARGS="-DLLAMA_VULKAN=on" pip install llama-cpp-python +``` + ### Windows Notes If you run into issues where it complains it can't find `'nmake'` `'?'` or CMAKE_C_COMPILER, you can extract w64devkit as [mentioned in llama.cpp repo](https://github.com/ggerganov/llama.cpp#openblas) and add those manually to CMAKE_ARGS before running `pip` install: From 059f6b3ac8bcc95389d487fecbab0fbe38eb798e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 29 Jan 2024 11:02:25 -0500 Subject: [PATCH 16/25] docs: fix typos --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 05326ac..35c0c47 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp- #### cuBLAS -To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing: +To install with cuBLAS, set the `LLAMA_CUBLAS=on` environment variable before installing: ```bash CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python @@ 
-87,7 +87,7 @@ CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python #### CLBlast -To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing: +To install with CLBlast, set the `LLAMA_CLBLAST=on` environment variable before installing: ```bash CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python From da003d87681f02475eedb6937443e5f07db889b0 Mon Sep 17 00:00:00 2001 From: Andrei Date: Mon, 29 Jan 2024 14:22:23 -0500 Subject: [PATCH 17/25] Automatically set chat format from gguf (#1110) * Use jinja formatter to load chat format from gguf * Fix off-by-one error in metadata loader * Implement chat format auto-detection --- llama_cpp/_internals.py | 4 ++-- llama_cpp/llama.py | 37 +++++++++++++++++++++++++++++++++- llama_cpp/llama_chat_format.py | 30 +++++++++++++++++++++++++-- llama_cpp/server/settings.py | 4 ++-- 4 files changed, 68 insertions(+), 7 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index ec47c42..651cd4c 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -216,13 +216,13 @@ class _LlamaModel: for i in range(llama_cpp.llama_model_meta_count(self.model)): nbytes = llama_cpp.llama_model_meta_key_by_index(self.model, i, buffer, buffer_size) if nbytes > buffer_size: - buffer_size = nbytes + buffer_size = nbytes + 1 buffer = ctypes.create_string_buffer(buffer_size) nbytes = llama_cpp.llama_model_meta_key_by_index(self.model, i, buffer, buffer_size) key = buffer.value.decode("utf-8") nbytes = llama_cpp.llama_model_meta_val_str_by_index(self.model, i, buffer, buffer_size) if nbytes > buffer_size: - buffer_size = nbytes + buffer_size = nbytes + 1 buffer = ctypes.create_string_buffer(buffer_size) nbytes = llama_cpp.llama_model_meta_val_str_by_index(self.model, i, buffer, buffer_size) value = buffer.value.decode("utf-8") diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 74739cb..b5618c1 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -87,7 +87,7 @@ class Llama: # Backend Params numa: bool = False, # Chat Format Params - chat_format: str = "llama-2", + chat_format: Optional[str] = None, chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None, # Misc verbose: bool = True, @@ -343,6 +343,41 @@ class Llama: if self.verbose: print(f"Model metadata: {self.metadata}", file=sys.stderr) + if self.chat_format is None and self.chat_handler is None and "tokenizer.chat_template" in self.metadata: + chat_format = llama_chat_format.guess_chat_format_from_gguf_metadata(self.metadata) + + if chat_format is not None: + self.chat_format = chat_format + if self.verbose: + print(f"Guessed chat format: {chat_format}", file=sys.stderr) + else: + template = self.metadata["tokenizer.chat_template"] + try: + eos_token_id = int(self.metadata["tokenizer.ggml.eos_token_id"]) + except: + eos_token_id = self.token_eos() + try: + bos_token_id = int(self.metadata["tokenizer.ggml.bos_token_id"]) + except: + bos_token_id = self.token_bos() + + eos_token = self.detokenize([eos_token_id]).decode("utf-8") + bos_token = self.detokenize([bos_token_id]).decode("utf-8") + + if self.verbose: + print(f"Using chat template: {template}", file=sys.stderr) + print(f"Using chat eos_token: {eos_token}", file=sys.stderr) + print(f"Using chat bos_token: {bos_token}", file=sys.stderr) + + self.chat_handler = llama_chat_format.Jinja2ChatFormatter( + template=template, + eos_token=eos_token, + bos_token=bos_token + ).to_chat_handler() + + if self.chat_format is None and self.chat_handler is None: + self.chat_format 
= "llama-2" + @property def ctx(self) -> llama_cpp.llama_context_p: assert self._ctx.ctx is not None diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 5466de3..4bc4a6c 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -14,6 +14,20 @@ import llama_cpp.llama_grammar as llama_grammar from ._utils import suppress_stdout_stderr, Singleton +### Common Chat Templates and Special Tokens ### + +# Source: https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/tokenizer_config.json +CHATML_CHAT_TEMPLATE = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" +CHATML_BOS_TOKEN = "" +CHATML_EOS_TOKEN = "<|im_end|>" + +# Source: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/blob/main/tokenizer_config.json +MISTRAL_INSTRUCT_CHAT_TEMPLATE = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" +MISTRAL_INSTRUCT_BOS_TOKEN = "" +MISTRAL_INSTRUCT_EOS_TOKEN = "" + + +### Chat Completion Handler ### class LlamaChatCompletionHandler(Protocol): """Base Protocol for a llama chat completion handler. @@ -118,7 +132,6 @@ def register_chat_completion_handler(name: str): ### Chat Formatter ### - @dataclasses.dataclass class ChatFormatterResponse: """Dataclass that stores completion parameters for a given chat format and @@ -440,7 +453,20 @@ def hf_tokenizer_config_to_chat_completion_handler( return chat_formatter_to_chat_completion_handler(chat_formatter) +def guess_chat_format_from_gguf_metadata(metadata: Dict[str, str]) -> Optional[str]: + if "tokenizer.chat_template" not in metadata: + return None + + if metadata["tokenizer.chat_template"] == CHATML_CHAT_TEMPLATE: + return "chatml" + + if metadata["tokenizer.chat_template"] == MISTRAL_INSTRUCT_CHAT_TEMPLATE: + return "mistral-instruct" + + return None + ### Utility functions for formatting chat prompts ### +# TODO: Replace these with jinja2 templates def _get_system_message( @@ -929,7 +955,6 @@ def format_openchat( _prompt = _format_chatml(system_message, _messages, _sep) return ChatFormatterResponse(prompt=_prompt, stop=_sep) - # Chat format for Saiga models, see more details and available models: # https://huggingface.co/collections/IlyaGusev/saiga2-saigamistral-6505d4ccc3d1e53166b636cd @register_chat_format("saiga") @@ -951,6 +976,7 @@ def format_saiga( _prompt += "bot" return ChatFormatterResponse(prompt=_prompt.strip()) +# Tricky chat formats that require custom chat handlers @register_chat_completion_handler("functionary") def functionary_chat_handler( diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 9f0dc8a..9fe1a7b 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -113,8 +113,8 @@ class ModelSettings(BaseSettings): description="Enable NUMA support.", ) # Chat Format Params - chat_format: str = Field( - default="llama-2", + chat_format: Optional[str] = Field( + default=None, description="Chat format to use.", ) clip_model_path: Optional[str] = 
Field( From 011cd84ded5a2150dd66e9a1bdcecdc5142112eb Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Jan 2024 09:48:09 -0500 Subject: [PATCH 18/25] Update llama.cpp --- Makefile | 6 ++++++ llama_cpp/llama_cpp.py | 2 ++ vendor/llama.cpp | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 806b120..ff1484c 100644 --- a/Makefile +++ b/Makefile @@ -30,6 +30,12 @@ build.metal: build.vulkan: CMAKE_ARGS="-DLLAMA_VULKAN=on" python3 -m pip install --verbose -e . +build.kompute: + CMAKE_ARGS="-DLLAMA_KOMPUTE=on" python3 -m pip install --verbose -e . + +build.sycl: + CMAKE_ARGS="-DLLAMA_SYCL=on" python3 -m pip install --verbose -e . + build.sdist: python3 -m build --sdist diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index c4256dd..2168579 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -187,6 +187,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6 # LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors # LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors # LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file # }; @@ -211,6 +212,7 @@ LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19 LLAMA_FTYPE_MOSTLY_IQ2_XS = 20 LLAMA_FTYPE_MOSTLY_Q2_K_S = 21 LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22 +LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23 LLAMA_FTYPE_GUESSED = 1024 # enum llama_rope_scaling_type { diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2aed77e..8f8ddfc 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 2aed77eb06a329f0d82bb1c467f4244904d4073f +Subproject commit 8f8ddfcfadc830b936318c3ea9fe2e8e3365aa85 From 13b7ced7dab618b729a486750c2e18d64d34bbd1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Jan 2024 12:21:41 -0500 Subject: [PATCH 19/25] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8f8ddfc..fea4fd4 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8f8ddfcfadc830b936318c3ea9fe2e8e3365aa85 +Subproject commit fea4fd4ba7f6b754ac795387b275e1a014a77bde From 247a16de66112b5c4924c934eb40abd139ec7fac Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Jan 2024 12:23:07 -0500 Subject: [PATCH 20/25] docs: Update README --- README.md | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 35c0c47..0a77bbd 100644 --- a/README.md +++ b/README.md @@ -12,20 +12,17 @@ This package provides: - Low-level access to C API via `ctypes` interface. 
- High-level Python API for text completion - - OpenAI-like API - - [LangChain compatibility](https://python.langchain.com/docs/integrations/llms/llamacpp) - - [LlamaIndex compatibility](https://docs.llamaindex.ai/en/stable/examples/llm/llama_2_llama_cpp.html) + - OpenAI-like API + - [LangChain compatibility](https://python.langchain.com/docs/integrations/llms/llamacpp) + - [LlamaIndex compatibility](https://docs.llamaindex.ai/en/stable/examples/llm/llama_2_llama_cpp.html) - OpenAI compatible web server - - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion) - - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling) - - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models) - - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support) + - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion) + - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling) + - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models) + - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support) Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest). - - - ## Installation `llama-cpp-python` can be installed directly from PyPI as a source distribution by running: @@ -38,7 +35,6 @@ This will build `llama.cpp` from source using cmake and your system's c compiler If you run into issues during installation add the `--verbose` flag to the `pip install` command to see the full cmake build log. - ### Installation with Specific Hardware Acceleration (BLAS, CUDA, Metal, etc) The default pip install behaviour is to build `llama.cpp` for CPU only on Linux and Windows and use Metal on MacOS. @@ -109,13 +105,29 @@ To install with Vulkan support, set the `LLAMA_VULKAN=on` environment variable b CMAKE_ARGS="-DLLAMA_VULKAN=on" pip install llama-cpp-python ``` +#### Kompute + +To install with Kompute support, set the `LLAMA_KOMPUTE=on` environment variable before installing: + +```bash +CMAKE_ARGS="-DLLAMA_KOMPUTE=on" pip install llama-cpp-python +``` + +#### SYCL + +To install with SYCL support, set the `LLAMA_SYCL=on` environment variable before installing: + +```bash +CMAKE_ARGS="-DLLAMA_SYCL=on" pip install llama-cpp-python +``` + ### Windows Notes If you run into issues where it complains it can't find `'nmake'` `'?'` or CMAKE_C_COMPILER, you can extract w64devkit as [mentioned in llama.cpp repo](https://github.com/ggerganov/llama.cpp#openblas) and add those manually to CMAKE_ARGS before running `pip` install: ```ps $env:CMAKE_GENERATOR = "MinGW Makefiles" -$env:CMAKE_ARGS = "-DLLAMA_OPENBLAS=on -DCMAKE_C_COMPILER=C:/w64devkit/bin/gcc.exe -DCMAKE_CXX_COMPILER=C:/w64devkit/bin/g++.exe" +$env:CMAKE_ARGS = "-DLLAMA_OPENBLAS=on -DCMAKE_C_COMPILER=C:/w64devkit/bin/gcc.exe -DCMAKE_CXX_COMPILER=C:/w64devkit/bin/g++.exe" ``` See the above instructions and set `CMAKE_ARGS` to the BLAS backend you want to use. 
@@ -165,7 +177,7 @@ Below is a short example demonstrating how to use the high-level API to for basi >>> from llama_cpp import Llama >>> llm = Llama( model_path="./models/7B/llama-model.gguf", - # n_gpu_layers=-1, # Uncomment to use GPU acceleration + # n_gpu_layers=-1, # Uncomment to use GPU acceleration # seed=1337, # Uncomment to set a specific seed # n_ctx=2048, # Uncomment to increase the context window ) @@ -284,7 +296,6 @@ The high-level API also provides a simple interface for function calling. Note that the only model that supports full function calling at this time is "functionary". The gguf-converted files for this model can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF) - ```python >>> from llama_cpp import Llama >>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", chat_format="functionary") @@ -293,7 +304,7 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h { "role": "system", "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary" - + }, { "role": "user", @@ -332,7 +343,6 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h ### Multi-modal Models - `llama-cpp-python` supports the llava1.5 family of multi-modal models which allow the language model to read information from both text and images. @@ -378,7 +388,6 @@ For instance, if you want to work with larger contexts, you can expand the conte llm = Llama(model_path="./models/7B/llama-model.gguf", n_ctx=2048) ``` - ## OpenAI Compatible Web Server `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. @@ -426,7 +435,8 @@ A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). ```bash docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/llama-model.gguf ghcr.io/abetlen/llama-cpp-python:latest ``` -[Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389) + +[Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389) ## Low-level API @@ -454,7 +464,6 @@ Below is a short example demonstrating how to use the low-level API to tokenize Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API. - ## Documentation Documentation is available via [https://llama-cpp-python.readthedocs.io/](https://llama-cpp-python.readthedocs.io/). 
From bf9e824922a3fa95b336ad441eca7e42f9b33358 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Tue, 30 Jan 2024 12:27:27 -0500 Subject: [PATCH 21/25] Bump version --- CHANGELOG.md | 5 +++++ llama_cpp/__init__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e99dd17..435af43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.37] + +- feat: Update llama.cpp to ggerganov/llama.cpp@fea4fd4ba7f6b754ac795387b275e1a014a77bde +- feat: Automatically set chat format from gguf by @abetlen in #1110 + ## [0.2.36] - feat: Update llama.cpp to ggerganov/llama.cpp@2aed77eb06a329f0d82bb1c467f4244904d4073f diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index f73f3d4..4ce899c 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.36" \ No newline at end of file +__version__ = "0.2.37" \ No newline at end of file From 411494706a29ee4cf3b2a9da26b970794eb18fba Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 31 Jan 2024 08:35:21 -0500 Subject: [PATCH 22/25] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index fea4fd4..1560630 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit fea4fd4ba7f6b754ac795387b275e1a014a77bde +Subproject commit 15606309a05ccf7fadbaad5538cb7c32acb1e06b From 078cca0361bf5a94d2cf52ed04980d20e32d6f95 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 31 Jan 2024 08:42:21 -0500 Subject: [PATCH 23/25] fix: Pass raise_exception and add_generation_prompt to jinja2 chat template --- llama_cpp/llama_chat_format.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 4bc4a6c..08f991b 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -185,16 +185,17 @@ class Jinja2ChatFormatter(ChatFormatter): messages: List[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: - if self.add_generation_prompt: - messages = [ - *messages, - llama_types.ChatCompletionRequestAssistantMessage( - role="assistant", content="" - ), - ] + def raise_exception(message: str): + raise ValueError(message) + prompt = self._environment.render( - messages=messages, eos_token=self.eos_token, bos_token=self.bos_token + messages=messages, + eos_token=self.eos_token, + bos_token=self.bos_token, + raise_exception=raise_exception, + add_generation_prompt=self.add_generation_prompt ) + return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token]) def to_chat_handler(self) -> LlamaChatCompletionHandler: From 2b37d8e438ff6285b586ca92cc165f52d1f5afdc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 31 Jan 2024 10:37:19 -0500 Subject: [PATCH 24/25] fix: Run server command. Closes #1143 --- examples/high_level_api/fastapi_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py index 4b3189d..9421db5 100644 --- a/examples/high_level_api/fastapi_server.py +++ b/examples/high_level_api/fastapi_server.py @@ -9,7 +9,7 @@ export MODEL=../models/7B/... 
Then run: ``` -uvicorn llama_cpp.server.app:app --reload +uvicorn --factory llama_cpp.server.app:create_app --reload ``` or From 71e3e4c435826677ff671f1d82748fe1dd4d64e1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 31 Jan 2024 10:41:42 -0500 Subject: [PATCH 25/25] Update llama.cpp --- llama_cpp/llama_cpp.py | 41 ++++++++++++++++++++++++++++++++++------- vendor/llama.cpp | 2 +- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 2168579..431a99f 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -98,7 +98,7 @@ ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE(c_bool, c_void_p, c_bool, c_ # llama.h bindings _lib.llama_max_devices.argtypes = [] -_lib.llama_max_devices.restype = ctypes.c_int32 +_lib.llama_max_devices.restype = ctypes.c_size_t LLAMA_MAX_DEVICES = _lib.llama_max_devices() @@ -390,7 +390,7 @@ class llama_model_kv_override(Structure): # // LLAMA_SPLIT_LAYER: ignored # int32_t main_gpu; -# // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES +# // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() # const float * tensor_split; # // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. @@ -417,7 +417,7 @@ class llama_model_params(Structure): n_gpu_layers (int): number of layers to store in VRAM split_mode (int): how to split the model across multiple GPUs main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored - tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES + tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted. 
progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data @@ -760,16 +760,43 @@ _lib.llama_time_us.argtypes = [] _lib.llama_time_us.restype = ctypes.c_int64 -# LLAMA_API int32_t llama_max_devices(void); +# LLAMA_API size_t llama_max_devices(void); def llama_max_devices() -> int: return _lib.llama_max_devices() _lib.llama_max_devices.argtypes = [] -_lib.llama_max_devices.restype = ctypes.c_int32 +_lib.llama_max_devices.restype = ctypes.c_size_t -# LLAMA_API bool llama_mmap_supported (void); +# LLAMA_API bool llama_supports_mmap (void); +def llama_supports_mmap() -> bool: + return _lib.llama_supports_mmap() + + +_lib.llama_supports_mmap.argtypes = [] +_lib.llama_supports_mmap.restype = c_bool + + +# LLAMA_API bool llama_supports_mlock (void); +def llama_supports_mlock() -> bool: + return _lib.llama_supports_mlock() + + +_lib.llama_supports_mlock.argtypes = [] +_lib.llama_supports_mlock.restype = c_bool + + +# LLAMA_API bool llama_supports_gpu_offload(void); +def llama_supports_gpu_offload() -> bool: + return _lib.llama_supports_gpu_offload() + + +_lib.llama_supports_gpu_offload.argtypes = [] +_lib.llama_supports_gpu_offload.restype = c_bool + + +# LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead"); def llama_mmap_supported() -> bool: return _lib.llama_mmap_supported() @@ -778,7 +805,7 @@ _lib.llama_mmap_supported.argtypes = [] _lib.llama_mmap_supported.restype = c_bool -# LLAMA_API bool llama_mlock_supported(void); +# LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead"); def llama_mlock_supported() -> bool: return _lib.llama_mlock_supported() diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1560630..5cb04db 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 15606309a05ccf7fadbaad5538cb7c32acb1e06b +Subproject commit 5cb04dbc16d1da38c8fdcc0111b40e67d00dd1c3