From cf1fdd8a9a9f461c095b45c2797fb9f19576ae9c Mon Sep 17 00:00:00 2001 From: Douglas Hanley Date: Thu, 29 Feb 2024 12:55:50 -0600 Subject: [PATCH 01/15] docs: fix typo in README.md embeddings example. (#1232) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8d0980e..32c624f 100644 --- a/README.md +++ b/README.md @@ -525,7 +525,7 @@ To generate text embeddings use [`create_embedding`](http://localhost:8000/api-r ```python import llama_cpp -llm = llama_cpp.Llama(model_path="path/to/model.gguf", embeddings=True) +llm = llama_cpp.Llama(model_path="path/to/model.gguf", embedding=True) embeddings = llm.create_embedding("Hello, world!") From f062a7f51d9826df36b605ec8664df6b84a70a1b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 1 Mar 2024 12:57:16 -0500 Subject: [PATCH 02/15] feat: Update llama.cpp --- llama_cpp/llama.py | 4 ---- llama_cpp/llama_cpp.py | 10 +++------- vendor/llama.cpp | 2 +- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index d1bac9b..70498f3 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -86,7 +86,6 @@ class Llama: yarn_beta_fast: float = 32.0, yarn_beta_slow: float = 1.0, yarn_orig_ctx: int = 0, - mul_mat_q: bool = True, logits_all: bool = False, embedding: bool = False, offload_kqv: bool = True, @@ -291,7 +290,6 @@ class Llama: yarn_beta_slow if yarn_beta_slow != 0.0 else 0 ) self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 - self.context_params.mul_mat_q = mul_mat_q self.context_params.logits_all = ( logits_all if draft_model is None else True ) # Must be set to True for speculative decoding @@ -1724,7 +1722,6 @@ class Llama: yarn_beta_fast=self.context_params.yarn_beta_fast, yarn_beta_slow=self.context_params.yarn_beta_slow, yarn_orig_ctx=self.context_params.yarn_orig_ctx, - mul_mat_q=self.context_params.mul_mat_q, logits_all=self.context_params.logits_all, embedding=self.context_params.embedding, # Sampling Params @@ -1768,7 +1765,6 @@ class Llama: yarn_beta_fast=state["yarn_beta_fast"], yarn_beta_slow=state["yarn_beta_slow"], yarn_orig_ctx=state["yarn_orig_ctx"], - mul_mat_q=state["mul_mat_q"], logits_all=state["logits_all"], embedding=state["embedding"], # Sampling Params diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 038a6f8..1593256 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -559,9 +559,7 @@ class llama_model_params(ctypes.Structure): # enum ggml_type type_k; // data type for K cache # enum ggml_type type_v; // data type for V cache - # // Keep the booleans together to avoid misalignment during copy-by-value. 
-# bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) # bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) # bool embedding; // embedding mode only # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU @@ -589,7 +587,6 @@ class llama_context_params(ctypes.Structure): cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval type_k (int): data type for K cache type_v (int): data type for V cache - mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true) logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) embedding (bool): embedding mode only offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU @@ -615,7 +612,6 @@ class llama_context_params(ctypes.Structure): ("cb_eval_user_data", ctypes.c_void_p), ("type_k", ctypes.c_int), ("type_v", ctypes.c_int), - ("mul_mat_q", ctypes.c_bool), ("logits_all", ctypes.c_bool), ("embedding", ctypes.c_bool), ("offload_kqv", ctypes.c_bool), @@ -1519,11 +1515,11 @@ def llama_copy_state_data( ... -# Set the state reading from the specified address -# Returns the number of bytes read +# // Set the state reading from the specified address +# // Returns the number of bytes read # LLAMA_API size_t llama_set_state_data( # struct llama_context * ctx, -# uint8_t * src); +# const uint8_t * src); @ctypes_function( "llama_set_state_data", [llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8)], diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 08c5ee8..c2224f0 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 08c5ee87e4cceb603ecceac90734fcdade57311b +Subproject commit c2224f003bf9cf558b1a3c57033563e11a4de9a5 From 97aa3a153debe25df874055a6f96db0ac943091c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 1 Mar 2024 13:10:25 -0500 Subject: [PATCH 03/15] docs: Add information re: auto chat formats. Closes #1236 --- README.md | 11 ++++++++++- llama_cpp/llama.py | 4 +++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 32c624f..1d296e9 100644 --- a/README.md +++ b/README.md @@ -286,7 +286,16 @@ By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest The high-level API also provides a simple interface for chat completion. -Note that `chat_format` option must be set for the particular model you are using. +Chat completion requires that the model know how to format the messages into a single prompt. +The `Llama` class does this using pre-registered chat formats (ie. `chatml`, `llama-2`, `gemma`, etc) or by providing a custom chat handler object. + +The model will will format the messages into a single prompt using the following order of precedence: + - Use the `chat_handler` if provided + - Use the `chat_format` if provided + - Use the `tokenizer.chat_template` from the `gguf` model's metadata (should work for most new models, older models may not have this) + - else, fallback to the `llama-2` chat format + +Set `verbose=True` to see the selected chat format. 
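For example, a minimal sketch of both paths (the model path below is a placeholder): let the chat format be auto-detected from the model metadata, or name a pre-registered format explicitly.

```python
from llama_cpp import Llama

# Auto-detect: with verbose=True the selected chat format is printed to stderr.
llm = Llama(model_path="path/to/model.gguf", verbose=True)

# Explicit override: force one of the pre-registered formats instead.
llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
```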
```python >>> from llama_cpp import Llama diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 70498f3..108a4cf 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -410,7 +410,7 @@ class Llama: bos_token = self._model.token_get_text(bos_token_id) if self.verbose: - print(f"Using chat template: {template}", file=sys.stderr) + print(f"Using gguf chat template: {template}", file=sys.stderr) print(f"Using chat eos_token: {eos_token}", file=sys.stderr) print(f"Using chat bos_token: {bos_token}", file=sys.stderr) @@ -420,6 +420,8 @@ class Llama: if self.chat_format is None and self.chat_handler is None: self.chat_format = "llama-2" + if self.verbose: + print(f"Using fallback chat format: {chat_format}", file=sys.stderr) @property def ctx(self) -> llama_cpp.llama_context_p: From d5df431278433b580e52222dbf4174f5102585b1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 1 Mar 2024 13:15:16 -0500 Subject: [PATCH 04/15] chore: Bump version --- CHANGELOG.md | 5 +++++ llama_cpp/__init__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d7a96a9..375c6ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.54] + +- feat: Update llama.cpp to ggerganov/llama.cpp@cb49e0f8c906e5da49e9f6d64a57742a9a241c6a +- docs: fix typo in README.md embeddings example by @iamlemec in #1232 + ## [0.2.53] - feat: Update llama.cpp to ggerganov/llama.cpp@cb49e0f8c906e5da49e9f6d64a57742a9a241c6a diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index aa0536c..a9a8222 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.53" \ No newline at end of file +__version__ = "0.2.54" \ No newline at end of file From 0e70984fb69d621c191913bf870b7d9201bcc3d5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 2 Mar 2024 22:20:04 -0500 Subject: [PATCH 05/15] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 36 ++++++++++++++++++++++++++++++++++-- vendor/llama.cpp | 2 +- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 1593256..88ba41c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -148,6 +148,12 @@ ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE( ctypes.c_bool, ctypes.c_void_p, ctypes.c_bool, ctypes.c_void_p ) +# // Abort callback +# // If not NULL, called before ggml computation +# // If it returns true, the computation is aborted +# typedef bool (*ggml_abort_callback)(void * data); +ggml_abort_callback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p) + # llama.h bindings _lib.llama_max_devices.argtypes = [] @@ -560,10 +566,16 @@ class llama_model_params(ctypes.Structure): # enum ggml_type type_v; // data type for V cache # // Keep the booleans together to avoid misalignment during copy-by-value. 
-# bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) +# bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) # bool embedding; // embedding mode only # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU # bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) + +# // Abort callback +# // if it returns true, execution of llama_decode() will be aborted +# // currently works only with CPU execution +# ggml_abort_callback abort_callback; +# void * abort_callback_data; # }; class llama_context_params(ctypes.Structure): """Parameters for llama_context @@ -591,6 +603,8 @@ class llama_context_params(ctypes.Structure): embedding (bool): embedding mode only offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) + abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted + abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback """ _fields_ = [ @@ -616,6 +630,8 @@ class llama_context_params(ctypes.Structure): ("embedding", ctypes.c_bool), ("offload_kqv", ctypes.c_bool), ("do_pooling", ctypes.c_bool), + ("abort_callback", ggml_abort_callback), + ("abort_callback_data", ctypes.c_void_p), ] @@ -1703,8 +1719,24 @@ def llama_set_n_threads( """ ... +# // Set abort callback +# LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data); +@ctypes_function( + "llama_set_abort_callback", + [llama_context_p_ctypes, ggml_abort_callback, ctypes.c_void_p], + None, +) +def llama_set_abort_callback( + ctx: llama_context_p, + abort_callback: Callable[[ctypes.c_void_p], None], + abort_callback_data: ctypes.c_void_p, + /, +): + """Set abort callback""" + ... -# // Token logits obtained from the last call to llama_eval() + +# // Token logits obtained from the last call to llama_decode() # // The logits for the last token are stored in the last row # // Logits for which llama_batch.logits[i] == 0 are undefined # // Rows: n_tokens provided with llama_batch diff --git a/vendor/llama.cpp b/vendor/llama.cpp index c2224f0..9731134 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit c2224f003bf9cf558b1a3c57033563e11a4de9a5 +Subproject commit 9731134296af3a6839cd682e51d9c2109a871de5 From 663659f7301963e0a3e98662e14668a6632c6295 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Sun, 3 Mar 2024 04:20:41 +0100 Subject: [PATCH 06/15] docs: fix small typo in README: 'model know how' -> 'model knows how' (#1244) Co-authored-by: Andrei --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1d296e9..3323f38 100644 --- a/README.md +++ b/README.md @@ -286,7 +286,7 @@ By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest The high-level API also provides a simple interface for chat completion. -Chat completion requires that the model know how to format the messages into a single prompt. +Chat completion requires that the model knows how to format the messages into a single prompt. The `Llama` class does this using pre-registered chat formats (ie. 
`chatml`, `llama-2`, `gemma`, etc) or by providing a custom chat handler object. The model will will format the messages into a single prompt using the following order of precedence: From 13177aae0f674100f7a7d23c54fc9f14012bf6a2 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 2 Mar 2024 22:46:40 -0500 Subject: [PATCH 07/15] chore: Bump version --- CHANGELOG.md | 5 +++++ llama_cpp/__init__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 375c6ef..e16a6df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.55] + +- feat: Update llama.cpp to ggerganov/9731134296af3a6839cd682e51d9c2109a871de5 +- docs: fix small typo in README: 'model know how' -> 'model knows how' by @boegel in #1244 + ## [0.2.54] - feat: Update llama.cpp to ggerganov/llama.cpp@cb49e0f8c906e5da49e9f6d64a57742a9a241c6a diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index a9a8222..519ab51 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.54" \ No newline at end of file +__version__ = "0.2.55" \ No newline at end of file From 87a6e5797eb7b0cd63ad27c528fb950c80c84ad8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 3 Mar 2024 11:27:04 -0500 Subject: [PATCH 08/15] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 14 +++++++++----- vendor/llama.cpp | 2 +- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 88ba41c..08adfe2 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -320,10 +320,12 @@ LLAMA_ROPE_SCALING_TYPE_YARN = 2 LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN # enum llama_pooling_type { +# LLAMA_POOLING_TYPE_UNSPECIFIED = -1, # LLAMA_POOLING_TYPE_NONE = 0, # LLAMA_POOLING_TYPE_MEAN = 1, # LLAMA_POOLING_TYPE_CLS = 2, # }; +LLAMA_POOLING_TYPE_UNSPECIFIED = -1 LLAMA_POOLING_TYPE_NONE = 0 LLAMA_POOLING_TYPE_MEAN = 1 LLAMA_POOLING_TYPE_CLS = 2 @@ -547,7 +549,10 @@ class llama_model_params(ctypes.Structure): # uint32_t n_batch; // prompt processing maximum batch size # uint32_t n_threads; // number of threads to use for generation # uint32_t n_threads_batch; // number of threads to use for batch processing -# int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` + +# enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` +# enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id +# // (ignored if no pooling layer) # // ref: https://github.com/ggerganov/llama.cpp/pull/2054 # float rope_freq_base; // RoPE base frequency, 0 = from model @@ -569,7 +574,6 @@ class llama_model_params(ctypes.Structure): # bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) # bool embedding; // embedding mode only # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU -# bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) # // Abort callback # // if it returns true, execution of llama_decode() will be aborted @@ -587,6 +591,7 @@ class llama_context_params(ctypes.Structure): n_threads (int): number of threads to use for generation n_threads_batch (int): number of threads to use for 
batch processing rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type` + pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) rope_freq_base (float): RoPE base frequency, 0 = from model rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model @@ -602,7 +607,6 @@ class llama_context_params(ctypes.Structure): logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) embedding (bool): embedding mode only offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU - do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback """ @@ -613,7 +617,8 @@ class llama_context_params(ctypes.Structure): ("n_batch", ctypes.c_uint32), ("n_threads", ctypes.c_uint32), ("n_threads_batch", ctypes.c_uint32), - ("rope_scaling_type", ctypes.c_int32), + ("rope_scaling_type", ctypes.c_int), + ("pooling_type", ctypes.c_int), ("rope_freq_base", ctypes.c_float), ("rope_freq_scale", ctypes.c_float), ("yarn_ext_factor", ctypes.c_float), @@ -629,7 +634,6 @@ class llama_context_params(ctypes.Structure): ("logits_all", ctypes.c_bool), ("embedding", ctypes.c_bool), ("offload_kqv", ctypes.c_bool), - ("do_pooling", ctypes.c_bool), ("abort_callback", ggml_abort_callback), ("abort_callback_data", ctypes.c_void_p), ] diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 9731134..67be2ce 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 9731134296af3a6839cd682e51d9c2109a871de5 +Subproject commit 67be2ce1015d070b3b2cd488bcb041eefb61de72 From 93dc56ace8e3de97f6f39a7071ff63aaf29d376f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 6 Mar 2024 01:32:00 -0500 Subject: [PATCH 09/15] Update llama.cpp --- llama_cpp/llama.py | 6 +++--- llama_cpp/llama_cpp.py | 34 ++++++++++++++++++++++++++-------- vendor/llama.cpp | 2 +- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 108a4cf..7187b4a 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -293,7 +293,7 @@ class Llama: self.context_params.logits_all = ( logits_all if draft_model is None else True ) # Must be set to True for speculative decoding - self.context_params.embedding = embedding + self.context_params.embeddings = embedding # TODO: Rename to embeddings self.context_params.offload_kqv = offload_kqv # Sampling Params @@ -787,7 +787,7 @@ class Llama: n_embd = self.n_embd() n_batch = self.n_batch - if self.context_params.embedding == False: + if self.context_params.embeddings == False: raise RuntimeError( "Llama model must be created with embedding=True to call this method" ) @@ -1725,7 +1725,7 @@ class Llama: yarn_beta_slow=self.context_params.yarn_beta_slow, yarn_orig_ctx=self.context_params.yarn_orig_ctx, logits_all=self.context_params.logits_all, - embedding=self.context_params.embedding, + embedding=self.context_params.embeddings, # Sampling Params last_n_tokens_size=self.last_n_tokens_size, # LoRA Params diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 08adfe2..92b9676 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -399,7 +399,7 @@ 
llama_progress_callback = ctypes.CFUNCTYPE( # // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) # // - pos : the positions of the respective token in the sequence # // - seq_id : the sequence to which the respective token belongs -# // - logits : if zero, the logits for the respective token will not be output +# // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output # // # typedef struct llama_batch { # int32_t n_tokens; @@ -409,7 +409,7 @@ llama_progress_callback = ctypes.CFUNCTYPE( # llama_pos * pos; # int32_t * n_seq_id; # llama_seq_id ** seq_id; -# int8_t * logits; +# int8_t * logits; // TODO: rename this to "output" # // NOTE: helpers for smooth API transition - can be deprecated in the future @@ -572,7 +572,7 @@ class llama_model_params(ctypes.Structure): # // Keep the booleans together to avoid misalignment during copy-by-value. # bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) -# bool embedding; // embedding mode only +# bool embeddings; // if true, extract embeddings (together with logits) # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU # // Abort callback @@ -605,7 +605,7 @@ class llama_context_params(ctypes.Structure): type_k (int): data type for K cache type_v (int): data type for V cache logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) - embedding (bool): embedding mode only + embeddings (bool): if true, extract embeddings (together with logits) offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback @@ -632,7 +632,7 @@ class llama_context_params(ctypes.Structure): ("type_k", ctypes.c_int), ("type_v", ctypes.c_int), ("logits_all", ctypes.c_bool), - ("embedding", ctypes.c_bool), + ("embeddings", ctypes.c_bool), ("offload_kqv", ctypes.c_bool), ("abort_callback", ggml_abort_callback), ("abort_callback_data", ctypes.c_void_p), @@ -1774,8 +1774,8 @@ def llama_get_logits_ith( ... -# Get the embeddings for the input -# shape: [n_embd] (1-dimensional) +# // Get all output token embeddings +# // shape: [n_tokens*n_embd] (1-dimensional) # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); @ctypes_function( "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) @@ -1786,8 +1786,9 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float] ... -# // Get the embeddings for the ith sequence +# // Get the embeddings for the ith token # // llama_get_embeddings(ctx) + i*n_embd +# // shape: [n_embd] (1-dimensional) # LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); @ctypes_function( "llama_get_embeddings_ith", @@ -1802,6 +1803,23 @@ def llama_get_embeddings_ith( ... 
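# NOTE (illustrative sketch, not part of the upstream header): given a loaded
# context `ctx`, a token index `i`, and the model's embedding size `n_embd`
# (all assumed here), the float pointer returned by these getters can be
# sliced into a plain Python list:
#
#     vec = llama_get_embeddings_ith(ctx, i)[:n_embd]
#
# which copies the n_embd floats for token i out of the output buffer.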
+# // Get the embeddings for a sequence id +# // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE +# // shape: [n_embd] (1-dimensional) +# LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id); +@ctypes_function( + "llama_get_embeddings_seq", + [llama_context_p_ctypes, llama_seq_id], + ctypes.POINTER(ctypes.c_float), +) +def llama_get_embeddings_seq( + ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / +) -> CtypesArray[ctypes.c_float]: + """Get the embeddings for a sequence id + Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE + shape: [n_embd] (1-dimensional)""" + ... + # // # // Vocab # // diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 67be2ce..8ced9f7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 67be2ce1015d070b3b2cd488bcb041eefb61de72 +Subproject commit 8ced9f7e3225adb8501e9821ed1bbd92e3a5c7ae From 40c6b54f6880e1cbb8f6393d9097328ffd422e13 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 8 Mar 2024 20:58:50 -0500 Subject: [PATCH 10/15] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 16 ++++++++++++---- vendor/llama.cpp | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 92b9676..0176e49 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -429,10 +429,12 @@ class llama_batch(ctypes.Structure): The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens Attributes: + n_tokens (int): number of tokens token (ctypes.Array[llama_token]): the token ids of the input (used when embd is NULL) embd (ctypes.Array[ctypes.ctypes.c_float]): token embeddings (i.e. float vector of size n_embd) (used when token is NULL) pos (ctypes.Array[ctypes.Array[llama_pos]]): the positions of the respective token in the sequence seq_id (ctypes.Array[ctypes.Array[llama_seq_id]]): the sequence to which the respective token belongs + logits (ctypes.Array[ctypes.ctypes.c_int8]): if zero, the logits for the respective token will not be output """ _fields_ = [ @@ -547,6 +549,7 @@ class llama_model_params(ctypes.Structure): # uint32_t seed; // RNG seed, -1 for random # uint32_t n_ctx; // text context, 0 = from model # uint32_t n_batch; // prompt processing maximum batch size +# uint32_t n_parallel; // number of parallel sequences (i.e. distinct states for recurrent models) # uint32_t n_threads; // number of threads to use for generation # uint32_t n_threads_batch; // number of threads to use for batch processing @@ -588,6 +591,7 @@ class llama_context_params(ctypes.Structure): seed (int): RNG seed, -1 for random n_ctx (int): text context, 0 = from model n_batch (int): prompt processing maximum batch size + n_parallel (int): number of parallel sequences (i.e. 
distinct states for recurrent models) n_threads (int): number of threads to use for generation n_threads_batch (int): number of threads to use for batch processing rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type` @@ -615,6 +619,7 @@ class llama_context_params(ctypes.Structure): ("seed", ctypes.c_uint32), ("n_ctx", ctypes.c_uint32), ("n_batch", ctypes.c_uint32), + ("n_parallel", ctypes.c_uint32), ("n_threads", ctypes.c_uint32), ("n_threads_batch", ctypes.c_uint32), ("rope_scaling_type", ctypes.c_int), @@ -1322,7 +1327,7 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): # // seq_id < 0 : match any sequence # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_rm( +# LLAMA_API bool llama_kv_cache_seq_rm( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, @@ -1335,7 +1340,7 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): llama_pos, llama_pos, ], - None, + ctypes.c_bool, ) def llama_kv_cache_seq_rm( ctx: llama_context_p, @@ -1343,7 +1348,7 @@ def llama_kv_cache_seq_rm( p0: Union[llama_pos, int], p1: Union[llama_pos, int], /, -): +) -> bool: """Removes all tokens that belong to the specified sequence and have positions in [p0, p1) seq_id < 0 : match any sequence p0 < 0 : [0, p1] @@ -1754,7 +1759,10 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: The logits for the last token are stored in the last row Logits for which llama_batch.logits[i] == 0 are undefined Rows: n_tokens provided with llama_batch - Cols: n_vocab""" + Cols: n_vocab + + Returns: + Pointer to the logits buffer of shape (n_tokens, n_vocab)""" ... diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8ced9f7..c2101a2 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8ced9f7e3225adb8501e9821ed1bbd92e3a5c7ae +Subproject commit c2101a2e909ac7c08976d414e64e96c90ee5fa9e From 2811014bae356401856a9c0796f42e719f2e8c3c Mon Sep 17 00:00:00 2001 From: Douglas Hanley Date: Fri, 8 Mar 2024 19:59:35 -0600 Subject: [PATCH 11/15] feat: Switch embed to llama_get_embeddings_seq (#1263) * switch to llama_get_embeddings_seq * Remove duplicate definition of llama_get_embeddings_seq Co-authored-by: Andrei --------- Co-authored-by: Andrei --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7187b4a..aabbb7e 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -814,7 +814,7 @@ class Llama: # store embeddings for i in range(n_seq): - embedding: List[float] = llama_cpp.llama_get_embeddings_ith( + embedding: List[float] = llama_cpp.llama_get_embeddings_seq( self._ctx.ctx, i )[:n_embd] if normalize: From 1f3156d4f2d8c5439dbb2ad72b8c1de84703eb09 Mon Sep 17 00:00:00 2001 From: Kevin Cao Date: Fri, 8 Mar 2024 21:00:10 -0500 Subject: [PATCH 12/15] fix: Check for existence of clip model path (#1264) --- llama_cpp/llama_chat_format.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 69ed601..4eb2b02 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1848,6 +1848,9 @@ class Llava15ChatHandler: self.verbose = verbose self._clip_free = self._llava_cpp._libllava.clip_free # type: ignore + if not os.path.exists(clip_model_path): + raise ValueError(f"Clip model path does not exist: {clip_model_path}") + with suppress_stdout_stderr(disable=self.verbose): self.clip_ctx = self._llava_cpp.clip_model_load( 
self.clip_model_path.encode(), 0 From c139f8b5d50f6f416a24c0ba65983a3fb84bf2f3 Mon Sep 17 00:00:00 2001 From: Felipe Lorenz Date: Fri, 8 Mar 2024 21:09:00 -0500 Subject: [PATCH 13/15] feat: Add endpoints for tokenize, detokenize and count tokens (#1136) * Add endpoint to count tokens * Add tokenize and detokenize endpoints * Change response key to tokens for tokenize endpoint * Fix dependency bug * Cleanup * Remove example added by mistake * Move tokenize, detokenize, and count to Extras namespace. Tag existing endpoints --------- Co-authored-by: Andrei Betlen --- llama_cpp/server/app.py | 71 +++++++++++++++++++++++++++++++++++++-- llama_cpp/server/types.py | 36 ++++++++++++++++++++ 2 files changed, 105 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ec92809..aa6afc1 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -41,6 +41,11 @@ from llama_cpp.server.types import ( CreateEmbeddingRequest, CreateChatCompletionRequest, ModelList, + TokenizeInputRequest, + TokenizeInputResponse, + TokenizeInputCountResponse, + DetokenizeInputRequest, + DetokenizeInputResponse, ) from llama_cpp.server.errors import RouteErrorHandler @@ -196,6 +201,9 @@ async def authenticate( ) +openai_v1_tag = "OpenAI V1" + + @router.post( "/v1/completions", summary="Completion", @@ -227,11 +235,13 @@ async def authenticate( }, } }, + tags=[openai_v1_tag], ) @router.post( "/v1/engines/copilot-codex/completions", include_in_schema=False, dependencies=[Depends(authenticate)], + tags=[openai_v1_tag], ) async def create_completion( request: Request, @@ -297,7 +307,10 @@ async def create_completion( @router.post( - "/v1/embeddings", summary="Embedding", dependencies=[Depends(authenticate)] + "/v1/embeddings", + summary="Embedding", + dependencies=[Depends(authenticate)], + tags=[openai_v1_tag], ) async def create_embedding( request: CreateEmbeddingRequest, @@ -339,6 +352,7 @@ async def create_embedding( }, } }, + tags=[openai_v1_tag], ) async def create_chat_completion( request: Request, @@ -391,7 +405,12 @@ async def create_chat_completion( return iterator_or_completion -@router.get("/v1/models", summary="Models", dependencies=[Depends(authenticate)]) +@router.get( + "/v1/models", + summary="Models", + dependencies=[Depends(authenticate)], + tags=[openai_v1_tag], +) async def get_models( llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> ModelList: @@ -407,3 +426,51 @@ async def get_models( for model_alias in llama_proxy ], } + + +extras_tag = "Extras" + + +@router.post( + "/extras/tokenize", + summary="Tokenize", + dependencies=[Depends(authenticate)], + tags=[extras_tag], +) +async def tokenize( + body: TokenizeInputRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> TokenizeInputResponse: + tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) + + return {"tokens": tokens} + + +@router.post( + "/extras/tokenize/count", + summary="Tokenize Count", + dependencies=[Depends(authenticate)], + tags=[extras_tag], +) +async def count_query_tokens( + body: TokenizeInputRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> TokenizeInputCountResponse: + tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) + + return {"count": len(tokens)} + + +@router.post( + "/extras/detokenize", + summary="Detokenize", + dependencies=[Depends(authenticate)], + tags=[extras_tag], +) +async def detokenize( + body: DetokenizeInputRequest, + llama_proxy: LlamaProxy = 
Depends(get_llama_proxy), +) -> DetokenizeInputResponse: + text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8") + + return {"text": text} diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index 9a4b81e..c8b2ebc 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -264,3 +264,39 @@ class ModelData(TypedDict): class ModelList(TypedDict): object: Literal["list"] data: List[ModelData] + + +class TokenizeInputRequest(BaseModel): + model: Optional[str] = model_field + input: Optional[str] = Field(description="The input to tokenize.") + + model_config = { + "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]} + } + + +class TokenizeInputResponse(BaseModel): + tokens: List[int] = Field(description="A list of tokens.") + + model_config = {"json_schema_extra": {"example": {"tokens": [123, 321, 222]}}} + + +class TokenizeInputCountResponse(BaseModel): + count: int = Field(description="The number of tokens in the input.") + + model_config = {"json_schema_extra": {"example": {"count": 5}}} + + +class DetokenizeInputRequest(BaseModel): + model: Optional[str] = model_field + tokens: List[int] = Field(description="A list of toekns to detokenize.") + + model_config = {"json_schema_extra": {"example": [{"tokens": [123, 321, 222]}]}} + + +class DetokenizeInputResponse(BaseModel): + text: str = Field(description="The detokenized text.") + + model_config = { + "json_schema_extra": {"example": {"text": "How many tokens in this query?"}} + } From d02a9cf16ff88ad011e2eb1ce29f4d9400f13cd1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 8 Mar 2024 21:10:53 -0500 Subject: [PATCH 14/15] Fixed json strings grammar by blacklisting character control set. Closes #1259 --- llama_cpp/llama_grammar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py index 6a37857..9cc48a9 100644 --- a/llama_cpp/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -1337,7 +1337,7 @@ array ::= string ::= "\"" ( - [^"\\] | + [^"\\\x7F\x00-\x1F] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes )* "\"" ws @@ -1366,7 +1366,7 @@ array ::= string ::= "\"" ( - [^"\\] | + [^"\\\x7F\x00-\x1F] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes )* "\"" ws From a7281994d87927e42d8e636295c786057e98d8fe Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 8 Mar 2024 21:14:44 -0500 Subject: [PATCH 15/15] chore: Bump version --- CHANGELOG.md | 8 ++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e16a6df..90dd1e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.56] + +- feat: Update llama.cpp to ggerganov/llama.cpp@c2101a2e909ac7c08976d414e64e96c90ee5fa9e +- feat(server): Add endpoints for tokenize, detokenize and count tokens by @felipelo in #1136 +- feat: Switch embed to llama_get_embeddings_seq by @iamlemec in #1263 +- fix: Fixed json strings grammar by blacklisting character control set by @ExtReMLapin in d02a9cf16ff88ad011e2eb1ce29f4d9400f13cd1 +- fix: Check for existence of clip model path by @kejcao in #1264 + ## [0.2.55] - feat: Update llama.cpp to ggerganov/9731134296af3a6839cd682e51d9c2109a871de5 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 519ab51..fcbc715 100644 --- a/llama_cpp/__init__.py +++ 
b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.55" \ No newline at end of file +__version__ = "0.2.56" \ No newline at end of file
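A brief usage sketch for the `/extras` endpoints introduced in PATCH 13/15. Everything here is assumed for illustration and is not part of the patches above: a llama-cpp-python server already running at `http://localhost:8000` with no API key configured, the default model alias, and the third-party `requests` package.

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed server address

# Tokenize a string into token ids.
r = requests.post(f"{BASE_URL}/extras/tokenize", json={"input": "Hello, world!"})
tokens = r.json()["tokens"]

# Count tokens without returning them.
r = requests.post(f"{BASE_URL}/extras/tokenize/count", json={"input": "Hello, world!"})
count = r.json()["count"]

# Round-trip the token ids back into text.
r = requests.post(f"{BASE_URL}/extras/detokenize", json={"tokens": tokens})
text = r.json()["text"]

print(count, text)
```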