From cf1fdd8a9a9f461c095b45c2797fb9f19576ae9c Mon Sep 17 00:00:00 2001 From: Douglas Hanley Date: Thu, 29 Feb 2024 12:55:50 -0600 Subject: [PATCH 01/15] docs: fix typo in README.md embeddings example. (#1232) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8d0980e..32c624f 100644 --- a/README.md +++ b/README.md @@ -525,7 +525,7 @@ To generate text embeddings use [`create_embedding`](http://localhost:8000/api-r ```python import llama_cpp -llm = llama_cpp.Llama(model_path="path/to/model.gguf", embeddings=True) +llm = llama_cpp.Llama(model_path="path/to/model.gguf", embedding=True) embeddings = llm.create_embedding("Hello, world!") From f062a7f51d9826df36b605ec8664df6b84a70a1b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 1 Mar 2024 12:57:16 -0500 Subject: [PATCH 02/15] feat: Update llama.cpp --- llama_cpp/llama.py | 4 ---- llama_cpp/llama_cpp.py | 10 +++------- vendor/llama.cpp | 2 +- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index d1bac9b..70498f3 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -86,7 +86,6 @@ class Llama: yarn_beta_fast: float = 32.0, yarn_beta_slow: float = 1.0, yarn_orig_ctx: int = 0, - mul_mat_q: bool = True, logits_all: bool = False, embedding: bool = False, offload_kqv: bool = True, @@ -291,7 +290,6 @@ class Llama: yarn_beta_slow if yarn_beta_slow != 0.0 else 0 ) self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 - self.context_params.mul_mat_q = mul_mat_q self.context_params.logits_all = ( logits_all if draft_model is None else True ) # Must be set to True for speculative decoding @@ -1724,7 +1722,6 @@ class Llama: yarn_beta_fast=self.context_params.yarn_beta_fast, yarn_beta_slow=self.context_params.yarn_beta_slow, yarn_orig_ctx=self.context_params.yarn_orig_ctx, - mul_mat_q=self.context_params.mul_mat_q, logits_all=self.context_params.logits_all, embedding=self.context_params.embedding, # Sampling Params @@ -1768,7 +1765,6 @@ class Llama: yarn_beta_fast=state["yarn_beta_fast"], yarn_beta_slow=state["yarn_beta_slow"], yarn_orig_ctx=state["yarn_orig_ctx"], - mul_mat_q=state["mul_mat_q"], logits_all=state["logits_all"], embedding=state["embedding"], # Sampling Params diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 038a6f8..1593256 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -559,9 +559,7 @@ class llama_model_params(ctypes.Structure): # enum ggml_type type_k; // data type for K cache # enum ggml_type type_v; // data type for V cache - # // Keep the booleans together to avoid misalignment during copy-by-value. 
-# bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) # bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) # bool embedding; // embedding mode only # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU @@ -589,7 +587,6 @@ class llama_context_params(ctypes.Structure): cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval type_k (int): data type for K cache type_v (int): data type for V cache - mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true) logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) embedding (bool): embedding mode only offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU @@ -615,7 +612,6 @@ class llama_context_params(ctypes.Structure): ("cb_eval_user_data", ctypes.c_void_p), ("type_k", ctypes.c_int), ("type_v", ctypes.c_int), - ("mul_mat_q", ctypes.c_bool), ("logits_all", ctypes.c_bool), ("embedding", ctypes.c_bool), ("offload_kqv", ctypes.c_bool), @@ -1519,11 +1515,11 @@ def llama_copy_state_data( ... -# Set the state reading from the specified address -# Returns the number of bytes read +# // Set the state reading from the specified address +# // Returns the number of bytes read # LLAMA_API size_t llama_set_state_data( # struct llama_context * ctx, -# uint8_t * src); +# const uint8_t * src); @ctypes_function( "llama_set_state_data", [llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8)], diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 08c5ee8..c2224f0 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 08c5ee87e4cceb603ecceac90734fcdade57311b +Subproject commit c2224f003bf9cf558b1a3c57033563e11a4de9a5 From 97aa3a153debe25df874055a6f96db0ac943091c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 1 Mar 2024 13:10:25 -0500 Subject: [PATCH 03/15] docs: Add information re: auto chat formats. Closes #1236 --- README.md | 11 ++++++++++- llama_cpp/llama.py | 4 +++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 32c624f..1d296e9 100644 --- a/README.md +++ b/README.md @@ -286,7 +286,16 @@ By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest The high-level API also provides a simple interface for chat completion. -Note that `chat_format` option must be set for the particular model you are using. +Chat completion requires that the model know how to format the messages into a single prompt. +The `Llama` class does this using pre-registered chat formats (ie. `chatml`, `llama-2`, `gemma`, etc) or by providing a custom chat handler object. + +The model will will format the messages into a single prompt using the following order of precedence: + - Use the `chat_handler` if provided + - Use the `chat_format` if provided + - Use the `tokenizer.chat_template` from the `gguf` model's metadata (should work for most new models, older models may not have this) + - else, fallback to the `llama-2` chat format + +Set `verbose=True` to see the selected chat format. 
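For example, a minimal sketch of both paths (the model path below is a placeholder): let the chat format be auto-detected from the model metadata, or name a pre-registered format explicitly.

```python
from llama_cpp import Llama

# Auto-detect: with verbose=True the selected chat format is printed to stderr.
llm = Llama(model_path="path/to/model.gguf", verbose=True)

# Explicit override: force one of the pre-registered formats instead.
llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
```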
```python >>> from llama_cpp import Llama diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 70498f3..108a4cf 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -410,7 +410,7 @@ class Llama: bos_token = self._model.token_get_text(bos_token_id) if self.verbose: - print(f"Using chat template: {template}", file=sys.stderr) + print(f"Using gguf chat template: {template}", file=sys.stderr) print(f"Using chat eos_token: {eos_token}", file=sys.stderr) print(f"Using chat bos_token: {bos_token}", file=sys.stderr) @@ -420,6 +420,8 @@ class Llama: if self.chat_format is None and self.chat_handler is None: self.chat_format = "llama-2" + if self.verbose: + print(f"Using fallback chat format: {chat_format}", file=sys.stderr) @property def ctx(self) -> llama_cpp.llama_context_p: From d5df431278433b580e52222dbf4174f5102585b1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 1 Mar 2024 13:15:16 -0500 Subject: [PATCH 04/15] chore: Bump version --- CHANGELOG.md | 5 +++++ llama_cpp/__init__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d7a96a9..375c6ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.54] + +- feat: Update llama.cpp to ggerganov/llama.cpp@cb49e0f8c906e5da49e9f6d64a57742a9a241c6a +- docs: fix typo in README.md embeddings example by @iamlemec in #1232 + ## [0.2.53] - feat: Update llama.cpp to ggerganov/llama.cpp@cb49e0f8c906e5da49e9f6d64a57742a9a241c6a diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index aa0536c..a9a8222 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.53" \ No newline at end of file +__version__ = "0.2.54" \ No newline at end of file From 0e70984fb69d621c191913bf870b7d9201bcc3d5 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 2 Mar 2024 22:20:04 -0500 Subject: [PATCH 05/15] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 36 ++++++++++++++++++++++++++++++++++-- vendor/llama.cpp | 2 +- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 1593256..88ba41c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -148,6 +148,12 @@ ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE( ctypes.c_bool, ctypes.c_void_p, ctypes.c_bool, ctypes.c_void_p ) +# // Abort callback +# // If not NULL, called before ggml computation +# // If it returns true, the computation is aborted +# typedef bool (*ggml_abort_callback)(void * data); +ggml_abort_callback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p) + # llama.h bindings _lib.llama_max_devices.argtypes = [] @@ -560,10 +566,16 @@ class llama_model_params(ctypes.Structure): # enum ggml_type type_v; // data type for V cache # // Keep the booleans together to avoid misalignment during copy-by-value. 
-# bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) +# bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) # bool embedding; // embedding mode only # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU # bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) + +# // Abort callback +# // if it returns true, execution of llama_decode() will be aborted +# // currently works only with CPU execution +# ggml_abort_callback abort_callback; +# void * abort_callback_data; # }; class llama_context_params(ctypes.Structure): """Parameters for llama_context @@ -591,6 +603,8 @@ class llama_context_params(ctypes.Structure): embedding (bool): embedding mode only offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) + abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted + abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback """ _fields_ = [ @@ -616,6 +630,8 @@ class llama_context_params(ctypes.Structure): ("embedding", ctypes.c_bool), ("offload_kqv", ctypes.c_bool), ("do_pooling", ctypes.c_bool), + ("abort_callback", ggml_abort_callback), + ("abort_callback_data", ctypes.c_void_p), ] @@ -1703,8 +1719,24 @@ def llama_set_n_threads( """ ... +# // Set abort callback +# LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data); +@ctypes_function( + "llama_set_abort_callback", + [llama_context_p_ctypes, ggml_abort_callback, ctypes.c_void_p], + None, +) +def llama_set_abort_callback( + ctx: llama_context_p, + abort_callback: Callable[[ctypes.c_void_p], None], + abort_callback_data: ctypes.c_void_p, + /, +): + """Set abort callback""" + ... -# // Token logits obtained from the last call to llama_eval() + +# // Token logits obtained from the last call to llama_decode() # // The logits for the last token are stored in the last row # // Logits for which llama_batch.logits[i] == 0 are undefined # // Rows: n_tokens provided with llama_batch diff --git a/vendor/llama.cpp b/vendor/llama.cpp index c2224f0..9731134 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit c2224f003bf9cf558b1a3c57033563e11a4de9a5 +Subproject commit 9731134296af3a6839cd682e51d9c2109a871de5 From 663659f7301963e0a3e98662e14668a6632c6295 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Sun, 3 Mar 2024 04:20:41 +0100 Subject: [PATCH 06/15] docs: fix small typo in README: 'model know how' -> 'model knows how' (#1244) Co-authored-by: Andrei --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1d296e9..3323f38 100644 --- a/README.md +++ b/README.md @@ -286,7 +286,7 @@ By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest The high-level API also provides a simple interface for chat completion. -Chat completion requires that the model know how to format the messages into a single prompt. +Chat completion requires that the model knows how to format the messages into a single prompt. The `Llama` class does this using pre-registered chat formats (ie. 
`chatml`, `llama-2`, `gemma`, etc) or by providing a custom chat handler object. The model will will format the messages into a single prompt using the following order of precedence: From 13177aae0f674100f7a7d23c54fc9f14012bf6a2 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 2 Mar 2024 22:46:40 -0500 Subject: [PATCH 07/15] chore: Bump version --- CHANGELOG.md | 5 +++++ llama_cpp/__init__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 375c6ef..e16a6df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.55] + +- feat: Update llama.cpp to ggerganov/9731134296af3a6839cd682e51d9c2109a871de5 +- docs: fix small typo in README: 'model know how' -> 'model knows how' by @boegel in #1244 + ## [0.2.54] - feat: Update llama.cpp to ggerganov/llama.cpp@cb49e0f8c906e5da49e9f6d64a57742a9a241c6a diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index a9a8222..519ab51 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.54" \ No newline at end of file +__version__ = "0.2.55" \ No newline at end of file From 87a6e5797eb7b0cd63ad27c528fb950c80c84ad8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 3 Mar 2024 11:27:04 -0500 Subject: [PATCH 08/15] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 14 +++++++++----- vendor/llama.cpp | 2 +- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 88ba41c..08adfe2 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -320,10 +320,12 @@ LLAMA_ROPE_SCALING_TYPE_YARN = 2 LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN # enum llama_pooling_type { +# LLAMA_POOLING_TYPE_UNSPECIFIED = -1, # LLAMA_POOLING_TYPE_NONE = 0, # LLAMA_POOLING_TYPE_MEAN = 1, # LLAMA_POOLING_TYPE_CLS = 2, # }; +LLAMA_POOLING_TYPE_UNSPECIFIED = -1 LLAMA_POOLING_TYPE_NONE = 0 LLAMA_POOLING_TYPE_MEAN = 1 LLAMA_POOLING_TYPE_CLS = 2 @@ -547,7 +549,10 @@ class llama_model_params(ctypes.Structure): # uint32_t n_batch; // prompt processing maximum batch size # uint32_t n_threads; // number of threads to use for generation # uint32_t n_threads_batch; // number of threads to use for batch processing -# int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` + +# enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` +# enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id +# // (ignored if no pooling layer) # // ref: https://github.com/ggerganov/llama.cpp/pull/2054 # float rope_freq_base; // RoPE base frequency, 0 = from model @@ -569,7 +574,6 @@ class llama_model_params(ctypes.Structure): # bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) # bool embedding; // embedding mode only # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU -# bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) # // Abort callback # // if it returns true, execution of llama_decode() will be aborted @@ -587,6 +591,7 @@ class llama_context_params(ctypes.Structure): n_threads (int): number of threads to use for generation n_threads_batch (int): number of threads to use for 
batch processing rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type` + pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) rope_freq_base (float): RoPE base frequency, 0 = from model rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model yarn_ext_factor (float): YaRN extrapolation mix factor, negative = from model @@ -602,7 +607,6 @@ class llama_context_params(ctypes.Structure): logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) embedding (bool): embedding mode only offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU - do_pooling (bool): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback """ @@ -613,7 +617,8 @@ class llama_context_params(ctypes.Structure): ("n_batch", ctypes.c_uint32), ("n_threads", ctypes.c_uint32), ("n_threads_batch", ctypes.c_uint32), - ("rope_scaling_type", ctypes.c_int32), + ("rope_scaling_type", ctypes.c_int), + ("pooling_type", ctypes.c_int), ("rope_freq_base", ctypes.c_float), ("rope_freq_scale", ctypes.c_float), ("yarn_ext_factor", ctypes.c_float), @@ -629,7 +634,6 @@ class llama_context_params(ctypes.Structure): ("logits_all", ctypes.c_bool), ("embedding", ctypes.c_bool), ("offload_kqv", ctypes.c_bool), - ("do_pooling", ctypes.c_bool), ("abort_callback", ggml_abort_callback), ("abort_callback_data", ctypes.c_void_p), ] diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 9731134..67be2ce 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 9731134296af3a6839cd682e51d9c2109a871de5 +Subproject commit 67be2ce1015d070b3b2cd488bcb041eefb61de72 From 93dc56ace8e3de97f6f39a7071ff63aaf29d376f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 6 Mar 2024 01:32:00 -0500 Subject: [PATCH 09/15] Update llama.cpp --- llama_cpp/llama.py | 6 +++--- llama_cpp/llama_cpp.py | 34 ++++++++++++++++++++++++++-------- vendor/llama.cpp | 2 +- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 108a4cf..7187b4a 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -293,7 +293,7 @@ class Llama: self.context_params.logits_all = ( logits_all if draft_model is None else True ) # Must be set to True for speculative decoding - self.context_params.embedding = embedding + self.context_params.embeddings = embedding # TODO: Rename to embeddings self.context_params.offload_kqv = offload_kqv # Sampling Params @@ -787,7 +787,7 @@ class Llama: n_embd = self.n_embd() n_batch = self.n_batch - if self.context_params.embedding == False: + if self.context_params.embeddings == False: raise RuntimeError( "Llama model must be created with embedding=True to call this method" ) @@ -1725,7 +1725,7 @@ class Llama: yarn_beta_slow=self.context_params.yarn_beta_slow, yarn_orig_ctx=self.context_params.yarn_orig_ctx, logits_all=self.context_params.logits_all, - embedding=self.context_params.embedding, + embedding=self.context_params.embeddings, # Sampling Params last_n_tokens_size=self.last_n_tokens_size, # LoRA Params diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 08adfe2..92b9676 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -399,7 +399,7 @@ 
llama_progress_callback = ctypes.CFUNCTYPE( # // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) # // - pos : the positions of the respective token in the sequence # // - seq_id : the sequence to which the respective token belongs -# // - logits : if zero, the logits for the respective token will not be output +# // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output # // # typedef struct llama_batch { # int32_t n_tokens; @@ -409,7 +409,7 @@ llama_progress_callback = ctypes.CFUNCTYPE( # llama_pos * pos; # int32_t * n_seq_id; # llama_seq_id ** seq_id; -# int8_t * logits; +# int8_t * logits; // TODO: rename this to "output" # // NOTE: helpers for smooth API transition - can be deprecated in the future @@ -572,7 +572,7 @@ class llama_model_params(ctypes.Structure): # // Keep the booleans together to avoid misalignment during copy-by-value. # bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) -# bool embedding; // embedding mode only +# bool embeddings; // if true, extract embeddings (together with logits) # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU # // Abort callback @@ -605,7 +605,7 @@ class llama_context_params(ctypes.Structure): type_k (int): data type for K cache type_v (int): data type for V cache logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) - embedding (bool): embedding mode only + embeddings (bool): if true, extract embeddings (together with logits) offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback @@ -632,7 +632,7 @@ class llama_context_params(ctypes.Structure): ("type_k", ctypes.c_int), ("type_v", ctypes.c_int), ("logits_all", ctypes.c_bool), - ("embedding", ctypes.c_bool), + ("embeddings", ctypes.c_bool), ("offload_kqv", ctypes.c_bool), ("abort_callback", ggml_abort_callback), ("abort_callback_data", ctypes.c_void_p), @@ -1774,8 +1774,8 @@ def llama_get_logits_ith( ... -# Get the embeddings for the input -# shape: [n_embd] (1-dimensional) +# // Get all output token embeddings +# // shape: [n_tokens*n_embd] (1-dimensional) # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); @ctypes_function( "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) @@ -1786,8 +1786,9 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float] ... -# // Get the embeddings for the ith sequence +# // Get the embeddings for the ith token # // llama_get_embeddings(ctx) + i*n_embd +# // shape: [n_embd] (1-dimensional) # LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); @ctypes_function( "llama_get_embeddings_ith", @@ -1802,6 +1803,23 @@ def llama_get_embeddings_ith( ... 
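# NOTE (illustrative sketch, not part of the upstream header): given a loaded
# context `ctx`, a token index `i`, and the model's embedding size `n_embd`
# (all assumed here), the float pointer returned by these getters can be
# sliced into a plain Python list:
#
#     vec = llama_get_embeddings_ith(ctx, i)[:n_embd]
#
# which copies the n_embd floats for token i out of the output buffer.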
+# // Get the embeddings for a sequence id +# // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE +# // shape: [n_embd] (1-dimensional) +# LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id); +@ctypes_function( + "llama_get_embeddings_seq", + [llama_context_p_ctypes, llama_seq_id], + ctypes.POINTER(ctypes.c_float), +) +def llama_get_embeddings_seq( + ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / +) -> CtypesArray[ctypes.c_float]: + """Get the embeddings for a sequence id + Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE + shape: [n_embd] (1-dimensional)""" + ... + # // # // Vocab # // diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 67be2ce..8ced9f7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 67be2ce1015d070b3b2cd488bcb041eefb61de72 +Subproject commit 8ced9f7e3225adb8501e9821ed1bbd92e3a5c7ae From 40c6b54f6880e1cbb8f6393d9097328ffd422e13 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 8 Mar 2024 20:58:50 -0500 Subject: [PATCH 10/15] feat: Update llama.cpp --- llama_cpp/llama_cpp.py | 16 ++++++++++++---- vendor/llama.cpp | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 92b9676..0176e49 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -429,10 +429,12 @@ class llama_batch(ctypes.Structure): The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens Attributes: + n_tokens (int): number of tokens token (ctypes.Array[llama_token]): the token ids of the input (used when embd is NULL) embd (ctypes.Array[ctypes.ctypes.c_float]): token embeddings (i.e. float vector of size n_embd) (used when token is NULL) pos (ctypes.Array[ctypes.Array[llama_pos]]): the positions of the respective token in the sequence seq_id (ctypes.Array[ctypes.Array[llama_seq_id]]): the sequence to which the respective token belongs + logits (ctypes.Array[ctypes.ctypes.c_int8]): if zero, the logits for the respective token will not be output """ _fields_ = [ @@ -547,6 +549,7 @@ class llama_model_params(ctypes.Structure): # uint32_t seed; // RNG seed, -1 for random # uint32_t n_ctx; // text context, 0 = from model # uint32_t n_batch; // prompt processing maximum batch size +# uint32_t n_parallel; // number of parallel sequences (i.e. distinct states for recurrent models) # uint32_t n_threads; // number of threads to use for generation # uint32_t n_threads_batch; // number of threads to use for batch processing @@ -588,6 +591,7 @@ class llama_context_params(ctypes.Structure): seed (int): RNG seed, -1 for random n_ctx (int): text context, 0 = from model n_batch (int): prompt processing maximum batch size + n_parallel (int): number of parallel sequences (i.e. 
distinct states for recurrent models) n_threads (int): number of threads to use for generation n_threads_batch (int): number of threads to use for batch processing rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type` @@ -615,6 +619,7 @@ class llama_context_params(ctypes.Structure): ("seed", ctypes.c_uint32), ("n_ctx", ctypes.c_uint32), ("n_batch", ctypes.c_uint32), + ("n_parallel", ctypes.c_uint32), ("n_threads", ctypes.c_uint32), ("n_threads_batch", ctypes.c_uint32), ("rope_scaling_type", ctypes.c_int), @@ -1322,7 +1327,7 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): # // seq_id < 0 : match any sequence # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_rm( +# LLAMA_API bool llama_kv_cache_seq_rm( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, @@ -1335,7 +1340,7 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): llama_pos, llama_pos, ], - None, + ctypes.c_bool, ) def llama_kv_cache_seq_rm( ctx: llama_context_p, @@ -1343,7 +1348,7 @@ def llama_kv_cache_seq_rm( p0: Union[llama_pos, int], p1: Union[llama_pos, int], /, -): +) -> bool: """Removes all tokens that belong to the specified sequence and have positions in [p0, p1) seq_id < 0 : match any sequence p0 < 0 : [0, p1] @@ -1754,7 +1759,10 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: The logits for the last token are stored in the last row Logits for which llama_batch.logits[i] == 0 are undefined Rows: n_tokens provided with llama_batch - Cols: n_vocab""" + Cols: n_vocab + + Returns: + Pointer to the logits buffer of shape (n_tokens, n_vocab)""" ... diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 8ced9f7..c2101a2 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 8ced9f7e3225adb8501e9821ed1bbd92e3a5c7ae +Subproject commit c2101a2e909ac7c08976d414e64e96c90ee5fa9e From 2811014bae356401856a9c0796f42e719f2e8c3c Mon Sep 17 00:00:00 2001 From: Douglas Hanley Date: Fri, 8 Mar 2024 19:59:35 -0600 Subject: [PATCH 11/15] feat: Switch embed to llama_get_embeddings_seq (#1263) * switch to llama_get_embeddings_seq * Remove duplicate definition of llama_get_embeddings_seq Co-authored-by: Andrei --------- Co-authored-by: Andrei --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7187b4a..aabbb7e 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -814,7 +814,7 @@ class Llama: # store embeddings for i in range(n_seq): - embedding: List[float] = llama_cpp.llama_get_embeddings_ith( + embedding: List[float] = llama_cpp.llama_get_embeddings_seq( self._ctx.ctx, i )[:n_embd] if normalize: From 1f3156d4f2d8c5439dbb2ad72b8c1de84703eb09 Mon Sep 17 00:00:00 2001 From: Kevin Cao Date: Fri, 8 Mar 2024 21:00:10 -0500 Subject: [PATCH 12/15] fix: Check for existence of clip model path (#1264) --- llama_cpp/llama_chat_format.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 69ed601..4eb2b02 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1848,6 +1848,9 @@ class Llava15ChatHandler: self.verbose = verbose self._clip_free = self._llava_cpp._libllava.clip_free # type: ignore + if not os.path.exists(clip_model_path): + raise ValueError(f"Clip model path does not exist: {clip_model_path}") + with suppress_stdout_stderr(disable=self.verbose): self.clip_ctx = self._llava_cpp.clip_model_load( 
self.clip_model_path.encode(), 0 From c139f8b5d50f6f416a24c0ba65983a3fb84bf2f3 Mon Sep 17 00:00:00 2001 From: Felipe Lorenz Date: Fri, 8 Mar 2024 21:09:00 -0500 Subject: [PATCH 13/15] feat: Add endpoints for tokenize, detokenize and count tokens (#1136) * Add endpoint to count tokens * Add tokenize and detokenize endpoints * Change response key to tokens for tokenize endpoint * Fix dependency bug * Cleanup * Remove example added by mistake * Move tokenize, detokenize, and count to Extras namespace. Tag existing endpoints --------- Co-authored-by: Andrei Betlen --- llama_cpp/server/app.py | 71 +++++++++++++++++++++++++++++++++++++-- llama_cpp/server/types.py | 36 ++++++++++++++++++++ 2 files changed, 105 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ec92809..aa6afc1 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -41,6 +41,11 @@ from llama_cpp.server.types import ( CreateEmbeddingRequest, CreateChatCompletionRequest, ModelList, + TokenizeInputRequest, + TokenizeInputResponse, + TokenizeInputCountResponse, + DetokenizeInputRequest, + DetokenizeInputResponse, ) from llama_cpp.server.errors import RouteErrorHandler @@ -196,6 +201,9 @@ async def authenticate( ) +openai_v1_tag = "OpenAI V1" + + @router.post( "/v1/completions", summary="Completion", @@ -227,11 +235,13 @@ async def authenticate( }, } }, + tags=[openai_v1_tag], ) @router.post( "/v1/engines/copilot-codex/completions", include_in_schema=False, dependencies=[Depends(authenticate)], + tags=[openai_v1_tag], ) async def create_completion( request: Request, @@ -297,7 +307,10 @@ async def create_completion( @router.post( - "/v1/embeddings", summary="Embedding", dependencies=[Depends(authenticate)] + "/v1/embeddings", + summary="Embedding", + dependencies=[Depends(authenticate)], + tags=[openai_v1_tag], ) async def create_embedding( request: CreateEmbeddingRequest, @@ -339,6 +352,7 @@ async def create_embedding( }, } }, + tags=[openai_v1_tag], ) async def create_chat_completion( request: Request, @@ -391,7 +405,12 @@ async def create_chat_completion( return iterator_or_completion -@router.get("/v1/models", summary="Models", dependencies=[Depends(authenticate)]) +@router.get( + "/v1/models", + summary="Models", + dependencies=[Depends(authenticate)], + tags=[openai_v1_tag], +) async def get_models( llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> ModelList: @@ -407,3 +426,51 @@ async def get_models( for model_alias in llama_proxy ], } + + +extras_tag = "Extras" + + +@router.post( + "/extras/tokenize", + summary="Tokenize", + dependencies=[Depends(authenticate)], + tags=[extras_tag], +) +async def tokenize( + body: TokenizeInputRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> TokenizeInputResponse: + tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) + + return {"tokens": tokens} + + +@router.post( + "/extras/tokenize/count", + summary="Tokenize Count", + dependencies=[Depends(authenticate)], + tags=[extras_tag], +) +async def count_query_tokens( + body: TokenizeInputRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> TokenizeInputCountResponse: + tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) + + return {"count": len(tokens)} + + +@router.post( + "/extras/detokenize", + summary="Detokenize", + dependencies=[Depends(authenticate)], + tags=[extras_tag], +) +async def detokenize( + body: DetokenizeInputRequest, + llama_proxy: LlamaProxy = 
Depends(get_llama_proxy), +) -> DetokenizeInputResponse: + text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8") + + return {"text": text} diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index 9a4b81e..c8b2ebc 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -264,3 +264,39 @@ class ModelData(TypedDict): class ModelList(TypedDict): object: Literal["list"] data: List[ModelData] + + +class TokenizeInputRequest(BaseModel): + model: Optional[str] = model_field + input: Optional[str] = Field(description="The input to tokenize.") + + model_config = { + "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]} + } + + +class TokenizeInputResponse(BaseModel): + tokens: List[int] = Field(description="A list of tokens.") + + model_config = {"json_schema_extra": {"example": {"tokens": [123, 321, 222]}}} + + +class TokenizeInputCountResponse(BaseModel): + count: int = Field(description="The number of tokens in the input.") + + model_config = {"json_schema_extra": {"example": {"count": 5}}} + + +class DetokenizeInputRequest(BaseModel): + model: Optional[str] = model_field + tokens: List[int] = Field(description="A list of toekns to detokenize.") + + model_config = {"json_schema_extra": {"example": [{"tokens": [123, 321, 222]}]}} + + +class DetokenizeInputResponse(BaseModel): + text: str = Field(description="The detokenized text.") + + model_config = { + "json_schema_extra": {"example": {"text": "How many tokens in this query?"}} + } From d02a9cf16ff88ad011e2eb1ce29f4d9400f13cd1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 8 Mar 2024 21:10:53 -0500 Subject: [PATCH 14/15] Fixed json strings grammar by blacklisting character control set. Closes #1259 --- llama_cpp/llama_grammar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_grammar.py b/llama_cpp/llama_grammar.py index 6a37857..9cc48a9 100644 --- a/llama_cpp/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -1337,7 +1337,7 @@ array ::= string ::= "\"" ( - [^"\\] | + [^"\\\x7F\x00-\x1F] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes )* "\"" ws @@ -1366,7 +1366,7 @@ array ::= string ::= "\"" ( - [^"\\] | + [^"\\\x7F\x00-\x1F] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes )* "\"" ws From a7281994d87927e42d8e636295c786057e98d8fe Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 8 Mar 2024 21:14:44 -0500 Subject: [PATCH 15/15] chore: Bump version --- CHANGELOG.md | 8 ++++++++ llama_cpp/__init__.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e16a6df..90dd1e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.56] + +- feat: Update llama.cpp to ggerganov/llama.cpp@c2101a2e909ac7c08976d414e64e96c90ee5fa9e +- feat(server): Add endpoints for tokenize, detokenize and count tokens by @felipelo in #1136 +- feat: Switch embed to llama_get_embeddings_seq by @iamlemec in #1263 +- fix: Fixed json strings grammar by blacklisting character control set by @ExtReMLapin in d02a9cf16ff88ad011e2eb1ce29f4d9400f13cd1 +- fix: Check for existence of clip model path by @kejcao in #1264 + ## [0.2.55] - feat: Update llama.cpp to ggerganov/9731134296af3a6839cd682e51d9c2109a871de5 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 519ab51..fcbc715 100644 --- a/llama_cpp/__init__.py +++ 
b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.2.55" \ No newline at end of file +__version__ = "0.2.56" \ No newline at end of file
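A brief usage sketch for the `/extras` endpoints introduced in PATCH 13/15. Everything here is assumed for illustration and is not part of the patches above: a llama-cpp-python server already running at `http://localhost:8000` with no API key configured, the default model alias, and the third-party `requests` package.

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed server address

# Tokenize a string into token ids.
r = requests.post(f"{BASE_URL}/extras/tokenize", json={"input": "Hello, world!"})
tokens = r.json()["tokens"]

# Count tokens without returning them.
r = requests.post(f"{BASE_URL}/extras/tokenize/count", json={"input": "Hello, world!"})
count = r.json()["count"]

# Round-trip the token ids back into text.
r = requests.post(f"{BASE_URL}/extras/detokenize", json={"tokens": tokens})
text = r.json()["text"]

print(count, text)
```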