Compare commits
No commits in common. "4cb67f59d8f23e60a46386a9a6d302a28cf6b2a3" and "dc23d159183c636565104b39aec9c0567864f216" have entirely different histories.
4cb67f59d8...dc23d15918
4 changed files with 122 additions and 147 deletions
@@ -79,7 +79,6 @@ class Llama:
n_threads: Optional[int] = None,
n_threads_batch: Optional[int] = None,
rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
rope_freq_base: float = 0.0,
rope_freq_scale: float = 0.0,
yarn_ext_factor: float = -1.0,

@@ -152,7 +151,6 @@ class Llama:
n_threads: Number of threads to use for generation
n_threads_batch: Number of threads to use for batch processing
rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
pooling_type: Pooling type, from `enum llama_pooling_type`.
rope_freq_base: RoPE base frequency, 0 = from model
rope_freq_scale: RoPE frequency scaling factor, 0 = from model
yarn_ext_factor: YaRN extrapolation mix factor, negative = from model

@@ -273,7 +271,6 @@ class Llama:
if rope_scaling_type is not None
else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
)
self.context_params.pooling_type = pooling_type
self.context_params.rope_freq_base = (
rope_freq_base if rope_freq_base != 0.0 else 0
)

@@ -817,12 +814,9 @@ class Llama:
# store embeddings
for i in range(n_seq):
ptr = llama_cpp.llama_get_embeddings_seq(
embedding: List[float] = llama_cpp.llama_get_embeddings_seq(
self._ctx.ctx, i
)
if not ptr:
raise RuntimeError("Failed to get embeddings from sequence pooling type is not set")
embedding: List[float] = ptr[:n_embd]
)[:n_embd]
if normalize:
norm = float(np.linalg.norm(embedding))
embedding = [v / norm for v in embedding]
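
For orientation, here is a minimal, self-contained sketch of the sequence-embedding retrieval with the NULL-pointer guard and optional L2 normalization seen in the hunk above; the helper name collect_seq_embeddings and the ctx/n_seq/n_embd arguments are assumptions for illustration, not code from either commit.

import numpy as np
import llama_cpp

def collect_seq_embeddings(ctx, n_seq: int, n_embd: int, normalize: bool = True):
    """Gather one pooled embedding per sequence from a llama.cpp context."""
    embeddings = []
    for i in range(n_seq):
        # Returns a ctypes float pointer, or NULL if no pooled embedding exists
        ptr = llama_cpp.llama_get_embeddings_seq(ctx, i)
        if not ptr:
            raise RuntimeError("Failed to get embeddings from sequence: pooling type is not set")
        embedding = ptr[:n_embd]  # copy the first n_embd floats out of the C buffer
        if normalize:
            norm = float(np.linalg.norm(embedding))
            embedding = [v / norm for v in embedding]
        embeddings.append(embedding)
    return embeddings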
@@ -339,7 +339,16 @@ def chat_formatter_to_chat_completion_handler(
stop = stop + rstop

if response_format is not None and response_format["type"] == "json_object":
grammar = _grammar_for_response_format(response_format, verbose=llama.verbose)
try:
# create grammar from json schema
if "schema" in response_format:
grammar = llama_grammar.LlamaGrammar.from_json_schema(
json.dumps(response_format["schema"]), verbose=llama.verbose
)
except Exception as e:
grammar = llama_grammar.LlamaGrammar.from_string(
llama_grammar.JSON_GBNF, verbose=llama.verbose
)

completion_or_chunks = llama.create_completion(
prompt=prompt,

@@ -597,35 +606,6 @@ def _format_chatglm3(
ret += role
return ret

def _grammar_for_json(verbose:bool=False):
return llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF, verbose=verbose)

def _grammar_for_json_schema(
schema: str,
verbose: bool = False,
fallback_to_json: bool = True
):
try:
return llama_grammar.LlamaGrammar.from_json_schema(schema, verbose=verbose)
except Exception as e:
if fallback_to_json:
return _grammar_for_json(verbose=verbose)
else:
raise e

def _grammar_for_response_format(
response_format: llama_types.ChatCompletionRequestResponseFormat,
verbose: bool = False
):
if response_format["type"] != "json_object":
return None

if "schema" in response_format:
return _grammar_for_json_schema(
json.dumps(response_format["schema"]), verbose=verbose
)
else:
return _grammar_for_json(verbose=verbose)

### Chat Formats ###
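
As a rough usage sketch of the consolidated helper defined above: a handler only needs to pass the request's response_format through, and the helper decides between a schema-specific grammar and the generic JSON grammar. The literal response_format dict below is illustrative, not taken from either commit.

response_format = {
    "type": "json_object",
    "schema": {"type": "object", "properties": {"answer": {"type": "string"}}},
}

grammar = None
if response_format is not None and response_format["type"] == "json_object":
    # Returns a schema-constrained grammar when "schema" is present,
    # otherwise (or if the schema fails to parse) the generic JSON grammar.
    grammar = _grammar_for_response_format(response_format, verbose=True)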
@@ -2014,7 +1994,16 @@ class Llava15ChatHandler:
prompt = llama.input_ids[: llama.n_tokens].tolist()

if response_format is not None and response_format["type"] == "json_object":
grammar = _grammar_for_response_format(response_format)
try:
# create grammar from json schema
if "schema" in response_format:
grammar = llama_grammar.LlamaGrammar.from_json_schema(
json.dumps(response_format["schema"])
)
except Exception as e:
grammar = llama_grammar.LlamaGrammar.from_string(
llama_grammar.JSON_GBNF
)

return _convert_completion_to_chat(
llama.create_completion(

@@ -2170,10 +2159,26 @@ def chatml_function_calling(
tool_calls=None,
add_generation_prompt=True,
)

if response_format is not None and response_format["type"] == "json_object":
grammar = _grammar_for_response_format(response_format)

try:
grammar = (
llama_grammar.LlamaGrammar.from_json_schema(
json.dumps(response_format["schema"])
)
if "schema" in response_format
else None
)
except Exception as e:
if llama.verbose:
print(
"Failed to parse response format as JSON schema, falling back to default grammar"
)
print(e)
grammar = (
llama_grammar.LlamaGrammar.from_string(llama_grammar.JSON_GBNF)
if grammar is None
else grammar
)
return _convert_completion_to_chat(
llama.create_completion(
prompt=prompt,
@@ -198,15 +198,13 @@ llama_seq_id = ctypes.c_int32

# enum llama_vocab_type {
# LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
# LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece
# LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding
# LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece
# LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
# LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
# LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
# };
LLAMA_VOCAB_TYPE_NONE = 0
LLAMA_VOCAB_TYPE_SPM = 1
LLAMA_VOCAB_TYPE_BPE = 2
LLAMA_VOCAB_TYPE_WPM = 3
LLAMA_VOCAB_TYPE_SPM = 0
LLAMA_VOCAB_TYPE_BPE = 1
LLAMA_VOCAB_TYPE_WPM = 2

# // note: these values should be synchronized with ggml_rope
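
Because the numeric values of these constants differ between the two revisions (LLAMA_VOCAB_TYPE_NONE exists on only one side), callers should compare against the named constants rather than integer literals. A small sketch, assuming model is an already-loaded llama_model_p:

vocab_type = llama_cpp.llama_vocab_type(model)
if vocab_type == llama_cpp.LLAMA_VOCAB_TYPE_BPE:
    print("model uses a byte-pair-encoding vocabulary")
elif vocab_type == llama_cpp.LLAMA_VOCAB_TYPE_SPM:
    print("model uses a SentencePiece vocabulary")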
@@ -550,9 +548,8 @@ class llama_model_params(ctypes.Structure):
# struct llama_context_params {
# uint32_t seed; // RNG seed, -1 for random
# uint32_t n_ctx; // text context, 0 = from model
# uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
# uint32_t n_ubatch; // physical maximum batch size
# uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
# uint32_t n_batch; // prompt processing maximum batch size
# uint32_t n_parallel; // number of parallel sequences (i.e. distinct states for recurrent models)
# uint32_t n_threads; // number of threads to use for generation
# uint32_t n_threads_batch; // number of threads to use for batch processing

@@ -593,9 +590,8 @@ class llama_context_params(ctypes.Structure):
Attributes:
seed (int): RNG seed, -1 for random
n_ctx (int): text context, 0 = from model
n_batch (int): logical maximum batch size that can be submitted to llama_decode
n_ubatch (int): physical maximum batch size
n_seq_max (int): max number of sequences (i.e. distinct states for recurrent models)
n_batch (int): prompt processing maximum batch size
n_parallel (int): number of parallel sequences (i.e. distinct states for recurrent models)
n_threads (int): number of threads to use for generation
n_threads_batch (int): number of threads to use for batch processing
rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`

@@ -623,8 +619,7 @@ class llama_context_params(ctypes.Structure):
("seed", ctypes.c_uint32),
("n_ctx", ctypes.c_uint32),
("n_batch", ctypes.c_uint32),
("n_ubatch", ctypes.c_uint32),
("n_seq_max", ctypes.c_uint32),
("n_parallel", ctypes.c_uint32),
("n_threads", ctypes.c_uint32),
("n_threads_batch", ctypes.c_uint32),
("rope_scaling_type", ctypes.c_int),
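
A brief sketch of how these llama_context_params fields are typically populated from Python; it assumes llama_context_default_params() is exposed by the bindings (as it is in llama.cpp itself), and the field values are illustrative only.

params = llama_cpp.llama_context_default_params()
params.seed = 1234             # RNG seed, -1 for random
params.n_ctx = 2048            # text context, 0 = from model
params.n_batch = 512           # prompt-processing / logical batch size
params.n_threads = 8           # threads used for generation
params.n_threads_batch = 8     # threads used for batch processing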
@@ -672,7 +667,7 @@ It might not exist for progress report where '.' is output repeatedly."""
# bool allow_requantize; // allow quantizing non-f32/f16 tensors
# bool quantize_output_tensor; // quantize output.weight
# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
# bool pure; // quantize all tensors to the default type
# bool pure; // disable k-quant mixtures and quantize all tensors to the same type
# void * imatrix; // pointer to importance matrix data
# } llama_model_quantize_params;
class llama_model_quantize_params(ctypes.Structure):

@@ -684,7 +679,7 @@ class llama_model_quantize_params(ctypes.Structure):
allow_requantize (bool): allow quantizing non-f32/f16 tensors
quantize_output_tensor (bool): quantize output.weight
only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
pure (bool): quantize all tensors to the default type
pure (bool): disable k-quant mixtures and quantize all tensors to the same type
imatrix (ctypes.ctypes.c_void_p): pointer to importance matrix data
"""
@@ -865,7 +860,8 @@ GGML_NUMA_STRATEGY_COUNT = 5
[ctypes.c_int],
None,
)
def llama_numa_init(numa: int, /): ...
def llama_numa_init(numa: int, /):
...

# // Call once at the end of the program - currently only used for MPI

@@ -890,7 +886,8 @@ def llama_backend_free():
)
def llama_load_model_from_file(
path_model: bytes, params: llama_model_params, /
) -> Optional[llama_model_p]: ...
) -> Optional[llama_model_p]:
...

# LLAMA_API void llama_free_model(struct llama_model * model);

@@ -899,7 +896,8 @@ def llama_load_model_from_file(
[llama_model_p_ctypes],
None,
)
def llama_free_model(model: llama_model_p, /): ...
def llama_free_model(model: llama_model_p, /):
...

# LLAMA_API struct llama_context * llama_new_context_with_model(

@@ -912,7 +910,8 @@ def llama_free_model(model: llama_model_p, /): ...
)
def llama_new_context_with_model(
model: llama_model_p, params: llama_context_params, /
) -> Optional[llama_context_p]: ...
) -> Optional[llama_context_p]:
...

# // Frees all allocated memory

@@ -933,77 +932,80 @@ def llama_free(ctx: llama_context_p, /):
[],
ctypes.c_int64,
)
def llama_time_us() -> int: ...
def llama_time_us() -> int:
...

# LLAMA_API size_t llama_max_devices(void);
@ctypes_function("llama_max_devices", [], ctypes.c_size_t)
def llama_max_devices() -> int: ...
def llama_max_devices() -> int:
...

# LLAMA_API bool llama_supports_mmap (void);
@ctypes_function("llama_supports_mmap", [], ctypes.c_bool)
def llama_supports_mmap() -> bool: ...
def llama_supports_mmap() -> bool:
...

# LLAMA_API bool llama_supports_mlock (void);
@ctypes_function("llama_supports_mlock", [], ctypes.c_bool)
def llama_supports_mlock() -> bool: ...
def llama_supports_mlock() -> bool:
...

# LLAMA_API bool llama_supports_gpu_offload(void);
@ctypes_function("llama_supports_gpu_offload", [], ctypes.c_bool)
def llama_supports_gpu_offload() -> bool: ...
def llama_supports_gpu_offload() -> bool:
...

# LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
@ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes)
def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ...
def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]:
...

# LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
@ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32)
def llama_n_ctx(ctx: llama_context_p, /) -> int: ...
def llama_n_ctx(ctx: llama_context_p, /) -> int:
...

# LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
@ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32)
def llama_n_batch(ctx: llama_context_p, /) -> int: ...

# LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
@ctypes_function("llama_n_ubatch", [llama_context_p_ctypes], ctypes.c_uint32)
def llama_n_ubatch(ctx: llama_context_p, /) -> int: ...

# LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
@ctypes_function("llama_n_seq_max", [llama_context_p_ctypes], ctypes.c_uint32)
def llama_n_seq_max(ctx: llama_context_p, /) -> int: ...
def llama_n_batch(ctx: llama_context_p, /) -> int:
...

# LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
@ctypes_function("llama_vocab_type", [llama_model_p_ctypes], ctypes.c_int)
def llama_vocab_type(model: llama_model_p, /) -> int: ...
def llama_vocab_type(model: llama_model_p, /) -> int:
...

# LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
@ctypes_function("llama_rope_type", [llama_model_p_ctypes], ctypes.c_int)
def llama_rope_type(model: llama_model_p, /) -> int: ...
def llama_rope_type(model: llama_model_p, /) -> int:
...

# LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
@ctypes_function("llama_n_vocab", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_vocab(model: llama_model_p, /) -> int: ...
def llama_n_vocab(model: llama_model_p, /) -> int:
...

# LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
@ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
def llama_n_ctx_train(model: llama_model_p, /) -> int:
...

# LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
@ctypes_function("llama_n_embd", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_embd(model: llama_model_p, /) -> int: ...
def llama_n_embd(model: llama_model_p, /) -> int:
...

# // Get the model's RoPE frequency scaling factor
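
Taken together, the declarations in this region cover the usual load-and-query flow. A hedged sketch of that flow, using only functions shown above plus llama_backend_init and the default-params helpers assumed to exist in the same bindings (the model path is a placeholder):

import llama_cpp

llama_cpp.llama_backend_init()  # one-time init; older revisions took a NUMA flag here

model_params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"/path/to/model.gguf", model_params)
if model is None:
    raise RuntimeError("failed to load model")

ctx_params = llama_cpp.llama_context_default_params()
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)
if ctx is None:
    raise RuntimeError("failed to create context")

print("n_ctx:", llama_cpp.llama_n_ctx(ctx))
print("n_embd:", llama_cpp.llama_n_embd(model))
print("mmap supported:", llama_cpp.llama_supports_mmap())

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
llama_cpp.llama_backend_free()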
@@ -1190,7 +1192,8 @@ def llama_model_apply_lora_from_file(
path_base_model: Union[ctypes.c_char_p, bytes, None],
n_threads: Union[ctypes.c_int32, int],
/,
) -> int: ...
) -> int:
...

# //

@@ -1216,7 +1219,7 @@ class llama_kv_cache_view_cell(ctypes.Structure):
# // Maximum number of sequences that can exist in a cell. It's not an error
# // if there are more sequences in a cell than this value, however they will
# // not be visible in the view cells_sequences.
# int32_t n_seq_max;
# int32_t n_max_seq;

# // Number of tokens in the cache. For example, if there are two populated
# // cells, the first with 1 sequence id in it and the second with 2 sequence

@@ -1237,7 +1240,7 @@ class llama_kv_cache_view_cell(ctypes.Structure):
# struct llama_kv_cache_view_cell * cells;

# // The sequences for each cell. There will be n_seq_max items per cell.
# // The sequences for each cell. There will be n_max_seq items per cell.
# llama_seq_id * cells_sequences;
# };
class llama_kv_cache_view(ctypes.Structure):

@@ -1257,14 +1260,14 @@ llama_kv_cache_view_p = ctypes.POINTER(llama_kv_cache_view)

# // Create an empty KV cache view. (use only for debugging purposes)
# LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
# LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
@ctypes_function(
"llama_kv_cache_view_init",
[llama_context_p_ctypes, ctypes.c_int32],
llama_kv_cache_view,
)
def llama_kv_cache_view_init(
ctx: llama_context_p, n_seq_max: Union[ctypes.c_int32, int], /
ctx: llama_context_p, n_max_seq: Union[ctypes.c_int32, int], /
) -> llama_kv_cache_view:
"""Create an empty KV cache view. (use only for debugging purposes)"""
...
@@ -1579,7 +1582,8 @@ def llama_load_session_file(
n_token_capacity: Union[ctypes.c_size_t, int],
n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
/,
) -> int: ...
) -> int:
...

# LLAMA_API bool llama_save_session_file(

@@ -1603,7 +1607,8 @@ def llama_save_session_file(
tokens: CtypesArray[llama_token],
n_token_count: Union[ctypes.c_size_t, int],
/,
) -> int: ...
) -> int:
...

# //

@@ -1723,17 +1728,6 @@ def llama_set_n_threads(
"""
...

# // Set whether to use causal attention or not
# // If set to true, the model will only attend to the past tokens
# LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
@ctypes_function("llama_set_causal_attn", [llama_context_p_ctypes, ctypes.c_bool], None)
def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /):
"""Set whether to use causal attention or not
If set to true, the model will only attend to the past tokens"""
...

# // Set abort callback
# LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
@ctypes_function(

@@ -1751,18 +1745,6 @@ def llama_set_abort_callback(
...

# // Wait until all computations are finished
# // This is automatically done when using one of the functions below to obtain the computation results
# // and is not necessary to call it explicitly in most cases
# LLAMA_API void llama_synchronize(struct llama_context * ctx);
@ctypes_function("llama_synchronize", [llama_context_p_ctypes], None)
def llama_synchronize(ctx: llama_context_p, /):
"""Wait until all computations are finished
This is automatically done when using one of the functions below to obtain the computation results
and is not necessary to call it explicitly in most cases"""
...

# // Token logits obtained from the last call to llama_decode()
# // The logits for the last token are stored in the last row
# // Logits for which llama_batch.logits[i] == 0 are undefined
@@ -1846,7 +1828,6 @@ def llama_get_embeddings_seq(
shape: [n_embd] (1-dimensional)"""
...

# //
# // Vocab
# //

@@ -1858,7 +1839,8 @@ def llama_get_embeddings_seq(
)
def llama_token_get_text(
model: llama_model_p, token: Union[llama_token, int], /
) -> bytes: ...
) -> bytes:
...

# LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);

@@ -1867,7 +1849,8 @@ def llama_token_get_text(
)
def llama_token_get_score(
model: llama_model_p, token: Union[llama_token, int], /
) -> float: ...
) -> float:
...

# LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);

@@ -1876,7 +1859,8 @@ def llama_token_get_score(
)
def llama_token_get_type(
model: llama_model_p, token: Union[llama_token, int], /
) -> int: ...
) -> int:
...

# // Special tokens

@@ -1929,17 +1913,20 @@ def llama_token_prefix(model: llama_model_p) -> int:

# LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
@ctypes_function("llama_token_middle", [llama_model_p_ctypes], llama_token)
def llama_token_middle(model: llama_model_p, /) -> int: ...
def llama_token_middle(model: llama_model_p, /) -> int:
...

# LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
@ctypes_function("llama_token_suffix", [llama_model_p_ctypes], llama_token)
def llama_token_suffix(model: llama_model_p, /) -> int: ...
def llama_token_suffix(model: llama_model_p, /) -> int:
...

# LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token)
def llama_token_eot(model: llama_model_p, /) -> int: ...
def llama_token_eot(model: llama_model_p, /) -> int:
...

# //
@@ -1949,7 +1936,7 @@ def llama_token_eot(model: llama_model_p, /) -> int: ...

# /// @details Convert the provided text into tokens.
# /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
# /// @return Returns the number of tokens on success, no more than n_tokens_max
# /// @return Returns the number of tokens on success, no more than n_max_tokens
# /// @return Returns a negative number on failure - the number of tokens that would have been returned
# /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
# /// Does not insert a leading space.

@@ -1958,7 +1945,7 @@ def llama_token_eot(model: llama_model_p, /) -> int: ...
# const char * text,
# int32_t text_len,
# llama_token * tokens,
# int32_t n_tokens_max,
# int32_t n_max_tokens,
# bool add_bos,
# bool special);
@ctypes_function(

@@ -1979,26 +1966,12 @@ def llama_tokenize(
text: bytes,
text_len: Union[ctypes.c_int, int],
tokens: CtypesArray[llama_token],
n_tokens_max: Union[ctypes.c_int, int],
n_max_tokens: Union[ctypes.c_int, int],
add_bos: Union[ctypes.c_bool, bool],
special: Union[ctypes.c_bool, bool],
/,
) -> int:
"""Convert the provided text into tokens.

Args:
model: The model to use for tokenization.
text: The text to tokenize.
text_len: The length of the text.
tokens: The tokens pointer must be large enough to hold the resulting tokens.
n_max_tokens: The maximum number of tokens to return.
add_bos: Whether to add a beginning-of-sentence token.
special: Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
Does not insert a leading space.

Returns:
Returns the number of tokens on success, no more than n_tokens_max
Returns a negative number on failure - the number of tokens that would have been returned"""
"""Convert the provided text into tokens."""
...
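
A hedged sketch of driving llama_tokenize from Python with a ctypes token buffer; the buffer size, text, and the assumption that model is already loaded are illustrative, and the positional argument order follows the binding above:

import ctypes
import llama_cpp

text = b"Hello, world!"
max_tokens = 64
tokens = (llama_cpp.llama_token * max_tokens)()  # ctypes array the C call writes into

n = llama_cpp.llama_tokenize(
    model,        # llama_model_p, assumed already loaded
    text,
    len(text),
    tokens,
    max_tokens,   # n_tokens_max / n_max_tokens depending on the revision
    True,         # add_bos
    False,        # special
)
if n < 0:
    raise RuntimeError(f"token buffer too small; need {-n} entries")
token_ids = [tokens[i] for i in range(n)]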
@@ -2070,7 +2043,8 @@ def llama_chat_apply_template(
chat: CtypesArray[llama_chat_message],
n_msg: int,
/,
) -> int: ...
) -> int:
...

# //

@@ -2671,7 +2645,8 @@ def llama_beam_search(
n_past: Union[ctypes.c_int, int],
n_predict: Union[ctypes.c_int, int],
/,
): ...
):
...

# Performance information

@@ -2748,4 +2723,5 @@ def llama_log_set(
[ctypes.c_void_p, llama_context_p_ctypes],
None,
)
def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /): ...
def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /):
...
vendor/llama.cpp (vendored)

@@ -1 +1 @@
Subproject commit 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc
Subproject commit c2101a2e909ac7c08976d414e64e96c90ee5fa9e