feat: Update llama.cpp

2024-03-13 15:57:35 -04:00 · 2024-03-13 15:57:35 -04:00 · dd0ee56217
commit dd0ee56217
parent 08e910f7a7
2 changed files with 87 additions and 75 deletions
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@ -548,8 +548,9 @@ class llama_model_params(ctypes.Structure):
 # struct llama_context_params {
 #     uint32_t seed;              // RNG seed, -1 for random
 #     uint32_t n_ctx;             // text context, 0 = from model
-#     uint32_t n_batch;           // prompt processing maximum batch size
+#     uint32_t n_batch;           // logical maximum batch size that can be submitted to llama_decode
-#     uint32_t n_parallel;        // number of parallel sequences (i.e. distinct states for recurrent models)
+#     uint32_t n_ubatch;          // physical maximum batch size
 #     uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
 #     uint32_t n_threads;         // number of threads to use for generation
 #     uint32_t n_threads_batch;   // number of threads to use for batch processing
@ -578,6 +579,7 @@ class llama_model_params(ctypes.Structure):
 #     bool embeddings;  // if true, extract embeddings (together with logits)
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
 #     // Abort callback
 #     // if it returns true, execution of llama_decode() will be aborted
 #     // currently works only with CPU execution
@ -590,8 +592,9 @@ class llama_context_params(ctypes.Structure):
    Attributes:
        seed (int): RNG seed, -1 for random
        n_ctx (int): text context, 0 = from model
-        n_batch (int): prompt processing maximum batch size
+        n_batch (int): logical maximum batch size that can be submitted to llama_decode
-        n_parallel (int): number of parallel sequences (i.e. distinct states for recurrent models)
+        n_ubatch (int): physical maximum batch size
        n_seq_max (int): max number of sequences (i.e. distinct states for recurrent models)
        n_threads (int): number of threads to use for generation
        n_threads_batch (int): number of threads to use for batch processing
        rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
@ -619,7 +622,8 @@ class llama_context_params(ctypes.Structure):
        ("seed", ctypes.c_uint32),
        ("n_ctx", ctypes.c_uint32),
        ("n_batch", ctypes.c_uint32),
-        ("n_parallel", ctypes.c_uint32),
+        ("n_ubatch", ctypes.c_uint32),
        ("n_seq_max", ctypes.c_uint32),
        ("n_threads", ctypes.c_uint32),
        ("n_threads_batch", ctypes.c_uint32),
        ("rope_scaling_type", ctypes.c_int),
@ -667,7 +671,7 @@ It might not exist for progress report where '.' is output repeatedly."""
 #     bool allow_requantize;       // allow quantizing non-f32/f16 tensors
 #     bool quantize_output_tensor; // quantize output.weight
 #     bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-#     bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
+#     bool pure;                   // quantize all tensors to the default type
 #     void * imatrix;              // pointer to importance matrix data
 # } llama_model_quantize_params;
 class llama_model_quantize_params(ctypes.Structure):
@ -679,7 +683,7 @@ class llama_model_quantize_params(ctypes.Structure):
        allow_requantize (bool): allow quantizing non-f32/f16 tensors
        quantize_output_tensor (bool): quantize output.weight
        only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        pure (bool): disable k-quant mixtures and quantize all tensors to the same type
+        pure (bool): quantize all tensors to the default type
        imatrix (ctypes.ctypes.c_void_p): pointer to importance matrix data
    """
@ -860,8 +864,7 @@ GGML_NUMA_STRATEGY_COUNT = 5
    [ctypes.c_int],
    None,
 )
-def llama_numa_init(numa: int, /):
+def llama_numa_init(numa: int, /): ...
    ...
 # // Call once at the end of the program - currently only used for MPI
@ -886,8 +889,7 @@ def llama_backend_free():
 )
 def llama_load_model_from_file(
    path_model: bytes, params: llama_model_params, /
-) -> Optional[llama_model_p]:
+) -> Optional[llama_model_p]: ...
    ...
 # LLAMA_API void llama_free_model(struct llama_model * model);
@ -896,8 +898,7 @@ def llama_load_model_from_file(
    [llama_model_p_ctypes],
    None,
 )
-def llama_free_model(model: llama_model_p, /):
+def llama_free_model(model: llama_model_p, /): ...
    ...
 # LLAMA_API struct llama_context * llama_new_context_with_model(
@ -910,8 +911,7 @@ def llama_free_model(model: llama_model_p, /):
 )
 def llama_new_context_with_model(
    model: llama_model_p, params: llama_context_params, /
-) -> Optional[llama_context_p]:
+) -> Optional[llama_context_p]: ...
    ...
 # // Frees all allocated memory
@ -932,80 +932,77 @@ def llama_free(ctx: llama_context_p, /):
    [],
    ctypes.c_int64,
 )
-def llama_time_us() -> int:
+def llama_time_us() -> int: ...
    ...
 # LLAMA_API size_t llama_max_devices(void);
@ctypes_function("llama_max_devices", [], ctypes.c_size_t)
-def llama_max_devices() -> int:
+def llama_max_devices() -> int: ...
    ...
 # LLAMA_API bool llama_supports_mmap       (void);
@ctypes_function("llama_supports_mmap", [], ctypes.c_bool)
-def llama_supports_mmap() -> bool:
+def llama_supports_mmap() -> bool: ...
    ...
 # LLAMA_API bool llama_supports_mlock      (void);
@ctypes_function("llama_supports_mlock", [], ctypes.c_bool)
-def llama_supports_mlock() -> bool:
+def llama_supports_mlock() -> bool: ...
    ...
 # LLAMA_API bool llama_supports_gpu_offload(void);
@ctypes_function("llama_supports_gpu_offload", [], ctypes.c_bool)
-def llama_supports_gpu_offload() -> bool:
+def llama_supports_gpu_offload() -> bool: ...
    ...
 # LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
@ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes)
-def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]:
+def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ...
    ...
 # LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
@ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32)
-def llama_n_ctx(ctx: llama_context_p, /) -> int:
+def llama_n_ctx(ctx: llama_context_p, /) -> int: ...
    ...
 # LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
@ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32)
-def llama_n_batch(ctx: llama_context_p, /) -> int:
+def llama_n_batch(ctx: llama_context_p, /) -> int: ...
-    ...
+
 # LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
@ctypes_function("llama_n_ubatch", [llama_context_p_ctypes], ctypes.c_uint32)
 def llama_n_ubatch(ctx: llama_context_p, /) -> int: ...
 # LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
@ctypes_function("llama_n_seq_max", [llama_context_p_ctypes], ctypes.c_uint32)
 def llama_n_seq_max(ctx: llama_context_p, /) -> int: ...
 # LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
@ctypes_function("llama_vocab_type", [llama_model_p_ctypes], ctypes.c_int)
-def llama_vocab_type(model: llama_model_p, /) -> int:
+def llama_vocab_type(model: llama_model_p, /) -> int: ...
    ...
 # LLAMA_API enum llama_rope_type  llama_rope_type (const struct llama_model * model);
@ctypes_function("llama_rope_type", [llama_model_p_ctypes], ctypes.c_int)
-def llama_rope_type(model: llama_model_p, /) -> int:
+def llama_rope_type(model: llama_model_p, /) -> int: ...
    ...
 # LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
@ctypes_function("llama_n_vocab", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_n_vocab(model: llama_model_p, /) -> int:
+def llama_n_vocab(model: llama_model_p, /) -> int: ...
    ...
 # LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
@ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_n_ctx_train(model: llama_model_p, /) -> int:
+def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
    ...
 # LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
@ctypes_function("llama_n_embd", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_n_embd(model: llama_model_p, /) -> int:
+def llama_n_embd(model: llama_model_p, /) -> int: ...
    ...
 # // Get the model's RoPE frequency scaling factor
@ -1192,8 +1189,7 @@ def llama_model_apply_lora_from_file(
    path_base_model: Union[ctypes.c_char_p, bytes, None],
    n_threads: Union[ctypes.c_int32, int],
    /,
-) -> int:
+) -> int: ...
    ...
 # //
@ -1219,7 +1215,7 @@ class llama_kv_cache_view_cell(ctypes.Structure):
 #     // Maximum number of sequences that can exist in a cell. It's not an error
 #     // if there are more sequences in a cell than this value, however they will
 #     // not be visible in the view cells_sequences.
-#     int32_t n_max_seq;
+#     int32_t n_seq_max;
 #     // Number of tokens in the cache. For example, if there are two populated
 #     // cells, the first with 1 sequence id in it and the second with 2 sequence
@ -1240,7 +1236,7 @@ class llama_kv_cache_view_cell(ctypes.Structure):
 #     struct llama_kv_cache_view_cell * cells;
-#     // The sequences for each cell. There will be n_max_seq items per cell.
+#     // The sequences for each cell. There will be n_seq_max items per cell.
 #     llama_seq_id * cells_sequences;
 # };
 class llama_kv_cache_view(ctypes.Structure):
@ -1260,14 +1256,14 @@ llama_kv_cache_view_p = ctypes.POINTER(llama_kv_cache_view)
 # // Create an empty KV cache view. (use only for debugging purposes)
-# LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+# LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
@ctypes_function(
    "llama_kv_cache_view_init",
    [llama_context_p_ctypes, ctypes.c_int32],
    llama_kv_cache_view,
 )
 def llama_kv_cache_view_init(
-    ctx: llama_context_p, n_max_seq: Union[ctypes.c_int32, int], /
+    ctx: llama_context_p, n_seq_max: Union[ctypes.c_int32, int], /
 ) -> llama_kv_cache_view:
    """Create an empty KV cache view. (use only for debugging purposes)"""
    ...
@ -1582,8 +1578,7 @@ def llama_load_session_file(
    n_token_capacity: Union[ctypes.c_size_t, int],
    n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
    /,
-) -> int:
+) -> int: ...
    ...
 # LLAMA_API bool llama_save_session_file(
@ -1607,8 +1602,7 @@ def llama_save_session_file(
    tokens: CtypesArray[llama_token],
    n_token_count: Union[ctypes.c_size_t, int],
    /,
-) -> int:
+) -> int: ...
    ...
 # //
@ -1756,6 +1750,18 @@ def llama_set_abort_callback(
    ...
 # // Wait until all computations are finished
 # // This is automatically done when using one of the functions below to obtain the computation results
 # // and is not necessary to call it explicitly in most cases
 # LLAMA_API void llama_synchronize(struct llama_context * ctx);
@ctypes_function("llama_synchronize", [llama_context_p_ctypes], None)
 def llama_synchronize(ctx: llama_context_p, /):
    """Wait until all computations are finished
    This is automatically done when using one of the functions below to obtain the computation results
    and is not necessary to call it explicitly in most cases"""
    ...
 # // Token logits obtained from the last call to llama_decode()
 # // The logits for the last token are stored in the last row
 # // Logits for which llama_batch.logits[i] == 0 are undefined
@ -1839,6 +1845,7 @@ def llama_get_embeddings_seq(
    shape: [n_embd] (1-dimensional)"""
    ...
 # //
 # // Vocab
 # //
@ -1850,8 +1857,7 @@ def llama_get_embeddings_seq(
 )
 def llama_token_get_text(
    model: llama_model_p, token: Union[llama_token, int], /
-) -> bytes:
+) -> bytes: ...
    ...
 # LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
@ -1860,8 +1866,7 @@ def llama_token_get_text(
 )
 def llama_token_get_score(
    model: llama_model_p, token: Union[llama_token, int], /
-) -> float:
+) -> float: ...
    ...
 # LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
@ -1870,8 +1875,7 @@ def llama_token_get_score(
 )
 def llama_token_get_type(
    model: llama_model_p, token: Union[llama_token, int], /
-) -> int:
+) -> int: ...
    ...
 # // Special tokens
@ -1924,20 +1928,17 @@ def llama_token_prefix(model: llama_model_p) -> int:
 # LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
@ctypes_function("llama_token_middle", [llama_model_p_ctypes], llama_token)
-def llama_token_middle(model: llama_model_p, /) -> int:
+def llama_token_middle(model: llama_model_p, /) -> int: ...
    ...
 # LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
@ctypes_function("llama_token_suffix", [llama_model_p_ctypes], llama_token)
-def llama_token_suffix(model: llama_model_p, /) -> int:
+def llama_token_suffix(model: llama_model_p, /) -> int: ...
    ...
 # LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token)
-def llama_token_eot(model: llama_model_p, /) -> int:
+def llama_token_eot(model: llama_model_p, /) -> int: ...
    ...
 # //
@ -1947,7 +1948,7 @@ def llama_token_eot(model: llama_model_p, /) -> int:
 # /// @details Convert the provided text into tokens.
 # /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
-# /// @return Returns the number of tokens on success, no more than n_max_tokens
+# /// @return Returns the number of tokens on success, no more than n_tokens_max
 # /// @return Returns a negative number on failure - the number of tokens that would have been returned
 # /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
 # ///                Does not insert a leading space.
@ -1956,7 +1957,7 @@ def llama_token_eot(model: llama_model_p, /) -> int:
 #                   const char * text,
 #                      int32_t   text_len,
 #                  llama_token * tokens,
-#                      int32_t   n_max_tokens,
+#                      int32_t   n_tokens_max,
 #                         bool   add_bos,
 #                         bool   special);
@ctypes_function(
@ -1977,12 +1978,26 @@ def llama_tokenize(
    text: bytes,
    text_len: Union[ctypes.c_int, int],
    tokens: CtypesArray[llama_token],
-    n_max_tokens: Union[ctypes.c_int, int],
+    n_tokens_max: Union[ctypes.c_int, int],
    add_bos: Union[ctypes.c_bool, bool],
    special: Union[ctypes.c_bool, bool],
    /,
 ) -> int:
-    """Convert the provided text into tokens."""
+    """Convert the provided text into tokens.
    Args:
        model: The model to use for tokenization.
        text: The text to tokenize.
        text_len: The length of the text.
        tokens: The tokens pointer must be large enough to hold the resulting tokens.
        n_max_tokens: The maximum number of tokens to return.
        add_bos: Whether to add a beginning-of-sentence token.
        special: Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
                 Does not insert a leading space.
    Returns:
        Returns the number of tokens on success, no more than n_tokens_max
        Returns a negative number on failure - the number of tokens that would have been returned"""
    ...
@ -2054,8 +2069,7 @@ def llama_chat_apply_template(
    chat: CtypesArray[llama_chat_message],
    n_msg: int,
    /,
-) -> int:
+) -> int: ...
    ...
 # //
@ -2656,8 +2670,7 @@ def llama_beam_search(
    n_past: Union[ctypes.c_int, int],
    n_predict: Union[ctypes.c_int, int],
    /,
-):
+): ...
    ...
 # Performance information
@ -2734,5 +2747,4 @@ def llama_log_set(
    [ctypes.c_void_p, llama_context_p_ctypes],
    None,
 )
-def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /):
+def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /): ...
    ...
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@ -1 +1 @@
-Subproject commit 3814a07392d2bdc22911652bc7c2f9bdb0ce042e
+Subproject commit 19885d205e768579ab090d1e99281cae58c21b54
		`@ -1 +1 @@`
			`Subproject commit 3814a07392d2bdc22911652bc7c2f9bdb0ce042e`				`Subproject commit 19885d205e768579ab090d1e99281cae58c21b54`