feat: Update llama.cpp
parent 6eb25231e4
commit 8d298b4750

2 changed files with 78 additions and 9 deletions
llama_cpp/llama_cpp.py

@@ -581,6 +581,7 @@ class llama_model_params(ctypes.Structure):
 #     bool embeddings;  // if true, extract embeddings (together with logits)
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
 
 
 #     // Abort callback
 #     // if it returns true, execution of llama_decode() will be aborted
 #     // currently works only with CPU execution
@@ -1006,6 +1007,11 @@ def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
 def llama_n_embd(model: llama_model_p, /) -> int: ...
 
 
+# LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
+@ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_n_layer(model: llama_model_p, /) -> int: ...
+
+
 # // Get the model's RoPE frequency scaling factor
 # LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 @ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
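For reference, a minimal usage sketch of the new llama_n_layer binding next to the existing metadata getters (the model path is a placeholder; this assumes the no-argument llama_backend_init of this llama.cpp revision):

import llama_cpp

llama_cpp.llama_backend_init()

# Load a model with default parameters (path is a placeholder).
params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"./models/model.gguf", params)

print("n_embd:", llama_cpp.llama_n_embd(model))
print("n_layer:", llama_cpp.llama_n_layer(model))  # new in this commit
print("rope freq scale:", llama_cpp.llama_rope_freq_scale_train(model))

llama_cpp.llama_free_model(model)
llama_cpp.llama_backend_free()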
@@ -1166,12 +1172,18 @@ def llama_model_quantize(
     ...
 
 
+# // Apply a LoRA adapter to a loaded model
+# // path_base_model is the path to a higher quality model to use as a base for
+# // the layers modified by the adapter. Can be NULL to use the current loaded model.
+# // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+# // will be applied on top of the previous one
+# // Returns 0 on success
 # LLAMA_API int32_t llama_model_apply_lora_from_file(
 #     const struct llama_model * model,
 #     const char * path_lora,
 #     float scale,
 #     const char * path_base_model,
 #     int32_t n_threads);
 @ctypes_function(
     "llama_model_apply_lora_from_file",
     [
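For reference, a hedged usage sketch of this binding (the adapter path is a placeholder; per the comments above, path_base_model may be None and the call returns 0 on success):

import llama_cpp

# Assumes `model` was just loaded via llama_load_model_from_file.
ret = llama_cpp.llama_model_apply_lora_from_file(
    model,
    b"./loras/adapter.bin",  # path_lora (placeholder)
    1.0,                     # scale
    None,                    # path_base_model: use the current loaded model
    4,                       # n_threads
)
if ret != 0:
    raise RuntimeError("llama_model_apply_lora_from_file failed")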
@@ -1190,7 +1202,57 @@ def llama_model_apply_lora_from_file(
     path_base_model: Union[ctypes.c_char_p, bytes, None],
     n_threads: Union[ctypes.c_int32, int],
     /,
-) -> int: ...
+) -> int:
+    """Apply a LoRA adapter to a loaded model
+    path_base_model is the path to a higher quality model to use as a base for
+    the layers modified by the adapter. Can be NULL to use the current loaded model.
+    The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    will be applied on top of the previous one
+    Returns 0 on success"""
+    ...
+
+
+# // Apply a loaded control vector to a llama_context, or if data is NULL, clear
+# // the currently loaded vector.
+# // n_embd should be the size of a single layer's control, and data should point
+# // to an n_embd x n_layers buffer starting from layer 1.
+# // il_start and il_end are the layer range the vector should apply to (both inclusive)
+# // See llama_control_vector_load in common to load a control vector.
+# LLAMA_API int32_t llama_control_vector_apply(
+#     struct llama_context * lctx,
+#     const float * data,
+#     size_t len,
+#     int32_t n_embd,
+#     int32_t il_start,
+#     int32_t il_end);
+@ctypes_function(
+    "llama_control_vector_apply",
+    [
+        llama_context_p_ctypes,
+        ctypes.POINTER(ctypes.c_float),
+        ctypes.c_size_t,
+        ctypes.c_int32,
+        ctypes.c_int32,
+        ctypes.c_int32,
+    ],
+    ctypes.c_int32,
+)
+def llama_control_vector_apply(
+    lctx: llama_context_p,
+    data: CtypesPointerOrRef[ctypes.c_float],
+    len: int,
+    n_embd: int,
+    il_start: int,
+    il_end: int,
+    /,
+) -> int:
+    """Apply a loaded control vector to a llama_context, or if data is NULL, clear
+    the currently loaded vector.
+    n_embd should be the size of a single layer's control, and data should point
+    to an n_embd x n_layers buffer starting from layer 1.
+    il_start and il_end are the layer range the vector should apply to (both inclusive)
+    See llama_control_vector_load in common to load a control vector."""
+    ...
 
 
 # //
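Since the new binding takes a raw float pointer, a Python caller has to marshal the buffer through ctypes. A minimal sketch, assuming an existing context `ctx`, a loaded `model`, and an all-zero vector as a placeholder:

import ctypes
import llama_cpp

n_embd = llama_cpp.llama_n_embd(model)
n_layer = llama_cpp.llama_n_layer(model)

# n_embd x n_layers buffer, starting from layer 1 (zeros as a placeholder).
buf = (ctypes.c_float * (n_embd * n_layer))()
ret = llama_cpp.llama_control_vector_apply(
    ctx, buf, n_embd * n_layer, n_embd, 1, n_layer  # layers 1..n_layer inclusive
)

# Passing NULL clears the currently loaded vector.
llama_cpp.llama_control_vector_apply(ctx, None, 0, 0, 0, 0)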
@@ -1205,6 +1267,12 @@ def llama_model_apply_lora_from_file(
 #     llama_pos pos;
 # };
 class llama_kv_cache_view_cell(ctypes.Structure):
+    """Information associated with an individual cell in the KV cache view.
+
+    Attributes:
+        pos (llama_pos): The position for this cell. Takes KV cache shifts into account.
+            May be negative if the cell is not populated."""
+
     _fields_ = [("pos", llama_pos)]
 
 
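For reference, a hedged sketch of reading the documented pos field, assuming the companion llama_kv_cache_view_init/update/free bindings in this module and an existing context `ctx`:

import ctypes
import llama_cpp

view = llama_cpp.llama_kv_cache_view_init(ctx, 1)  # 1 = max sequences to track
llama_cpp.llama_kv_cache_view_update(ctx, ctypes.byref(view))

# A negative pos means the cell is not populated.
occupied = [view.cells[i].pos for i in range(view.n_cells) if view.cells[i].pos >= 0]
print(f"{len(occupied)} of {view.n_cells} cells populated")

llama_cpp.llama_kv_cache_view_free(ctypes.byref(view))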
@@ -1985,7 +2053,7 @@ def llama_tokenize(
     /,
 ) -> int:
     """Convert the provided text into tokens.
 
     Args:
         model: The model to use for tokenization.
         text: The text to tokenize.
@@ -1995,10 +2063,11 @@ def llama_tokenize(
         add_bos: Whether to add a beginning-of-sentence token.
         special: Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
             Does not insert a leading space.
 
     Returns:
         Returns the number of tokens on success, no more than n_tokens_max
-        Returns a negative number on failure - the number of tokens that would have been returned"""
+        Returns a negative number on failure - the number of tokens that would have been returned
+    """
     ...
 
 
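The negative-on-failure convention documented here enables a resize-and-retry pattern. A minimal sketch, assuming a loaded `model` as above:

import llama_cpp

text = b"Hello, world!"
n_tokens_max = 8
tokens = (llama_cpp.llama_token * n_tokens_max)()

n = llama_cpp.llama_tokenize(model, text, len(text), tokens, n_tokens_max, True, False)
if n < 0:
    # Buffer too small: -n is the required count, so resize and retry.
    n_tokens_max = -n
    tokens = (llama_cpp.llama_token * n_tokens_max)()
    n = llama_cpp.llama_tokenize(model, text, len(text), tokens, n_tokens_max, True, False)

print(list(tokens[:n]))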
vendor/llama.cpp (vendored, 2 changes)

@@ -1 +1 @@
-Subproject commit 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc
+Subproject commit ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1