feat: Update llama.cpp

This commit is contained in:
Andrei Betlen 2024-03-18 10:26:36 -04:00
parent 6eb25231e4
commit 8d298b4750
2 changed files with 78 additions and 9 deletions

View file

@ -581,6 +581,7 @@ class llama_model_params(ctypes.Structure):
# bool embeddings; // if true, extract embeddings (together with logits)
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
# // Abort callback
# // if it returns true, execution of llama_decode() will be aborted
# // currently works only with CPU execution
@ -1006,6 +1007,11 @@ def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
def llama_n_embd(model: llama_model_p, /) -> int: ... def llama_n_embd(model: llama_model_p, /) -> int: ...
# LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
@ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_layer(model: llama_model_p, /) -> int:
    """Binding for the C function ``llama_n_layer``.

    Args:
        model: Model handle, passed to C as ``const struct llama_model *``.

    Returns:
        The ``int32_t`` value returned by the C function.
        NOTE(review): presumably the model's layer count, going by the
        name -- confirm against llama.h.
    """
    ...
# // Get the model's RoPE frequency scaling factor
# LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float) @ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
@ -1166,12 +1172,18 @@ def llama_model_quantize(
... ...
# // Apply a LoRA adapter to a loaded model
# // path_base_model is the path to a higher quality model to use as a base for
# // the layers modified by the adapter. Can be NULL to use the current loaded model.
# // The model needs to be reloaded before applying a new adapter, otherwise the adapter
# // will be applied on top of the previous one
# // Returns 0 on success
# LLAMA_API int32_t llama_model_apply_lora_from_file(
# const struct llama_model * model,
# const char * path_lora,
# float scale,
# const char * path_base_model,
# int32_t n_threads);
@ctypes_function( @ctypes_function(
"llama_model_apply_lora_from_file", "llama_model_apply_lora_from_file",
[ [
@ -1190,7 +1202,57 @@ def llama_model_apply_lora_from_file(
path_base_model: Union[ctypes.c_char_p, bytes, None], path_base_model: Union[ctypes.c_char_p, bytes, None],
n_threads: Union[ctypes.c_int32, int], n_threads: Union[ctypes.c_int32, int],
/, /,
) -> int: ... ) -> int:
"""Apply a LoRA adapter to a loaded model
path_base_model is the path to a higher quality model to use as a base for
the layers modified by the adapter. Can be NULL to use the current loaded model.
The model needs to be reloaded before applying a new adapter, otherwise the adapter
will be applied on top of the previous one
Returns 0 on success"""
...
# // Apply a loaded control vector to a llama_context, or if data is NULL, clear
# // the currently loaded vector.
# // n_embd should be the size of a single layer's control, and data should point
# // to an n_embd x n_layers buffer starting from layer 1.
# // il_start and il_end are the layer range the vector should apply to (both inclusive)
# // See llama_control_vector_load in common to load a control vector.
# LLAMA_API int32_t llama_control_vector_apply(
# struct llama_context * lctx,
# const float * data,
# size_t len,
# int32_t n_embd,
# int32_t il_start,
# int32_t il_end);
@ctypes_function(
    "llama_control_vector_apply",
    [
        llama_context_p_ctypes,  # struct llama_context * lctx
        ctypes.POINTER(ctypes.c_float),  # const float * data
        ctypes.c_size_t,  # size_t len
        ctypes.c_int32,  # int32_t n_embd
        ctypes.c_int32,  # int32_t il_start
        ctypes.c_int32,  # int32_t il_end
    ],
    ctypes.c_int32,
)
def llama_control_vector_apply(
    lctx: llama_context_p,
    data: CtypesPointerOrRef[ctypes.c_float],
    len: int,
    n_embd: int,
    il_start: int,
    il_end: int,
    /,
) -> int:
    """Apply a loaded control vector to a llama_context, or if data is NULL, clear
    the currently loaded vector.

    See llama_control_vector_load in common to load a control vector.

    Args:
        lctx: The llama_context the control vector is applied to.
        data: Pointer to an n_embd x n_layers buffer starting from layer 1,
            or NULL to clear the currently loaded vector.
        len: Passed to C as ``size_t len``.
            NOTE(review): presumably the number of floats in ``data`` --
            confirm against llama.h. The name mirrors the C parameter and
            shadows the ``len`` builtin inside this signature only.
        n_embd: The size of a single layer's control.
        il_start: First layer the vector should apply to (inclusive).
        il_end: Last layer the vector should apply to (inclusive).

    Returns:
        The ``int32_t`` status returned by the C function.
        NOTE(review): the header comment does not state the success value;
        presumably 0 on success -- confirm against llama.h.
    """
    ...
# // # //
@ -1205,6 +1267,12 @@ def llama_model_apply_lora_from_file(
# llama_pos pos;
# };
class llama_kv_cache_view_cell(ctypes.Structure):
    """Information associated with an individual cell in the KV cache view.

    Attributes:
        pos (llama_pos): The position for this cell. Takes KV cache shifts into account.
            May be negative if the cell is not populated."""

    # Single-field layout mirroring the C struct: one llama_pos member named "pos".
    _fields_ = [("pos", llama_pos)]
@ -1998,7 +2066,8 @@ def llama_tokenize(
Returns: Returns:
Returns the number of tokens on success, no more than n_tokens_max Returns the number of tokens on success, no more than n_tokens_max
Returns a negative number on failure - the number of tokens that would have been returned""" Returns a negative number on failure - the number of tokens that would have been returned
"""
... ...

2
vendor/llama.cpp vendored

@ -1 +1 @@
Subproject commit 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc Subproject commit ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1