feat: Update llama.cpp

commit 8d298b4750
parent 6eb25231e4

2 changed files with 78 additions and 9 deletions
llama_cpp/llama_cpp.py

@@ -581,6 +581,7 @@ class llama_model_params(ctypes.Structure):
# bool embeddings; // if true, extract embeddings (together with logits)
# bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU

# // Abort callback
# // if it returns true, execution of llama_decode() will be aborted
# // currently works only with CPU execution
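As a side note, here is a minimal sketch of driving these llama_context_params fields from Python. The abort-callback type below is an assumption matching the C signature bool (*)(void * data) in llama.h, and the abort_callback field name is likewise assumed from the header:

import ctypes
import llama_cpp

ctx_params = llama_cpp.llama_context_default_params()
ctx_params.embeddings = True   # extract embeddings (together with logits)
ctx_params.offload_kqv = True  # offload the KQV ops (including the KV cache) to GPU

# Hypothetical abort callback; returning True aborts llama_decode()
# (currently honored only for CPU execution).
abort_callback_t = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p)

@abort_callback_t
def never_abort(_data):
    return False

ctx_params.abort_callback = never_abort  # field name assumed from llama.h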
@@ -1006,6 +1007,11 @@ def llama_n_ctx_train(model: llama_model_p, /) -> int: ...

def llama_n_embd(model: llama_model_p, /) -> int: ...


# LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
@ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
def llama_n_layer(model: llama_model_p, /) -> int: ...


# // Get the model's RoPE frequency scaling factor
# LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
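A quick usage sketch for these getters; "model.gguf" is a hypothetical path, and llama_backend_init() is assumed to take no arguments in this version of the bindings:

import llama_cpp

llama_cpp.llama_backend_init()
model_params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"model.gguf", model_params)

print("n_embd:", llama_cpp.llama_n_embd(model))                          # embedding size
print("n_layer:", llama_cpp.llama_n_layer(model))                        # layer count
print("rope_freq_scale:", llama_cpp.llama_rope_freq_scale_train(model))  # RoPE scaling factor

llama_cpp.llama_free_model(model)
llama_cpp.llama_backend_free()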
@@ -1166,12 +1172,18 @@ def llama_model_quantize(
    ...


# // Apply a LoRA adapter to a loaded model
# // path_base_model is the path to a higher quality model to use as a base for
# // the layers modified by the adapter. Can be NULL to use the current loaded model.
# // The model needs to be reloaded before applying a new adapter, otherwise the adapter
# // will be applied on top of the previous one
# // Returns 0 on success
# LLAMA_API int32_t llama_model_apply_lora_from_file(
#     const struct llama_model * model,
#     const char * path_lora,
#     float scale,
#     const char * path_base_model,
#     int32_t n_threads);
@ctypes_function(
    "llama_model_apply_lora_from_file",
    [
@@ -1190,7 +1202,57 @@ def llama_model_apply_lora_from_file(
    path_base_model: Union[ctypes.c_char_p, bytes, None],
    n_threads: Union[ctypes.c_int32, int],
    /,
) -> int: ...
) -> int:
    """Apply a LoRA adapter to a loaded model
    path_base_model is the path to a higher quality model to use as a base for
    the layers modified by the adapter. Can be NULL to use the current loaded model.
    The model needs to be reloaded before applying a new adapter, otherwise the adapter
    will be applied on top of the previous one
    Returns 0 on success"""
    ...
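A hedged sketch of calling this binding, reusing model from the sketch above; "adapter.bin" is a hypothetical path:

rc = llama_cpp.llama_model_apply_lora_from_file(
    model,
    b"adapter.bin",  # path_lora (hypothetical file)
    1.0,             # scale
    None,            # path_base_model: NULL -> use the currently loaded model
    4,               # n_threads
)
if rc != 0:
    raise RuntimeError("llama_model_apply_lora_from_file failed")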

# // Apply a loaded control vector to a llama_context, or if data is NULL, clear
# // the currently loaded vector.
# // n_embd should be the size of a single layer's control, and data should point
# // to an n_embd x n_layers buffer starting from layer 1.
# // il_start and il_end are the layer range the vector should apply to (both inclusive)
# // See llama_control_vector_load in common to load a control vector.
# LLAMA_API int32_t llama_control_vector_apply(
#     struct llama_context * lctx,
#     const float * data,
#     size_t len,
#     int32_t n_embd,
#     int32_t il_start,
#     int32_t il_end);
@ctypes_function(
    "llama_control_vector_apply",
    [
        llama_context_p_ctypes,
        ctypes.POINTER(ctypes.c_float),
        ctypes.c_size_t,
        ctypes.c_int32,
        ctypes.c_int32,
        ctypes.c_int32,
    ],
    ctypes.c_int32,
)
def llama_control_vector_apply(
    lctx: llama_context_p,
    data: CtypesPointerOrRef[ctypes.c_float],
    len: int,
    n_embd: int,
    il_start: int,
    il_end: int,
    /,
) -> int:
    """Apply a loaded control vector to a llama_context, or if data is NULL, clear
    the currently loaded vector.
    n_embd should be the size of a single layer's control, and data should point
    to an n_embd x n_layers buffer starting from layer 1.
    il_start and il_end are the layer range the vector should apply to (both inclusive)
    See llama_control_vector_load in common to load a control vector."""
    ...


# //
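A sketch of the buffer layout described in the comments, reusing model from above; the all-zero vector is placeholder data, not a real control vector:

import ctypes

ctx = llama_cpp.llama_new_context_with_model(
    model, llama_cpp.llama_context_default_params()
)

n_embd = llama_cpp.llama_n_embd(model)
n_layer = llama_cpp.llama_n_layer(model)

# n_embd x n_layers buffer starting from layer 1, zero-initialized by ctypes.
buf_len = n_embd * n_layer
data = (ctypes.c_float * buf_len)()

rc = llama_cpp.llama_control_vector_apply(ctx, data, buf_len, n_embd, 1, n_layer)
assert rc == 0

# Passing NULL for data clears the currently loaded vector.
llama_cpp.llama_control_vector_apply(ctx, None, 0, n_embd, 1, n_layer)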
@@ -1205,6 +1267,12 @@ def llama_model_apply_lora_from_file(
#     llama_pos pos;
# };
class llama_kv_cache_view_cell(ctypes.Structure):
    """Information associated with an individual cell in the KV cache view.

    Attributes:
        pos (llama_pos): The position for this cell. Takes KV cache shifts into account.
            May be negative if the cell is not populated."""

    _fields_ = [("pos", llama_pos)]
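A sketch of reading cell positions, assuming ctx from the previous sketch and that llama_kv_cache_view_init/_update/_free are bound as in llama.h:

import ctypes

view = llama_cpp.llama_kv_cache_view_init(ctx, 1)  # 1 = max sequences per cell
llama_cpp.llama_kv_cache_view_update(ctx, ctypes.byref(view))

for i in range(view.n_cells):
    pos = view.cells[i].pos
    if pos < 0:
        continue  # cell not populated
    print(f"cell {i}: pos {pos}")

llama_cpp.llama_kv_cache_view_free(ctypes.byref(view))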
@@ -1985,7 +2053,7 @@ def llama_tokenize(
    /,
) -> int:
    """Convert the provided text into tokens.

    Args:
        model: The model to use for tokenization.
        text: The text to tokenize.

@@ -1995,10 +2063,11 @@ def llama_tokenize(
        add_bos: Whether to add a beginning-of-sentence token.
        special: Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
            Does not insert a leading space.

    Returns:
        Returns the number of tokens on success, no more than n_tokens_max
        Returns a negative number on failure - the number of tokens that would have been returned"""
        Returns a negative number on failure - the number of tokens that would have been returned
    """
    ...
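The negative return value supports a two-pass pattern; a sketch reusing model from above, with the argument order assumed to be (model, text, text_len, tokens, n_tokens_max, add_bos, special):

text = b"Hello, world!"
n_tokens_max = 8
tokens = (llama_cpp.llama_token * n_tokens_max)()

n = llama_cpp.llama_tokenize(model, text, len(text), tokens, n_tokens_max, True, False)
if n < 0:
    # -n is the number of tokens that would have been returned.
    n_tokens_max = -n
    tokens = (llama_cpp.llama_token * n_tokens_max)()
    n = llama_cpp.llama_tokenize(model, text, len(text), tokens, n_tokens_max, True, False)

token_ids = list(tokens[:n])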
vendor/llama.cpp (vendored) | 2
@@ -1 +1 @@
-Subproject commit 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc
+Subproject commit ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1