feat: Update llama.cpp
parent 6eb25231e4
commit 8d298b4750

2 changed files with 78 additions and 9 deletions
llama_cpp/llama_cpp.py

@@ -581,6 +581,7 @@ class llama_model_params(ctypes.Structure):
 #     bool embeddings;  // if true, extract embeddings (together with logits)
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
 
 
 #     // Abort callback
 #     // if it returns true, execution of llama_decode() will be aborted
 #     // currently works only with CPU execution
@@ -1006,6 +1007,11 @@ def llama_n_ctx_train(model: llama_model_p, /) -> int: ...
 def llama_n_embd(model: llama_model_p, /) -> int: ...
 
 
+# LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
+@ctypes_function("llama_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_n_layer(model: llama_model_p, /) -> int: ...
+
+
 # // Get the model's RoPE frequency scaling factor
 # LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 @ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float)
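For reference, a minimal usage sketch of the new llama_n_layer binding next to the existing metadata getters (the model path is a placeholder; this assumes the no-argument llama_backend_init of this llama.cpp revision):

import llama_cpp

llama_cpp.llama_backend_init()

# Load a model with default parameters (path is a placeholder).
params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"./models/model.gguf", params)

print("n_embd:", llama_cpp.llama_n_embd(model))
print("n_layer:", llama_cpp.llama_n_layer(model))  # new in this commit
print("rope freq scale:", llama_cpp.llama_rope_freq_scale_train(model))

llama_cpp.llama_free_model(model)
llama_cpp.llama_backend_free()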
@@ -1166,12 +1172,18 @@ def llama_model_quantize(
     ...
 
 
+# // Apply a LoRA adapter to a loaded model
+# // path_base_model is the path to a higher quality model to use as a base for
+# // the layers modified by the adapter. Can be NULL to use the current loaded model.
+# // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+# // will be applied on top of the previous one
+# // Returns 0 on success
 # LLAMA_API int32_t llama_model_apply_lora_from_file(
 #     const struct llama_model * model,
 #     const char * path_lora,
 #     float scale,
 #     const char * path_base_model,
 #     int32_t n_threads);
 @ctypes_function(
     "llama_model_apply_lora_from_file",
     [
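For reference, a hedged usage sketch of this binding (the adapter path is a placeholder; per the comments above, path_base_model may be None and the call returns 0 on success):

import llama_cpp

# Assumes `model` was just loaded via llama_load_model_from_file.
ret = llama_cpp.llama_model_apply_lora_from_file(
    model,
    b"./loras/adapter.bin",  # path_lora (placeholder)
    1.0,                     # scale
    None,                    # path_base_model: use the current loaded model
    4,                       # n_threads
)
if ret != 0:
    raise RuntimeError("llama_model_apply_lora_from_file failed")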
@@ -1190,7 +1202,57 @@ def llama_model_apply_lora_from_file(
     path_base_model: Union[ctypes.c_char_p, bytes, None],
     n_threads: Union[ctypes.c_int32, int],
     /,
-) -> int: ...
+) -> int:
+    """Apply a LoRA adapter to a loaded model
+    path_base_model is the path to a higher quality model to use as a base for
+    the layers modified by the adapter. Can be NULL to use the current loaded model.
+    The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    will be applied on top of the previous one
+    Returns 0 on success"""
+    ...
+
+
+# // Apply a loaded control vector to a llama_context, or if data is NULL, clear
+# // the currently loaded vector.
+# // n_embd should be the size of a single layer's control, and data should point
+# // to an n_embd x n_layers buffer starting from layer 1.
+# // il_start and il_end are the layer range the vector should apply to (both inclusive)
+# // See llama_control_vector_load in common to load a control vector.
+# LLAMA_API int32_t llama_control_vector_apply(
+#     struct llama_context * lctx,
+#     const float * data,
+#     size_t len,
+#     int32_t n_embd,
+#     int32_t il_start,
+#     int32_t il_end);
+@ctypes_function(
+    "llama_control_vector_apply",
+    [
+        llama_context_p_ctypes,
+        ctypes.POINTER(ctypes.c_float),
+        ctypes.c_size_t,
+        ctypes.c_int32,
+        ctypes.c_int32,
+        ctypes.c_int32,
+    ],
+    ctypes.c_int32,
+)
+def llama_control_vector_apply(
+    lctx: llama_context_p,
+    data: CtypesPointerOrRef[ctypes.c_float],
+    len: int,
+    n_embd: int,
+    il_start: int,
+    il_end: int,
+    /,
+) -> int:
+    """Apply a loaded control vector to a llama_context, or if data is NULL, clear
+    the currently loaded vector.
+    n_embd should be the size of a single layer's control, and data should point
+    to an n_embd x n_layers buffer starting from layer 1.
+    il_start and il_end are the layer range the vector should apply to (both inclusive)
+    See llama_control_vector_load in common to load a control vector."""
+    ...
 
 
 # //
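Since the new binding takes a raw float pointer, a Python caller has to marshal the buffer through ctypes. A minimal sketch, assuming an existing context `ctx`, a loaded `model`, and an all-zero vector as a placeholder:

import ctypes
import llama_cpp

n_embd = llama_cpp.llama_n_embd(model)
n_layer = llama_cpp.llama_n_layer(model)

# n_embd x n_layers buffer, starting from layer 1 (zeros as a placeholder).
buf = (ctypes.c_float * (n_embd * n_layer))()
ret = llama_cpp.llama_control_vector_apply(
    ctx, buf, n_embd * n_layer, n_embd, 1, n_layer  # layers 1..n_layer inclusive
)

# Passing NULL clears the currently loaded vector.
llama_cpp.llama_control_vector_apply(ctx, None, 0, 0, 0, 0)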
@@ -1205,6 +1267,12 @@ def llama_model_apply_lora_from_file(
 #     llama_pos pos;
 # };
 class llama_kv_cache_view_cell(ctypes.Structure):
+    """Information associated with an individual cell in the KV cache view.
+
+    Attributes:
+        pos (llama_pos): The position for this cell. Takes KV cache shifts into account.
+            May be negative if the cell is not populated."""
+
     _fields_ = [("pos", llama_pos)]
 
 
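For reference, a hedged sketch of reading the documented pos field, assuming the companion llama_kv_cache_view_init/update/free bindings in this module and an existing context `ctx`:

import ctypes
import llama_cpp

view = llama_cpp.llama_kv_cache_view_init(ctx, 1)  # 1 = max sequences to track
llama_cpp.llama_kv_cache_view_update(ctx, ctypes.byref(view))

# A negative pos means the cell is not populated.
occupied = [view.cells[i].pos for i in range(view.n_cells) if view.cells[i].pos >= 0]
print(f"{len(occupied)} of {view.n_cells} cells populated")

llama_cpp.llama_kv_cache_view_free(ctypes.byref(view))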
@@ -1985,7 +2053,7 @@ def llama_tokenize(
     /,
 ) -> int:
     """Convert the provided text into tokens.
 
     Args:
         model: The model to use for tokenization.
         text: The text to tokenize.
@@ -1995,10 +2063,11 @@ def llama_tokenize(
         add_bos: Whether to add a beginning-of-sentence token.
         special: Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
             Does not insert a leading space.
 
     Returns:
         Returns the number of tokens on success, no more than n_tokens_max
-        Returns a negative number on failure - the number of tokens that would have been returned"""
+        Returns a negative number on failure - the number of tokens that would have been returned
+    """
     ...
 
 
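The negative-on-failure convention documented here enables a resize-and-retry pattern. A minimal sketch, assuming a loaded `model` as above:

import llama_cpp

text = b"Hello, world!"
n_tokens_max = 8
tokens = (llama_cpp.llama_token * n_tokens_max)()

n = llama_cpp.llama_tokenize(model, text, len(text), tokens, n_tokens_max, True, False)
if n < 0:
    # Buffer too small: -n is the required count, so resize and retry.
    n_tokens_max = -n
    tokens = (llama_cpp.llama_token * n_tokens_max)()
    n = llama_cpp.llama_tokenize(model, text, len(text), tokens, n_tokens_max, True, False)

print(list(tokens[:n]))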
vendor/llama.cpp (vendored, 2 changes)

@@ -1 +1 @@
-Subproject commit 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc
+Subproject commit ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1