feat: Update llama.cpp

parent 08b16afe11
commit 56071c956a

2 changed files with 231 additions and 8 deletions
@@ -237,11 +237,18 @@ LLAMA_FILE_MAGIC_GGLA = 0x67676C61
 # define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 LLAMA_FILE_MAGIC_GGSN = 0x6767736E
 
+# define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
+LLAMA_FILE_MAGIC_GGSQ = 0x67677371
+
 # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
 # define LLAMA_SESSION_VERSION 5
 LLAMA_SESSION_VERSION = 5
 
+# define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
+LLAMA_STATE_SEQ_MAGIC = LLAMA_FILE_MAGIC_GGSQ
+# define LLAMA_STATE_SEQ_VERSION 1
+LLAMA_STATE_SEQ_VERSION = 1
 
 # struct llama_model;
 llama_model_p = NewType("llama_model_p", int)
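The new `GGSQ` magic and `LLAMA_STATE_SEQ_VERSION` tag the on-disk format of the single-sequence state files added further down in this diff. A minimal sketch of a header check (assuming, as in llama.cpp's state writer, a 4-byte magic followed by a 4-byte version in little-endian order; `check_seq_state_header` is a hypothetical helper, not part of the bindings):

import struct

from llama_cpp import LLAMA_STATE_SEQ_MAGIC, LLAMA_STATE_SEQ_VERSION


def check_seq_state_header(path):
    # Hypothetical helper: validate the leading magic/version of a file written
    # by llama_state_seq_save_file (assumed layout: u32 magic, u32 version, LE).
    with open(path, "rb") as f:
        magic, version = struct.unpack("<II", f.read(8))
    return magic == LLAMA_STATE_SEQ_MAGIC and version == LLAMA_STATE_SEQ_VERSION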
@@ -1467,6 +1474,7 @@ def llama_kv_cache_clear(ctx: llama_context_p, /):
 
 
 # // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+# // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
 # // seq_id < 0 : match any sequence
 # // p0 < 0 : [0, p1]
 # // p1 < 0 : [p0, inf)
@@ -1493,6 +1501,9 @@ def llama_kv_cache_seq_rm(
     /,
 ) -> bool:
     """Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+
+    Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
+
     seq_id < 0 : match any sequence
     p0 < 0 : [0, p1]
     p1 < 0 : [p0, inf)"""
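Under these bounds rules, passing negative positions removes a whole sequence, which never fails. A sketch of a wrapper (assuming a live `llama_context_p` created elsewhere; `clear_sequence` is an illustrative name):

import llama_cpp


def clear_sequence(ctx, seq_id=0):
    # p0 < 0 reads as [0, p1] and p1 < 0 as [p0, inf), so (-1, -1) drops every
    # cached token of seq_id; removing a whole sequence never fails.
    return llama_cpp.llama_kv_cache_seq_rm(ctx, seq_id, -1, -1)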
@@ -1652,7 +1663,16 @@ def llama_kv_cache_update(ctx: llama_context_p, /):
 
 # Returns the maximum size in bytes of the state (rng, logits, embedding
 # and kv_cache) - will often be smaller after compacting tokens
-# LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
+# LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
+@ctypes_function("llama_state_get_size", [llama_context_p_ctypes], ctypes.c_size_t)
+def llama_state_get_size(ctx: llama_context_p, /) -> int:
+    """Returns the maximum size in bytes of the state (rng, logits, embedding
+    and kv_cache) - will often be smaller after compacting tokens"""
+    ...
+
+
+# LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
+#                      "use llama_state_get_size instead");
 @ctypes_function("llama_get_state_size", [llama_context_p_ctypes], ctypes.c_size_t)
 def llama_get_state_size(ctx: llama_context_p, /) -> int:
     """Returns the maximum size in bytes of the state (rng, logits, embedding
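Since the returned size is an upper bound that can shrink after token compaction, a caller would allocate first and trim after copying. A sketch (live `ctx` assumed; `make_state_buffer` is illustrative):

import ctypes

import llama_cpp


def make_state_buffer(ctx):
    # llama_state_get_size returns a maximum; the actual copy may be smaller.
    n = llama_cpp.llama_state_get_size(ctx)
    return (ctypes.c_uint8 * n)()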
@@ -1663,9 +1683,30 @@ def llama_get_state_size(ctx: llama_context_p, /) -> int:
 # Copies the state to the specified destination address.
 # Destination needs to have allocated enough memory.
 # Returns the number of bytes copied
-# LLAMA_API size_t llama_copy_state_data(
+# LLAMA_API size_t llama_state_get_data(
 #         struct llama_context * ctx,
 #         uint8_t * dst);
+@ctypes_function(
+    "llama_state_get_data",
+    [
+        llama_context_p_ctypes,
+        ctypes.POINTER(ctypes.c_uint8),
+    ],
+    ctypes.c_size_t,
+)
+def llama_state_get_data(
+    ctx: llama_context_p, dst: CtypesArray[ctypes.c_uint8], /
+) -> int:
+    """Copies the state to the specified destination address.
+    Destination needs to have allocated enough memory.
+    Returns the number of bytes copied"""
+    ...
+
+
+# LLAMA_API DEPRECATED(size_t llama_copy_state_data(
+#         struct llama_context * ctx,
+#         uint8_t * dst),
+#         "use llama_state_get_data instead");
 @ctypes_function(
     "llama_copy_state_data",
     [
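Paired with `llama_state_get_size`, the getter yields a full snapshot of the rng, logits, embedding and kv_cache state. An illustrative sketch (`snapshot_state` is a hypothetical helper):

import ctypes

import llama_cpp


def snapshot_state(ctx):
    # Allocate the maximum size, then keep only the bytes actually written.
    buf = (ctypes.c_uint8 * llama_cpp.llama_state_get_size(ctx))()
    n_written = llama_cpp.llama_state_get_data(ctx, buf)
    return bytes(buf[:n_written])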
@@ -1685,9 +1726,26 @@ def llama_copy_state_data(
 
 # // Set the state reading from the specified address
 # // Returns the number of bytes read
-# LLAMA_API size_t llama_set_state_data(
+# LLAMA_API size_t llama_state_set_data(
 #         struct llama_context * ctx,
 #         const uint8_t * src);
+@ctypes_function(
+    "llama_state_set_data",
+    [llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8)],
+    ctypes.c_size_t,
+)
+def llama_state_set_data(
+    ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], /
+) -> int:
+    """Set the state reading from the specified address
+    Returns the number of bytes read"""
+    ...
+
+
+# LLAMA_API DEPRECATED(size_t llama_set_state_data(
+#         struct llama_context * ctx,
+#         const uint8_t * src),
+#         "use llama_state_set_data instead");
 @ctypes_function(
     "llama_set_state_data",
     [llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8)],
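And the matching restore path through the new setter, again as a sketch (`restore_state` is hypothetical; `blob` is a snapshot produced as above):

import ctypes

import llama_cpp


def restore_state(ctx, blob):
    # Copy the snapshot into a ctypes buffer and hand it to the setter;
    # the return value is the number of bytes read.
    buf = (ctypes.c_uint8 * len(blob)).from_buffer_copy(blob)
    return llama_cpp.llama_state_set_data(ctx, buf)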
@@ -1701,12 +1759,40 @@ def llama_set_state_data(
 
 
 # Save/load session file
-# LLAMA_API bool llama_load_session_file(
+# LLAMA_API bool llama_state_load_file(
 #         struct llama_context * ctx,
 #         const char * path_session,
 #         llama_token * tokens_out,
 #         size_t n_token_capacity,
 #         size_t * n_token_count_out);
+@ctypes_function(
+    "llama_state_load_file",
+    [
+        llama_context_p_ctypes,
+        ctypes.c_char_p,
+        llama_token_p,
+        ctypes.c_size_t,
+        ctypes.POINTER(ctypes.c_size_t),
+    ],
+    ctypes.c_bool,
+)
+def llama_state_load_file(
+    ctx: llama_context_p,
+    path_session: bytes,
+    tokens_out: CtypesArray[llama_token],
+    n_token_capacity: Union[ctypes.c_size_t, int],
+    n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
+    /,
+) -> bool: ...
+
+
+# LLAMA_API DEPRECATED(bool llama_load_session_file(
+#         struct llama_context * ctx,
+#         const char * path_session,
+#         llama_token * tokens_out,
+#         size_t n_token_capacity,
+#         size_t * n_token_count_out),
+#         "use llama_state_load_file instead");
 @ctypes_function(
     "llama_load_session_file",
     [
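A sketch of calling the new loader with a caller-chosen token capacity (`load_session` and `max_tokens` are illustrative names; `ctx` is assumed live):

import ctypes

import llama_cpp


def load_session(ctx, path, max_tokens=1024):
    tokens = (llama_cpp.llama_token * max_tokens)()
    n_out = ctypes.c_size_t(0)
    ok = llama_cpp.llama_state_load_file(
        ctx, path.encode("utf-8"), tokens, max_tokens, ctypes.byref(n_out)
    )
    # On success, n_out holds how many prompt tokens the session contained.
    return list(tokens[: n_out.value]) if ok else None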
@@ -1728,11 +1814,36 @@ def llama_load_session_file(
 ) -> int: ...
 
 
-# LLAMA_API bool llama_save_session_file(
+# LLAMA_API bool llama_state_save_file(
 #         struct llama_context * ctx,
 #         const char * path_session,
 #         const llama_token * tokens,
 #         size_t n_token_count);
+@ctypes_function(
+    "llama_state_save_file",
+    [
+        llama_context_p_ctypes,
+        ctypes.c_char_p,
+        llama_token_p,
+        ctypes.c_size_t,
+    ],
+    ctypes.c_bool,
+)
+def llama_state_save_file(
+    ctx: llama_context_p,
+    path_session: bytes,
+    tokens: CtypesArray[llama_token],
+    n_token_count: Union[ctypes.c_size_t, int],
+    /,
+) -> bool: ...
+
+
+# LLAMA_API DEPRECATED(bool llama_save_session_file(
+#         struct llama_context * ctx,
+#         const char * path_session,
+#         const llama_token * tokens,
+#         size_t n_token_count),
+#         "use llama_state_save_file instead");
 @ctypes_function(
     "llama_save_session_file",
     [
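The save-side counterpart, as a hypothetical wrapper (`save_session` is an illustrative name):

import llama_cpp


def save_session(ctx, path, tokens):
    # tokens is the prompt token list that produced the current KV cache.
    arr = (llama_cpp.llama_token * len(tokens))(*tokens)
    return llama_cpp.llama_state_save_file(ctx, path.encode("utf-8"), arr, len(tokens))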
@@ -1752,6 +1863,116 @@ def llama_save_session_file(
 ) -> int: ...
 
 
+# // Get the exact size needed to copy the KV cache of a single sequence
+# LLAMA_API size_t llama_state_seq_get_size(
+#         struct llama_context * ctx,
+#         llama_seq_id seq_id);
+@ctypes_function(
+    "llama_state_seq_get_size",
+    [llama_context_p_ctypes, llama_seq_id],
+    ctypes.c_size_t,
+)
+def llama_state_seq_get_size(ctx: llama_context_p, seq_id: llama_seq_id, /) -> int:
+    """Get the exact size needed to copy the KV cache of a single sequence"""
+    ...
+
+
+# // Copy the KV cache of a single sequence into the specified buffer
+# LLAMA_API size_t llama_state_seq_get_data(
+#         struct llama_context * ctx,
+#         uint8_t * dst,
+#         llama_seq_id seq_id);
+@ctypes_function(
+    "llama_state_seq_get_data",
+    [llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8), llama_seq_id],
+    ctypes.c_size_t,
+)
+def llama_state_seq_get_data(
+    ctx: llama_context_p, dst: CtypesArray[ctypes.c_uint8], seq_id: llama_seq_id, /
+) -> int:
+    """Copy the KV cache of a single sequence into the specified buffer"""
+    ...
+
+
+# // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
+# // Returns:
+# //  - Positive: Ok
+# //  - Zero: Failed to load
+# LLAMA_API size_t llama_state_seq_set_data(
+#         struct llama_context * ctx,
+#         const uint8_t * src,
+#         llama_seq_id dest_seq_id);
+@ctypes_function(
+    "llama_state_seq_set_data",
+    [llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8), llama_seq_id],
+    ctypes.c_size_t,
+)
+def llama_state_seq_set_data(
+    ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], dest_seq_id: llama_seq_id, /
+) -> int:
+    """Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence"""
+    ...
+
+
+# LLAMA_API size_t llama_state_seq_save_file(
+#         struct llama_context * ctx,
+#         const char * filepath,
+#         llama_seq_id seq_id,
+#         const llama_token * tokens,
+#         size_t n_token_count);
+@ctypes_function(
+    "llama_state_seq_save_file",
+    [
+        llama_context_p_ctypes,
+        ctypes.c_char_p,
+        llama_seq_id,
+        llama_token_p,
+        ctypes.c_size_t,
+    ],
+    ctypes.c_size_t,
+)
+def llama_state_seq_save_file(
+    ctx: llama_context_p,
+    filepath: bytes,
+    seq_id: llama_seq_id,
+    tokens: CtypesArray[llama_token],
+    n_token_count: Union[ctypes.c_size_t, int],
+    /,
+) -> int:
+    ...
+
+
+# LLAMA_API size_t llama_state_seq_load_file(
+#         struct llama_context * ctx,
+#         const char * filepath,
+#         llama_seq_id dest_seq_id,
+#         llama_token * tokens_out,
+#         size_t n_token_capacity,
+#         size_t * n_token_count_out);
+@ctypes_function(
+    "llama_state_seq_load_file",
+    [
+        llama_context_p_ctypes,
+        ctypes.c_char_p,
+        llama_seq_id,
+        llama_token_p,
+        ctypes.c_size_t,
+        ctypes.POINTER(ctypes.c_size_t),
+    ],
+    ctypes.c_size_t,
+)
+def llama_state_seq_load_file(
+    ctx: llama_context_p,
+    filepath: bytes,
+    dest_seq_id: llama_seq_id,
+    tokens_out: CtypesArray[llama_token],
+    n_token_capacity: Union[ctypes.c_size_t, int],
+    n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
+    /,
+) -> int:
+    ...
+
+
 # //
 # // Decoding
 # //
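The three in-memory sequence calls compose into "copy one sequence's KV cache into another context", e.g. to reuse a processed prompt. An illustrative sketch (both contexts assumed live and built from the same model; `copy_sequence` is hypothetical):

import ctypes

import llama_cpp


def copy_sequence(src_ctx, dst_ctx, seq_id=0, dest_seq_id=0):
    n = llama_cpp.llama_state_seq_get_size(src_ctx, seq_id)
    buf = (ctypes.c_uint8 * n)()
    llama_cpp.llama_state_seq_get_data(src_ctx, buf, seq_id)
    # Per the header comment, a positive return means Ok and zero means the
    # load failed.
    return llama_cpp.llama_state_seq_set_data(dst_ctx, buf, dest_seq_id)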
@@ -1930,8 +2151,9 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]:
     ...
 
 
-# // Logits for the ith token. Equivalent to:
+# // Logits for the ith token. For positive indices, Equivalent to:
 # // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+# // Negative indicies can be used to access logits in reverse order, -1 is the last logit.
 # // returns NULL for invalid ids.
 # LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 @ctypes_function(
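With negative indices now documented, the logits of the most recent token can be fetched without tracking its batch position. A sketch (`last_token_logits` is an illustrative helper; `ctx` is assumed live):

import llama_cpp


def last_token_logits(ctx, n_vocab):
    ptr = llama_cpp.llama_get_logits_ith(ctx, -1)  # -1 = last logit
    # A NULL pointer (invalid id) is falsy in ctypes.
    return ptr[:n_vocab] if ptr else None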
@@ -1963,8 +2185,9 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]
     ...
 
 
-# // Get the embeddings for the ith token. Equivalent to:
+# // Get the embeddings for the ith token. For positive indices, Equivalent to:
 # // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
+# // Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
 # // shape: [n_embd] (1-dimensional)
 # // returns NULL for invalid ids.
 # LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
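The same negative-index convention applies to embeddings; an illustrative counterpart (`last_token_embedding` is hypothetical):

import llama_cpp


def last_token_embedding(ctx, n_embd):
    ptr = llama_cpp.llama_get_embeddings_ith(ctx, -1)  # shape: [n_embd]
    return ptr[:n_embd] if ptr else None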
vendor/llama.cpp (vendored submodule)
@@ -1 +1 @@
-Subproject commit 75cd4c77292034ecec587ecb401366f57338f7c0
+Subproject commit 400d5d722d7edf7de0cf24a18c42b183c65047d2