@@ -198,13 +198,15 @@ llama_seq_id = ctypes.c_int32
 # enum llama_vocab_type {
-# LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
-# LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
-# LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
+# LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
+# LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece
+# LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding
+# LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece
 # };
-LLAMA_VOCAB_TYPE_SPM = 0
-LLAMA_VOCAB_TYPE_BPE = 1
-LLAMA_VOCAB_TYPE_WPM = 2
+LLAMA_VOCAB_TYPE_NONE = 0
+LLAMA_VOCAB_TYPE_SPM = 1
+LLAMA_VOCAB_TYPE_BPE = 2
+LLAMA_VOCAB_TYPE_WPM = 3

 # // note: these values should be synchronized with ggml_rope
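
Because LLAMA_VOCAB_TYPE_NONE now takes the value 0, every other vocab type shifts up by one, so callers that compared the result of llama_vocab_type() against hard-coded integers need the new constants. A minimal sketch of mapping the return value to a readable name, assuming the module is imported as llama_cpp and `model` is an already-loaded llama_model_p (both names are illustrative):

import llama_cpp

def describe_vocab(model) -> str:
    # llama_vocab_type() returns the enum value as a plain int
    vt = llama_cpp.llama_vocab_type(model)
    names = {
        llama_cpp.LLAMA_VOCAB_TYPE_NONE: "no vocab",
        llama_cpp.LLAMA_VOCAB_TYPE_SPM: "SentencePiece",
        llama_cpp.LLAMA_VOCAB_TYPE_BPE: "Byte Pair Encoding",
        llama_cpp.LLAMA_VOCAB_TYPE_WPM: "WordPiece",
    }
    return names.get(vt, "unknown ({})".format(vt))
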
@@ -548,8 +550,9 @@ class llama_model_params(ctypes.Structure):
 # struct llama_context_params {
 # uint32_t seed; // RNG seed, -1 for random
 # uint32_t n_ctx; // text context, 0 = from model
-# uint32_t n_batch; // prompt processing maximum batch size
-# uint32_t n_parallel; // number of parallel sequences (i.e. distinct states for recurrent models)
+# uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
+# uint32_t n_ubatch; // physical maximum batch size
+# uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
 # uint32_t n_threads; // number of threads to use for generation
 # uint32_t n_threads_batch; // number of threads to use for batch processing
@@ -590,8 +593,9 @@ class llama_context_params(ctypes.Structure):
     Attributes:
         seed (int): RNG seed, -1 for random
         n_ctx (int): text context, 0 = from model
-        n_batch (int): prompt processing maximum batch size
-        n_parallel (int): number of parallel sequences (i.e. distinct states for recurrent models)
+        n_batch (int): logical maximum batch size that can be submitted to llama_decode
+        n_ubatch (int): physical maximum batch size
+        n_seq_max (int): max number of sequences (i.e. distinct states for recurrent models)
         n_threads (int): number of threads to use for generation
         n_threads_batch (int): number of threads to use for batch processing
         rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
@@ -619,7 +623,8 @@ class llama_context_params(ctypes.Structure):
         ("seed", ctypes.c_uint32),
         ("n_ctx", ctypes.c_uint32),
         ("n_batch", ctypes.c_uint32),
-        ("n_parallel", ctypes.c_uint32),
+        ("n_ubatch", ctypes.c_uint32),
+        ("n_seq_max", ctypes.c_uint32),
         ("n_threads", ctypes.c_uint32),
         ("n_threads_batch", ctypes.c_uint32),
         ("rope_scaling_type", ctypes.c_int),
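
With n_parallel gone, the batch geometry is now described by three separate fields: n_batch (logical tokens per llama_decode call), n_ubatch (physical tokens computed at once) and n_seq_max (distinct sequences). A hedged sketch of filling the updated struct; the concrete values are placeholders:

import llama_cpp

cparams = llama_cpp.llama_context_default_params()
cparams.n_ctx = 4096        # text context, 0 = from model
cparams.n_batch = 512       # logical maximum batch size submitted to llama_decode
cparams.n_ubatch = 512      # physical maximum batch size computed in one go
cparams.n_seq_max = 4       # distinct sequences, e.g. parallel decoding slots
cparams.n_threads = 8
cparams.n_threads_batch = 8
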
@@ -667,7 +672,7 @@ It might not exist for progress report where '.' is output repeatedly."""
 # bool allow_requantize; // allow quantizing non-f32/f16 tensors
 # bool quantize_output_tensor; // quantize output.weight
 # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-# bool pure; // disable k-quant mixtures and quantize all tensors to the same type
+# bool pure; // quantize all tensors to the default type
 # void * imatrix; // pointer to importance matrix data
 # } llama_model_quantize_params;
 class llama_model_quantize_params(ctypes.Structure):
@@ -679,7 +684,7 @@ class llama_model_quantize_params(ctypes.Structure):
         allow_requantize (bool): allow quantizing non-f32/f16 tensors
         quantize_output_tensor (bool): quantize output.weight
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        pure (bool): disable k-quant mixtures and quantize all tensors to the same type
+        pure (bool): quantize all tensors to the default type
         imatrix (ctypes.ctypes.c_void_p): pointer to importance matrix data
     """
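
The pure flag's comment changed because its meaning is now simply "quantize all tensors to the default type" rather than disabling k-quant mixtures. A sketch of how these params might be driven through llama_model_quantize(); the file paths are placeholders and the byref call assumes the binding takes a pointer to the struct:

import ctypes
import llama_cpp

qparams = llama_cpp.llama_model_quantize_default_params()
qparams.quantize_output_tensor = True
qparams.pure = False  # keep the per-tensor type mixture chosen by the quantizer

rc = llama_cpp.llama_model_quantize(
    b"models/7B/ggml-model-f16.gguf",     # placeholder input path
    b"models/7B/ggml-model-q4_k_m.gguf",  # placeholder output path
    ctypes.byref(qparams),
)
if rc != 0:
    raise RuntimeError("quantization failed with code {}".format(rc))
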
@@ -860,8 +865,7 @@ GGML_NUMA_STRATEGY_COUNT = 5
     [ctypes.c_int],
     None,
 )
-def llama_numa_init(numa: int, /):
-    ...
+def llama_numa_init(numa: int, /): ...


 # // Call once at the end of the program - currently only used for MPI
@@ -886,8 +890,7 @@ def llama_backend_free():
 )
 def llama_load_model_from_file(
     path_model: bytes, params: llama_model_params, /
-) -> Optional[llama_model_p]:
-    ...
+) -> Optional[llama_model_p]: ...


 # LLAMA_API void llama_free_model(struct llama_model * model);
@@ -896,8 +899,7 @@ def llama_load_model_from_file(
     [llama_model_p_ctypes],
     None,
 )
-def llama_free_model(model: llama_model_p, /):
-    ...
+def llama_free_model(model: llama_model_p, /): ...


 # LLAMA_API struct llama_context * llama_new_context_with_model(
@@ -910,8 +912,7 @@ def llama_free_model(model: llama_model_p, /):
 )
 def llama_new_context_with_model(
     model: llama_model_p, params: llama_context_params, /
-) -> Optional[llama_context_p]:
-    ...
+) -> Optional[llama_context_p]: ...


 # // Frees all allocated memory
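
These stubs only carry the Python signatures; the decorated names resolve to the shared-library symbols at import time. A sketch of the usual lifecycle through this low-level API, assuming this version's no-argument llama_backend_init and a placeholder model path:

import llama_cpp

llama_cpp.llama_backend_init()

mparams = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"models/7B/model.gguf", mparams)
if model is None:
    raise RuntimeError("failed to load model")

cparams = llama_cpp.llama_context_default_params()
ctx = llama_cpp.llama_new_context_with_model(model, cparams)
if ctx is None:
    raise RuntimeError("failed to create context")

# ... tokenize / decode / sample ...

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
llama_cpp.llama_backend_free()
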
@@ -932,80 +933,77 @@ def llama_free(ctx: llama_context_p, /):
     [],
     ctypes.c_int64,
 )
-def llama_time_us() -> int:
-    ...
+def llama_time_us() -> int: ...


 # LLAMA_API size_t llama_max_devices(void);
 @ctypes_function("llama_max_devices", [], ctypes.c_size_t)
-def llama_max_devices() -> int:
-    ...
+def llama_max_devices() -> int: ...


 # LLAMA_API bool llama_supports_mmap (void);
 @ctypes_function("llama_supports_mmap", [], ctypes.c_bool)
-def llama_supports_mmap() -> bool:
-    ...
+def llama_supports_mmap() -> bool: ...


 # LLAMA_API bool llama_supports_mlock (void);
 @ctypes_function("llama_supports_mlock", [], ctypes.c_bool)
-def llama_supports_mlock() -> bool:
-    ...
+def llama_supports_mlock() -> bool: ...


 # LLAMA_API bool llama_supports_gpu_offload(void);
 @ctypes_function("llama_supports_gpu_offload", [], ctypes.c_bool)
-def llama_supports_gpu_offload() -> bool:
-    ...
+def llama_supports_gpu_offload() -> bool: ...


 # LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 @ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes)
-def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]:
-    ...
+def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ...


 # LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
 @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32)
-def llama_n_ctx(ctx: llama_context_p, /) -> int:
-    ...
+def llama_n_ctx(ctx: llama_context_p, /) -> int: ...


 # LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
 @ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32)
-def llama_n_batch(ctx: llama_context_p, /) -> int:
-    ...
+def llama_n_batch(ctx: llama_context_p, /) -> int: ...
+
+
+# LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
+@ctypes_function("llama_n_ubatch", [llama_context_p_ctypes], ctypes.c_uint32)
+def llama_n_ubatch(ctx: llama_context_p, /) -> int: ...
+
+
+# LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
+@ctypes_function("llama_n_seq_max", [llama_context_p_ctypes], ctypes.c_uint32)
+def llama_n_seq_max(ctx: llama_context_p, /) -> int: ...


 # LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
 @ctypes_function("llama_vocab_type", [llama_model_p_ctypes], ctypes.c_int)
-def llama_vocab_type(model: llama_model_p, /) -> int:
-    ...
+def llama_vocab_type(model: llama_model_p, /) -> int: ...


 # LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
 @ctypes_function("llama_rope_type", [llama_model_p_ctypes], ctypes.c_int)
-def llama_rope_type(model: llama_model_p, /) -> int:
-    ...
+def llama_rope_type(model: llama_model_p, /) -> int: ...


 # LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
 @ctypes_function("llama_n_vocab", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_n_vocab(model: llama_model_p, /) -> int:
-    ...
+def llama_n_vocab(model: llama_model_p, /) -> int: ...


 # LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
 @ctypes_function("llama_n_ctx_train", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_n_ctx_train(model: llama_model_p, /) -> int:
-    ...
+def llama_n_ctx_train(model: llama_model_p, /) -> int: ...


 # LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
 @ctypes_function("llama_n_embd", [llama_model_p_ctypes], ctypes.c_int32)
-def llama_n_embd(model: llama_model_p, /) -> int:
-    ...
+def llama_n_embd(model: llama_model_p, /) -> int: ...


 # // Get the model's RoPE frequency scaling factor
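
llama_n_ubatch and llama_n_seq_max slot in next to the existing getters, so the limits chosen in llama_context_params can be read back from a live context. A small sketch, assuming ctx and model come from the lifecycle shown earlier:

import llama_cpp

def dump_limits(ctx, model):
    # every getter here is a thin ctypes wrapper returning a plain Python int
    print("n_ctx       =", llama_cpp.llama_n_ctx(ctx))
    print("n_batch     =", llama_cpp.llama_n_batch(ctx))
    print("n_ubatch    =", llama_cpp.llama_n_ubatch(ctx))   # new in this revision
    print("n_seq_max   =", llama_cpp.llama_n_seq_max(ctx))  # new in this revision
    print("n_vocab     =", llama_cpp.llama_n_vocab(model))
    print("n_ctx_train =", llama_cpp.llama_n_ctx_train(model))
    print("n_embd      =", llama_cpp.llama_n_embd(model))
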
@@ -1192,8 +1190,7 @@ def llama_model_apply_lora_from_file(
     path_base_model: Union[ctypes.c_char_p, bytes, None],
     n_threads: Union[ctypes.c_int32, int],
     /,
-) -> int:
-    ...
+) -> int: ...


 # //
@@ -1219,7 +1216,7 @@ class llama_kv_cache_view_cell(ctypes.Structure):
 # // Maximum number of sequences that can exist in a cell. It's not an error
 # // if there are more sequences in a cell than this value, however they will
 # // not be visible in the view cells_sequences.
-# int32_t n_max_seq;
+# int32_t n_seq_max;

 # // Number of tokens in the cache. For example, if there are two populated
 # // cells, the first with 1 sequence id in it and the second with 2 sequence
@@ -1240,7 +1237,7 @@ class llama_kv_cache_view_cell(ctypes.Structure):
 # struct llama_kv_cache_view_cell * cells;

-# // The sequences for each cell. There will be n_max_seq items per cell.
+# // The sequences for each cell. There will be n_seq_max items per cell.
 # llama_seq_id * cells_sequences;
 # };
 class llama_kv_cache_view(ctypes.Structure):
@@ -1260,14 +1257,14 @@ llama_kv_cache_view_p = ctypes.POINTER(llama_kv_cache_view)
 # // Create an empty KV cache view. (use only for debugging purposes)
-# LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+# LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
 @ctypes_function(
     "llama_kv_cache_view_init",
     [llama_context_p_ctypes, ctypes.c_int32],
     llama_kv_cache_view,
 )
 def llama_kv_cache_view_init(
-    ctx: llama_context_p, n_max_seq: Union[ctypes.c_int32, int], /
+    ctx: llama_context_p, n_seq_max: Union[ctypes.c_int32, int], /
 ) -> llama_kv_cache_view:
     """Create an empty KV cache view. (use only for debugging purposes)"""
     ...
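
Only the parameter name changed here (n_max_seq became n_seq_max); the call shape is the same. A debugging-only sketch, assuming ctx is a live llama_context_p and that the companion llama_kv_cache_view_update and llama_kv_cache_view_free wrappers are bound elsewhere in this file:

import ctypes
import llama_cpp

view = llama_cpp.llama_kv_cache_view_init(ctx, 4)  # track up to 4 sequence ids per cell
llama_cpp.llama_kv_cache_view_update(ctx, ctypes.byref(view))
print("KV cells used:", view.used_cells, "of", view.n_cells)
llama_cpp.llama_kv_cache_view_free(ctypes.byref(view))
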
@@ -1582,8 +1579,7 @@ def llama_load_session_file(
     n_token_capacity: Union[ctypes.c_size_t, int],
     n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t],
     /,
-) -> int:
-    ...
+) -> int: ...


 # LLAMA_API bool llama_save_session_file(
@@ -1607,8 +1603,7 @@ def llama_save_session_file(
     tokens: CtypesArray[llama_token],
     n_token_count: Union[ctypes.c_size_t, int],
     /,
-) -> int:
-    ...
+) -> int: ...


 # //
@@ -1728,6 +1723,17 @@ def llama_set_n_threads(
     """
     ...

+# // Set whether to use causal attention or not
+# // If set to true, the model will only attend to the past tokens
+# LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
+@ctypes_function("llama_set_causal_attn", [llama_context_p_ctypes, ctypes.c_bool], None)
+def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /):
+    """Set whether to use causal attention or not
+    If set to true, the model will only attend to the past tokens"""
+    ...
+
+
 # // Set abort callback
 # LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
 @ctypes_function(
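
The new wrapper just forwards a bool to the library. One way it might be used is to switch a context into non-causal attention for an embedding-style pass and back again; ctx is assumed to be a live llama_context_p:

import llama_cpp

llama_cpp.llama_set_causal_attn(ctx, False)  # attend to all tokens, e.g. for an embedding pass
# ... run llama_decode on the batch to embed ...
llama_cpp.llama_set_causal_attn(ctx, True)   # restore normal autoregressive decoding
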
@@ -1745,6 +1751,18 @@ def llama_set_abort_callback(
     ...

+# // Wait until all computations are finished
+# // This is automatically done when using one of the functions below to obtain the computation results
+# // and is not necessary to call it explicitly in most cases
+# LLAMA_API void llama_synchronize(struct llama_context * ctx);
+@ctypes_function("llama_synchronize", [llama_context_p_ctypes], None)
+def llama_synchronize(ctx: llama_context_p, /):
+    """Wait until all computations are finished
+    This is automatically done when using one of the functions below to obtain the computation results
+    and is not necessary to call it explicitly in most cases"""
+    ...
+
+
 # // Token logits obtained from the last call to llama_decode()
 # // The logits for the last token are stored in the last row
 # // Logits for which llama_batch.logits[i] == 0 are undefined
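
Because llama_decode may return before the backend has finished, llama_synchronize gives callers an explicit barrier; the result getters below already wait implicitly. A sketch that uses it to time the computation itself, assuming ctx is a live context and batch is an already-built llama_batch:

import time
import llama_cpp

t0 = time.perf_counter()
rc = llama_cpp.llama_decode(ctx, batch)  # may return before the backend is done
llama_cpp.llama_synchronize(ctx)         # block until the results are actually ready
print("decode rc={}, wall time {:.3f}s".format(rc, time.perf_counter() - t0))
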
@@ -1760,7 +1778,7 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]:
     Logits for which llama_batch.logits[i] == 0 are undefined
     Rows: n_tokens provided with llama_batch
     Cols: n_vocab

     Returns:
         Pointer to the logits buffer of shape (n_tokens, n_vocab)"""
     ...
@@ -1828,6 +1846,7 @@ def llama_get_embeddings_seq(
         shape: [n_embd] (1-dimensional)"""
     ...


 # //
 # // Vocab
 # //
@@ -1839,8 +1858,7 @@ def llama_get_embeddings_seq(
 )
 def llama_token_get_text(
     model: llama_model_p, token: Union[llama_token, int], /
-) -> bytes:
-    ...
+) -> bytes: ...


 # LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
@@ -1849,8 +1867,7 @@ def llama_token_get_score(
 )
 def llama_token_get_score(
     model: llama_model_p, token: Union[llama_token, int], /
-) -> float:
-    ...
+) -> float: ...


 # LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
@@ -1859,8 +1876,7 @@ def llama_token_get_type(
 )
 def llama_token_get_type(
     model: llama_model_p, token: Union[llama_token, int], /
-) -> int:
-    ...
+) -> int: ...


 # // Special tokens
@@ -1913,20 +1929,17 @@ def llama_token_prefix(model: llama_model_p) -> int:
 # LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
 @ctypes_function("llama_token_middle", [llama_model_p_ctypes], llama_token)
-def llama_token_middle(model: llama_model_p, /) -> int:
-    ...
+def llama_token_middle(model: llama_model_p, /) -> int: ...


 # LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
 @ctypes_function("llama_token_suffix", [llama_model_p_ctypes], llama_token)
-def llama_token_suffix(model: llama_model_p, /) -> int:
-    ...
+def llama_token_suffix(model: llama_model_p, /) -> int: ...


 # LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
 @ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token)
-def llama_token_eot(model: llama_model_p, /) -> int:
-    ...
+def llama_token_eot(model: llama_model_p, /) -> int: ...


 # //
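
These special tokens support fill-in-the-middle prompting: prefix, suffix and middle markers plus an end-of-infill token. A sketch of assembling such a prompt, where prefix_tokens and suffix_tokens are assumed to be lists of token ids produced by llama_tokenize and model is a loaded llama_model_p:

import llama_cpp

fim_prompt = (
    [llama_cpp.llama_token_prefix(model)]
    + prefix_tokens                          # tokens before the hole
    + [llama_cpp.llama_token_suffix(model)]
    + suffix_tokens                          # tokens after the hole
    + [llama_cpp.llama_token_middle(model)]  # model fills the middle, ending with llama_token_eot
)
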
@@ -1936,7 +1949,7 @@ def llama_token_eot(model: llama_model_p, /) -> int:
 # /// @details Convert the provided text into tokens.
 # /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
-# /// @return Returns the number of tokens on success, no more than n_max_tokens
+# /// @return Returns the number of tokens on success, no more than n_tokens_max
 # /// @return Returns a negative number on failure - the number of tokens that would have been returned
 # /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
 # /// Does not insert a leading space.
@@ -1945,7 +1958,7 @@ def llama_token_eot(model: llama_model_p, /) -> int:
 # const char * text,
 # int32_t text_len,
 # llama_token * tokens,
-# int32_t n_max_tokens,
+# int32_t n_tokens_max,
 # bool add_bos,
 # bool special);
 @ctypes_function(
@@ -1966,12 +1979,26 @@ def llama_tokenize(
     text: bytes,
     text_len: Union[ctypes.c_int, int],
     tokens: CtypesArray[llama_token],
-    n_max_tokens: Union[ctypes.c_int, int],
+    n_tokens_max: Union[ctypes.c_int, int],
     add_bos: Union[ctypes.c_bool, bool],
     special: Union[ctypes.c_bool, bool],
     /,
 ) -> int:
-    """Convert the provided text into tokens."""
+    """Convert the provided text into tokens.
+
+    Args:
+        model: The model to use for tokenization.
+        text: The text to tokenize.
+        text_len: The length of the text.
+        tokens: The tokens pointer must be large enough to hold the resulting tokens.
+        n_max_tokens: The maximum number of tokens to return.
+        add_bos: Whether to add a beginning-of-sentence token.
+        special: Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
+            Does not insert a leading space.
+
+    Returns:
+        Returns the number of tokens on success, no more than n_tokens_max
+        Returns a negative number on failure - the number of tokens that would have been returned"""
     ...
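
Per the updated docstring, a negative return value is minus the number of tokens the call would have produced, so a wrapper can size a buffer, retry once, and slice the result. A sketch built only on the binding shown here; the helper name and initial buffer size are arbitrary, and model is assumed to be a loaded llama_model_p:

import llama_cpp

def tokenize(model, text: bytes, add_bos: bool = True, special: bool = False):
    n_tokens_max = 256  # arbitrary first guess for the buffer size
    buf = (llama_cpp.llama_token * n_tokens_max)()
    n = llama_cpp.llama_tokenize(model, text, len(text), buf, n_tokens_max, add_bos, special)
    if n < 0:
        # buffer too small: -n is the size that would have been needed
        buf = (llama_cpp.llama_token * -n)()
        n = llama_cpp.llama_tokenize(model, text, len(text), buf, -n, add_bos, special)
    return list(buf[:n])
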
@@ -2043,8 +2070,7 @@ def llama_chat_apply_template(
     chat: CtypesArray[llama_chat_message],
     n_msg: int,
     /,
-) -> int:
-    ...
+) -> int: ...


 # //
@@ -2645,8 +2671,7 @@ def llama_beam_search(
     n_past: Union[ctypes.c_int, int],
     n_predict: Union[ctypes.c_int, int],
     /,
-):
-    ...
+): ...


 # Performance information
@@ -2723,5 +2748,4 @@ def llama_log_set(
     [ctypes.c_void_p, llama_context_p_ctypes],
     None,
 )
-def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /):
-    ...
+def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /): ...