feat: Update llama.cpp
parent c89be28ef9
commit e325a831f0

2 changed files with 57 additions and 9 deletions

llama_cpp/llama_cpp.py
@@ -668,13 +668,15 @@ It might not exist for progress report where '.' is output repeatedly."""

 # // model quantization parameters
 # typedef struct llama_model_quantize_params {
-#     int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-#     enum llama_ftype ftype;              // quantize to this llama_ftype
-#     bool allow_requantize;               // allow quantizing non-f32/f16 tensors
-#     bool quantize_output_tensor;         // quantize output.weight
-#     bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-#     bool pure;                           // quantize all tensors to the default type
-#     void * imatrix;                      // pointer to importance matrix data
+#     int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+#     enum llama_ftype ftype;              // quantize to this llama_ftype
+#     enum ggml_type output_tensor_type;   // output tensor type
+#     enum ggml_type token_embedding_type; // token embeddings tensor type
+#     bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+#     bool quantize_output_tensor;         // quantize output.weight
+#     bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+#     bool pure;                           // quantize all tensors to the default type
+#     void * imatrix;                      // pointer to importance matrix data
 # } llama_model_quantize_params;
 class llama_model_quantize_params(ctypes.Structure):
     """Parameters for llama_model_quantize
@@ -682,16 +684,20 @@ class llama_model_quantize_params(ctypes.Structure):
     Attributes:
         nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         ftype (int): quantize to this llama_ftype
+        output_tensor_type (int): output tensor type
+        token_embedding_type (int): token embeddings tensor type
         allow_requantize (bool): allow quantizing non-f32/f16 tensors
         quantize_output_tensor (bool): quantize output.weight
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         pure (bool): quantize all tensors to the default type
-        imatrix (ctypes.ctypes.c_void_p): pointer to importance matrix data
+        imatrix (ctypes.c_void_p): pointer to importance matrix data
     """

     _fields_ = [
         ("nthread", ctypes.c_int32),
         ("ftype", ctypes.c_int),
+        ("output_tensor_type", ctypes.c_int),
+        ("token_embedding_type", ctypes.c_int),
         ("allow_requantize", ctypes.c_bool),
         ("quantize_output_tensor", ctypes.c_bool),
         ("only_copy", ctypes.c_bool),
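As context for this change, a minimal usage sketch (not part of the diff) showing how the two new fields might be set through these bindings. It assumes llama_model_quantize_default_params() and llama_model_quantize() from the same module, that the LLAMA_FTYPE_* and GGML_TYPE_* constants are exported by it, and hypothetical model paths:

import ctypes

import llama_cpp

# Start from the library defaults, then override selected fields.
params = llama_cpp.llama_model_quantize_default_params()
params.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M  # base quantization type
# The two fields added by this commit: per-tensor ggml_type overrides for the
# output tensor and the token-embeddings tensor (constants assumed exported).
params.output_tensor_type = llama_cpp.GGML_TYPE_Q8_0
params.token_embedding_type = llama_cpp.GGML_TYPE_Q8_0

ret = llama_cpp.llama_model_quantize(
    b"/models/ggml-model-f16.gguf",     # hypothetical input path
    b"/models/ggml-model-q4_k_m.gguf",  # hypothetical output path
    ctypes.byref(params),
)
assert ret == 0, "llama_model_quantize failed"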
@@ -2743,6 +2749,48 @@ def llama_beam_search(
 ): ...


+# /// @details Build a split GGUF final path for this chunk.
+# ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+# //          Returns the split_path length.
+# LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+@ctypes_function(
+    "llama_split_path",
+    [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
+    ctypes.c_int,
+)
+def llama_split_path(
+    split_path: bytes,
+    maxlen: Union[ctypes.c_size_t, int],
+    path_prefix: bytes,
+    split_no: Union[ctypes.c_int, int],
+    split_count: Union[ctypes.c_int, int],
+    /,
+) -> int:
+    """Build a split GGUF final path for this chunk."""
+    ...
+
+
+# /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+# ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+# //          Returns the split_prefix length.
+# LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+@ctypes_function(
+    "llama_split_prefix",
+    [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
+    ctypes.c_int,
+)
+def llama_split_prefix(
+    split_prefix: bytes,
+    maxlen: Union[ctypes.c_size_t, int],
+    split_path: bytes,
+    split_no: Union[ctypes.c_int, int],
+    split_count: Union[ctypes.c_int, int],
+    /,
+) -> int:
+    """Extract the path prefix from the split_path if and only if the split_no and split_count match."""
+    ...
+
+
 # Performance information
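A short usage sketch for the two new bindings (again, not part of the diff), mirroring the examples quoted from the C header. The output argument is a mutable character buffer, so ctypes.create_string_buffer() is used rather than a plain bytes object; the buffer size is arbitrary:

import ctypes

import llama_cpp

buf = ctypes.create_string_buffer(512)  # must be large enough for the result

# Build the final path of chunk 2 of 4 for a path prefix.
n = llama_cpp.llama_split_path(
    buf, ctypes.sizeof(buf), b"/models/ggml-model-q4_0", 2, 4
)
print(n, buf.value)  # expected: 43 b'/models/ggml-model-q4_0-00002-of-00004.gguf'

# Recover the prefix; succeeds only if split_no and split_count match the name.
n = llama_cpp.llama_split_prefix(
    buf, ctypes.sizeof(buf), b"/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4
)
print(n, buf.value)  # expected: 23 b'/models/ggml-model-q4_0'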
2 vendor/llama.cpp vendored
@@ -1 +1 @@
-Subproject commit 42e21c68826f2e56b9592dccd9f3c43895b6890d
+Subproject commit 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652