feat: Update llama.cpp
commit e325a831f0 (parent c89be28ef9)
2 changed files with 57 additions and 9 deletions
@@ -670,6 +670,8 @@ It might not exist for progress report where '.' is output repeatedly."""
 # typedef struct llama_model_quantize_params {
 #     int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
 #     enum llama_ftype ftype;              // quantize to this llama_ftype
+#     enum ggml_type output_tensor_type;   // output tensor type
+#     enum ggml_type token_embedding_type; // token embeddings tensor type
 #     bool allow_requantize;               // allow quantizing non-f32/f16 tensors
 #     bool quantize_output_tensor;         // quantize output.weight
 #     bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
@@ -682,16 +684,20 @@ class llama_model_quantize_params(ctypes.Structure):
     Attributes:
         nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         ftype (int): quantize to this llama_ftype
+        output_tensor_type (int): output tensor type
+        token_embedding_type (int): token embeddings tensor type
         allow_requantize (bool): allow quantizing non-f32/f16 tensors
         quantize_output_tensor (bool): quantize output.weight
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         pure (bool): quantize all tensors to the default type
-        imatrix (ctypes.ctypes.c_void_p): pointer to importance matrix data
+        imatrix (ctypes.c_void_p): pointer to importance matrix data
     """
 
     _fields_ = [
         ("nthread", ctypes.c_int32),
         ("ftype", ctypes.c_int),
+        ("output_tensor_type", ctypes.c_int),
+        ("token_embedding_type", ctypes.c_int),
         ("allow_requantize", ctypes.c_bool),
         ("quantize_output_tensor", ctypes.c_bool),
         ("only_copy", ctypes.c_bool),
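(For context, a minimal usage sketch of the two new fields; this is not part of the diff. It assumes the llama_model_quantize_default_params() and llama_model_quantize() bindings defined elsewhere in this module, and uses the raw ggml enum value 1 for GGML_TYPE_F16.)

import ctypes
import llama_cpp

params = llama_cpp.llama_model_quantize_default_params()
params.nthread = 0                # <=0 -> std::thread::hardware_concurrency()
params.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M
params.output_tensor_type = 1     # GGML_TYPE_F16; new field from this update
params.token_embedding_type = 1   # GGML_TYPE_F16; new field from this update
llama_cpp.llama_model_quantize(b"model-f16.gguf", b"model-q4_k_m.gguf", ctypes.byref(params))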
@@ -2743,6 +2749,48 @@ def llama_beam_search(
 ): ...
 
 
+# /// @details Build a split GGUF final path for this chunk.
+# ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
+# //  Returns the split_path length.
+# LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
+@ctypes_function(
+    "llama_split_path",
+    [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
+    ctypes.c_int,
+)
+def llama_split_path(
+    split_path: bytes,
+    maxlen: Union[ctypes.c_size_t, int],
+    path_prefix: bytes,
+    split_no: Union[ctypes.c_int, int],
+    split_count: Union[ctypes.c_int, int],
+    /,
+) -> int:
+    """Build a split GGUF final path for this chunk."""
+    ...
+
+
+# /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
+# ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
+# //  Returns the split_prefix length.
+# LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
+@ctypes_function(
+    "llama_split_prefix",
+    [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_char_p, ctypes.c_int, ctypes.c_int],
+    ctypes.c_int,
+)
+def llama_split_prefix(
+    split_prefix: bytes,
+    maxlen: Union[ctypes.c_size_t, int],
+    split_path: bytes,
+    split_no: Union[ctypes.c_int, int],
+    split_count: Union[ctypes.c_int, int],
+    /,
+) -> int:
+    """Extract the path prefix from the split_path if and only if the split_no and split_count match."""
+    ...
+
+
 # Performance information
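(For reference, a small round-trip sketch of the two new bindings; not part of the diff. Buffer sizes and the model path are illustrative.)

import ctypes
import llama_cpp

split_path = ctypes.create_string_buffer(256)
n = llama_cpp.llama_split_path(split_path, ctypes.sizeof(split_path), b"/models/ggml-model-q4_0", 2, 4)
print(n, split_path.value)  # 43 b'/models/ggml-model-q4_0-00002-of-00004.gguf'

split_prefix = ctypes.create_string_buffer(64)
n = llama_cpp.llama_split_prefix(split_prefix, ctypes.sizeof(split_prefix), split_path.value, 2, 4)
print(n, split_prefix.value)  # 23 b'/models/ggml-model-q4_0'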
|
vendor/llama.cpp (vendored, 2 changes)
@@ -1 +1 @@
-Subproject commit 42e21c68826f2e56b9592dccd9f3c43895b6890d
+Subproject commit 50ccaf5eacb50a2ca378a4ef0dc7aeb45fead652