Convert constants to python types and allow python types in low-level api

Andrei Betlen 2023-07-24 15:42:07 -04:00
parent 343480364f
commit 1b6997d69f


@@ -87,29 +87,29 @@ c_size_t_p = POINTER(c_size_t)
 # llama.h bindings
 GGML_USE_CUBLAS = hasattr(_lib, "ggml_init_cublas")
-GGML_CUDA_MAX_DEVICES = ctypes.c_int(16)
-LLAMA_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else ctypes.c_int(1)
+GGML_CUDA_MAX_DEVICES = 16
+LLAMA_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else 1
 # #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
-LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74)
+LLAMA_FILE_MAGIC_GGJT = 0x67676A74
 # #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
-LLAMA_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61)
+LLAMA_FILE_MAGIC_GGLA = 0x67676C61
 # #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
-LLAMA_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66)
+LLAMA_FILE_MAGIC_GGMF = 0x67676D66
 # #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
-LLAMA_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C)
+LLAMA_FILE_MAGIC_GGML = 0x67676D6C
 # #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
-LLAMA_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E)
+LLAMA_FILE_MAGIC_GGSN = 0x6767736E
 # #define LLAMA_FILE_VERSION 3
-LLAMA_FILE_VERSION = c_int(3)
+LLAMA_FILE_VERSION = 3
 LLAMA_FILE_MAGIC = LLAMA_FILE_MAGIC_GGJT
 LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-LLAMA_SESSION_VERSION = c_int(1)
+LLAMA_SESSION_VERSION = 1
 # #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
-LLAMA_DEFAULT_SEED = c_int(0xFFFFFFFF)
+LLAMA_DEFAULT_SEED = 0xFFFFFFFF
 # struct llama_model;
 llama_model_p = c_void_p
@@ -235,23 +235,23 @@ llama_context_params_p = POINTER(llama_context_params)
 # LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
 # LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
 # };
-LLAMA_FTYPE_ALL_F32 = c_int(0)
-LLAMA_FTYPE_MOSTLY_F16 = c_int(1)
-LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2)
-LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3)
-LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4)
-LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7)
-LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8)
-LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9)
-LLAMA_FTYPE_MOSTLY_Q2_K = c_int(10)
-LLAMA_FTYPE_MOSTLY_Q3_K_S = c_int(11)
-LLAMA_FTYPE_MOSTLY_Q3_K_M = c_int(12)
-LLAMA_FTYPE_MOSTLY_Q3_K_L = c_int(13)
-LLAMA_FTYPE_MOSTLY_Q4_K_S = c_int(14)
-LLAMA_FTYPE_MOSTLY_Q4_K_M = c_int(15)
-LLAMA_FTYPE_MOSTLY_Q5_K_S = c_int(16)
-LLAMA_FTYPE_MOSTLY_Q5_K_M = c_int(17)
-LLAMA_FTYPE_MOSTLY_Q6_K = c_int(18)
+LLAMA_FTYPE_ALL_F32 = 0
+LLAMA_FTYPE_MOSTLY_F16 = 1
+LLAMA_FTYPE_MOSTLY_Q4_0 = 2
+LLAMA_FTYPE_MOSTLY_Q4_1 = 3
+LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4
+LLAMA_FTYPE_MOSTLY_Q8_0 = 7
+LLAMA_FTYPE_MOSTLY_Q5_0 = 8
+LLAMA_FTYPE_MOSTLY_Q5_1 = 9
+LLAMA_FTYPE_MOSTLY_Q2_K = 10
+LLAMA_FTYPE_MOSTLY_Q3_K_S = 11
+LLAMA_FTYPE_MOSTLY_Q3_K_M = 12
+LLAMA_FTYPE_MOSTLY_Q3_K_L = 13
+LLAMA_FTYPE_MOSTLY_Q4_K_S = 14
+LLAMA_FTYPE_MOSTLY_Q4_K_M = 15
+LLAMA_FTYPE_MOSTLY_Q5_K_S = 16
+LLAMA_FTYPE_MOSTLY_Q5_K_M = 17
+LLAMA_FTYPE_MOSTLY_Q6_K = 18
 # // model quantization parameters
@@ -299,13 +299,13 @@ llama_grammar_p = c_void_p
 # // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
 # LLAMA_GRETYPE_CHAR_ALT = 6,
 # };
-LLAMA_GRETYPE_END = c_int(0)
-LLAMA_GRETYPE_ALT = c_int(1)
-LLAMA_GRETYPE_RULE_REF = c_int(2)
-LLAMA_GRETYPE_CHAR = c_int(3)
-LLAMA_GRETYPE_CHAR_NOT = c_int(4)
-LLAMA_GRETYPE_CHAR_RNG_UPPER = c_int(5)
-LLAMA_GRETYPE_CHAR_ALT = c_int(6)
+LLAMA_GRETYPE_END = 0
+LLAMA_GRETYPE_ALT = 1
+LLAMA_GRETYPE_RULE_REF = 2
+LLAMA_GRETYPE_CHAR = 3
+LLAMA_GRETYPE_CHAR_NOT = 4
+LLAMA_GRETYPE_CHAR_RNG_UPPER = 5
+LLAMA_GRETYPE_CHAR_ALT = 6
 # typedef struct llama_grammar_element {
@@ -399,7 +399,7 @@ _lib.llama_mlock_supported.restype = c_bool
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
 # LLAMA_API void llama_backend_init(bool numa);
-def llama_backend_init(numa: c_bool):
+def llama_backend_init(numa: Union[c_bool, bool]):
     return _lib.llama_backend_init(numa)
@@ -521,9 +521,9 @@ _lib.llama_model_quantize.restype = c_int
 # int n_threads);
 def llama_apply_lora_from_file(
     ctx: llama_context_p,
-    path_lora: c_char_p,
-    path_base_model: c_char_p,
-    n_threads: c_int,
+    path_lora: Union[c_char_p, bytes],
+    path_base_model: Union[c_char_p, bytes],
+    n_threads: Union[c_int, int],
 ) -> int:
     return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads)
@@ -541,7 +541,7 @@ def llama_model_apply_lora_from_file(
     model: llama_model_p,
     path_lora: Union[c_char_p, bytes],
     path_base_model: Union[c_char_p, bytes],
-    n_threads: c_int,
+    n_threads: Union[c_int, int],
 ) -> int:
     return _lib.llama_model_apply_lora_from_file(
         model, path_lora, path_base_model, n_threads
@@ -621,7 +621,7 @@ def llama_load_session_file(
     ctx: llama_context_p,
     path_session: bytes,
     tokens_out,  # type: Array[llama_token]
-    n_token_capacity: c_size_t,
+    n_token_capacity: Union[c_size_t, int],
     n_token_count_out,  # type: _Pointer[c_size_t]
 ) -> int:
     return _lib.llama_load_session_file(
@@ -644,7 +644,7 @@ def llama_save_session_file(
     ctx: llama_context_p,
     path_session: bytes,
     tokens,  # type: Array[llama_token]
-    n_token_count: c_size_t,
+    n_token_count: Union[c_size_t, int],
 ) -> int:
     return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count)
@@ -671,9 +671,9 @@ _lib.llama_save_session_file.restype = c_size_t
 def llama_eval(
     ctx: llama_context_p,
     tokens,  # type: Array[llama_token]
-    n_tokens: c_int,
-    n_past: c_int,
-    n_threads: c_int,
+    n_tokens: Union[c_int, int],
+    n_past: Union[c_int, int],
+    n_threads: Union[c_int, int],
 ) -> int:
     return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads)
@@ -692,9 +692,9 @@ _lib.llama_eval.restype = c_int
 def llama_eval_embd(
     ctx: llama_context_p,
     embd,  # type: Array[c_float]
-    n_tokens: c_int,
-    n_past: c_int,
-    n_threads: c_int,
+    n_tokens: Union[c_int, int],
+    n_past: Union[c_int, int],
+    n_threads: Union[c_int, int],
 ) -> int:
     return _lib.llama_eval_embd(ctx, embd, n_tokens, n_past, n_threads)
@@ -718,8 +718,8 @@ def llama_tokenize(
     ctx: llama_context_p,
     text: bytes,
     tokens,  # type: Array[llama_token]
-    n_max_tokens: c_int,
-    add_bos: c_bool,
+    n_max_tokens: Union[c_int, int],
+    add_bos: Union[c_bool, bool],
 ) -> int:
     return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos)
@@ -738,8 +738,8 @@ def llama_tokenize_with_model(
     model: llama_model_p,
     text: bytes,
     tokens,  # type: Array[llama_token]
-    n_max_tokens: c_int,
-    add_bos: c_bool,
+    n_max_tokens: Union[c_int, int],
+    add_bos: Union[c_bool, bool],
 ) -> int:
     return _lib.llama_tokenize_with_model(model, text, tokens, n_max_tokens, add_bos)
@@ -809,7 +809,7 @@ def llama_get_vocab(
     ctx: llama_context_p,
     strings,  # type: Array[c_char_p] # type: ignore
    scores,  # type: Array[c_float] # type: ignore
-    capacity: c_int,
+    capacity: Union[c_int, int],
 ) -> int:
     return _lib.llama_get_vocab(ctx, strings, scores, capacity)
@@ -832,7 +832,7 @@ def llama_get_vocab_from_model(
     model: llama_model_p,
     strings,  # type: Array[c_char_p] # type: ignore
     scores,  # type: Array[c_float] # type: ignore
-    capacity: c_int,
+    capacity: Union[c_int, int],
 ) -> int:
     return _lib.llama_get_vocab_from_model(model, strings, scores, capacity)
@@ -935,8 +935,8 @@ _lib.llama_token_nl.restype = llama_token
 # size_t start_rule_index);
 def llama_grammar_init(
     rules,  # type: Array[llama_grammar_element_p] # type: ignore
-    n_rules: c_size_t,
-    start_rule_index: c_size_t,
+    n_rules: Union[c_size_t, int],
+    start_rule_index: Union[c_size_t, int],
 ) -> llama_grammar_p:
     return _lib.llama_grammar_init(rules, n_rules, start_rule_index)
@@ -967,8 +967,8 @@ def llama_sample_repetition_penalty(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
     last_tokens_data,  # type: Array[llama_token]
-    last_tokens_size: c_int,
-    penalty: c_float,
+    last_tokens_size: Union[c_int, int],
+    penalty: Union[c_float, float],
 ):
     return _lib.llama_sample_repetition_penalty(
         ctx, candidates, last_tokens_data, last_tokens_size, penalty
@@ -991,9 +991,9 @@ def llama_sample_frequency_and_presence_penalties(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
     last_tokens_data,  # type: Array[llama_token]
-    last_tokens_size: c_int,
-    alpha_frequency: c_float,
-    alpha_presence: c_float,
+    last_tokens_size: Union[c_int, int],
+    alpha_frequency: Union[c_float, float],
+    alpha_presence: Union[c_float, float],
 ):
     return _lib.llama_sample_frequency_and_presence_penalties(
         ctx,
@@ -1029,7 +1029,7 @@ def llama_sample_classifier_free_guidance(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
     guidance_ctx: llama_context_p,
-    scale: c_float,
+    scale: Union[c_float, float],
 ):
     return _lib.llama_sample_classifier_free_guidance(
         ctx, candidates, guidance_ctx, scale
@@ -1065,8 +1065,8 @@ _lib.llama_sample_softmax.restype = None
 def llama_sample_top_k(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
-    k: c_int,
-    min_keep: c_size_t,
+    k: Union[c_int, int],
+    min_keep: Union[c_size_t, int],
 ):
     return _lib.llama_sample_top_k(ctx, candidates, k, min_keep)
@@ -1085,8 +1085,8 @@ _lib.llama_sample_top_k.restype = None
 def llama_sample_top_p(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
-    p: c_float,
-    min_keep: c_size_t,
+    p: Union[c_float, float],
+    min_keep: Union[c_size_t, int],
 ):
     return _lib.llama_sample_top_p(ctx, candidates, p, min_keep)
@@ -1105,8 +1105,8 @@ _lib.llama_sample_top_p.restype = None
 def llama_sample_tail_free(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
-    z: c_float,
-    min_keep: c_size_t,
+    z: Union[c_float, float],
+    min_keep: Union[c_size_t, int],
 ):
     return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep)
@@ -1125,8 +1125,8 @@ _lib.llama_sample_tail_free.restype = None
 def llama_sample_typical(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
-    p: c_float,
-    min_keep: c_size_t,
+    p: Union[c_float, float],
+    min_keep: Union[c_size_t, int],
 ):
     return _lib.llama_sample_typical(ctx, candidates, p, min_keep)
@@ -1144,7 +1144,7 @@ _lib.llama_sample_typical.restype = None
 def llama_sample_temperature(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
-    temp: c_float,
+    temp: Union[c_float, float],
 ):
     return _lib.llama_sample_temperature(ctx, candidates, temp)
@@ -1167,9 +1167,9 @@ _lib.llama_sample_temperature.restype = None
 def llama_sample_token_mirostat(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
-    tau: c_float,
-    eta: c_float,
-    m: c_int,
+    tau: Union[c_float, float],
+    eta: Union[c_float, float],
+    m: Union[c_int, int],
     mu,  # type: _Pointer[c_float]
 ) -> int:
     return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu)
@@ -1195,8 +1195,8 @@ _lib.llama_sample_token_mirostat.restype = llama_token
 def llama_sample_token_mirostat_v2(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
-    tau: c_float,
-    eta: c_float,
+    tau: Union[c_float, float],
+    eta: Union[c_float, float],
     mu,  # type: _Pointer[c_float]
 ) -> int:
     return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu)
@@ -1289,5 +1289,5 @@ _lib.llama_print_system_info.restype = c_char_p
 _llama_initialized = False
 if not _llama_initialized:
-    llama_backend_init(c_bool(False))
+    llama_backend_init(False)
     _llama_initialized = True
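
For reference, a minimal sketch (untested) of what this change means for callers. Because every binding declares argtypes, ctypes coerces plain Python values itself, so both calling styles below are accepted after this commit. The model path is a placeholder, and llama_load_model_from_file / llama_new_context_with_model are assumed from the library's usual setup flow; they are not part of this diff.

    import ctypes
    import llama_cpp

    # Constants are now plain Python ints, so no .value unwrapping is needed:
    seed = llama_cpp.LLAMA_DEFAULT_SEED  # 0xFFFFFFFF

    params = llama_cpp.llama_context_default_params()
    model = llama_cpp.llama_load_model_from_file(b"./model.bin", params)  # placeholder path
    ctx = llama_cpp.llama_new_context_with_model(model, params)

    buf = (llama_cpp.llama_token * 32)()
    # New style: plain Python int/bool arguments...
    n = llama_cpp.llama_tokenize(ctx, b"Hello", buf, 32, True)
    # ...old style: explicitly wrapped ctypes values still work.
    n = llama_cpp.llama_tokenize(ctx, b"Hello", buf, ctypes.c_int(32), ctypes.c_bool(True))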