Compare commits

...

7 commits

Author SHA1 Message Date
d296cf3e06
Merge https://github.com/abetlen/llama-cpp-python 2024-01-05 09:39:18 +05:30
Andrei Betlen
75d0527fd7 Bump version 2024-01-04 18:30:12 -05:00
Andrei Betlen
fffcd0181c Update llama.cpp 2024-01-04 18:26:00 -05:00
Fedor Moiseev
907b9e9d42
Add Saiga chat format. (#1050) 2024-01-04 18:12:58 -05:00
Caleb Hoff
f766b70c9a
Fix: Correct typo in README.md (#1058)
In Llama.create_chat_completion, the `tool_choice` property does not have an s on the end.
2024-01-04 18:12:32 -05:00
xaviviro
cf743ec5d3
Added ChatGLM chat format (#1059)
Co-authored-by: Xavier Vinaixa Rosello <xaviviro@MacBook-Pro-de-Xavier.local>
2024-01-04 18:12:02 -05:00
Andrei Betlen
eb9c7d4ed8 Update llama.cpp 2024-01-03 22:04:04 -05:00
6 changed files with 135 additions and 68 deletions

View file

@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ## [Unreleased]
## [0.2.27]
- feat: Update llama.cpp to ggerganov/llama.cpp@b3a7c20b5c035250257d2b62851c379b159c899a
- feat: Add `saiga` chat format by @femoiseev in #1050
- feat: Added `chatglm3` chat format by @xaviviro in #1059
- fix: Correct typo in README.md by @qeleb in (#1058)
## [0.2.26] ## [0.2.26]
- feat: Update llama.cpp to ggerganov/llama.cpp@f6793491b5af6da75edad34d6f503ef86d31b09f - feat: Update llama.cpp to ggerganov/llama.cpp@f6793491b5af6da75edad34d6f503ef86d31b09f

View file

@ -238,7 +238,7 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h
} }
} }
}], }],
tool_choices=[{ tool_choice=[{
"type": "function", "type": "function",
"function": { "function": {
"name": "UserDetail" "name": "UserDetail"

View file

@ -1,4 +1,4 @@
from .llama_cpp import * from .llama_cpp import *
from .llama import * from .llama import *
__version__ = "0.2.26" __version__ = "0.2.27"

View file

@ -172,6 +172,20 @@ def _format_chatml(
ret += role + "\n" ret += role + "\n"
return ret return ret
def _format_chatglm3(
system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str
) -> str:
"""Format the prompt with the chatglm3 style."""
ret = ""
if system_message:
ret += system_message
for role, message in messages:
if message:
ret += role + "\n" + " " + message
else:
ret += role
return ret
@dataclasses.dataclass @dataclasses.dataclass
class ChatFormatterResponse: class ChatFormatterResponse:
@ -685,6 +699,22 @@ def format_chatml(
_prompt = _format_chatml(system_message, _messages, _sep) _prompt = _format_chatml(system_message, _messages, _sep)
return ChatFormatterResponse(prompt=_prompt, stop=_sep) return ChatFormatterResponse(prompt=_prompt, stop=_sep)
@register_chat_format("chatglm3")
def format_chatglm3(
messages: List[llama_types.ChatCompletionRequestMessage],
**kwargs: Any,
) -> ChatFormatterResponse:
system_template = """<|system|>
{system_message}"""
system_message = _get_system_message(messages)
system_message = system_template.format(system_message=system_message)
_roles = dict(user="<|user|>", assistant="<|assistant|>")
_sep = "</s>"
_messages = _map_roles(messages, _roles)
_messages.append((_roles["assistant"], None))
_prompt = _format_chatglm3(system_message, _messages, _sep)
return ChatFormatterResponse(prompt=_prompt, stop=_sep)
@register_chat_format("openchat") @register_chat_format("openchat")
def format_openchat( def format_openchat(
@ -704,6 +734,28 @@ def format_openchat(
return ChatFormatterResponse(prompt=_prompt, stop=_sep) return ChatFormatterResponse(prompt=_prompt, stop=_sep)
# Chat format for Saiga models, see more details and available models:
# https://huggingface.co/collections/IlyaGusev/saiga2-saigamistral-6505d4ccc3d1e53166b636cd
@register_chat_format("saiga")
def format_saiga(
messages: list[llama_types.ChatCompletionRequestMessage],
**kwargs,
) -> ChatFormatterResponse:
_message_template = "<s>{role}\n{content}</s>"
_roles = dict(user="user", bot="bot", system="system")
_messages = _map_roles(messages, _roles)
_prompt = ""
for role, content in _messages:
if content:
_prompt += _message_template.format(role=role, content=content)
else:
_prompt += f"<s>{role}\n"
# Response template
_prompt += "<s>bot"
return ChatFormatterResponse(prompt=_prompt.strip())
@register_chat_completion_handler("functionary") @register_chat_completion_handler("functionary")
def functionary_chat_handler( def functionary_chat_handler(
llama: llama.Llama, llama: llama.Llama,

View file

@ -93,6 +93,9 @@ c_size_t_p = POINTER(c_size_t)
# llama.h bindings # llama.h bindings
_lib.llama_max_devices.argtypes = []
_lib.llama_max_devices.restype = ctypes.c_int32
LLAMA_MAX_DEVICES = _lib.llama_max_devices() LLAMA_MAX_DEVICES = _lib.llama_max_devices()
# define LLAMA_DEFAULT_SEED 0xFFFFFFFF # define LLAMA_DEFAULT_SEED 0xFFFFFFFF
@ -481,7 +484,7 @@ It might not exist for progress report where '.' is output repeatedly."""
# // model quantization parameters # // model quantization parameters
# typedef struct llama_model_quantize_params { # typedef struct llama_model_quantize_params {
# int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() # int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
# enum llama_ftype ftype; // quantize to this llama_ftype # enum llama_ftype ftype; // quantize to this llama_ftype
# bool allow_requantize; // allow quantizing non-f32/f16 tensors # bool allow_requantize; // allow quantizing non-f32/f16 tensors
# bool quantize_output_tensor; // quantize output.weight # bool quantize_output_tensor; // quantize output.weight
@ -499,7 +502,7 @@ class llama_model_quantize_params(Structure):
only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
pure (bool): disable k-quant mixtures and quantize all tensors to the same type""" pure (bool): disable k-quant mixtures and quantize all tensors to the same type"""
_fields_ = [ _fields_ = [
("nthread", c_int), ("nthread", c_int32),
("ftype", c_int), ("ftype", c_int),
("allow_requantize", c_bool), ("allow_requantize", c_bool),
("quantize_output_tensor", c_bool), ("quantize_output_tensor", c_bool),
@ -698,13 +701,13 @@ _lib.llama_time_us.argtypes = []
_lib.llama_time_us.restype = ctypes.c_int64 _lib.llama_time_us.restype = ctypes.c_int64
# LLAMA_API int llama_max_devices (void); # LLAMA_API int32_t llama_max_devices(void);
def llama_max_devices() -> int: def llama_max_devices() -> int:
return _lib.llama_max_devices() return _lib.llama_max_devices()
_lib.llama_max_devices.argtypes = [] _lib.llama_max_devices.argtypes = []
_lib.llama_max_devices.restype = c_int _lib.llama_max_devices.restype = ctypes.c_int32
# LLAMA_API bool llama_mmap_supported (void); # LLAMA_API bool llama_mmap_supported (void);
@ -734,7 +737,7 @@ _lib.llama_get_model.argtypes = [llama_context_p]
_lib.llama_get_model.restype = llama_model_p _lib.llama_get_model.restype = llama_model_p
# LLAMA_API int llama_n_ctx (const struct llama_context * ctx); # LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
def llama_n_ctx(ctx: llama_context_p) -> int: def llama_n_ctx(ctx: llama_context_p) -> int:
return _lib.llama_n_ctx(ctx) return _lib.llama_n_ctx(ctx)
@ -758,31 +761,31 @@ _lib.llama_vocab_type.argtypes = [llama_model_p]
_lib.llama_vocab_type.restype = c_int _lib.llama_vocab_type.restype = c_int
# LLAMA_API int llama_n_vocab (const struct llama_model * model); # LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
def llama_n_vocab(model: llama_model_p) -> int: def llama_n_vocab(model: llama_model_p) -> int:
return _lib.llama_n_vocab(model) return _lib.llama_n_vocab(model)
_lib.llama_n_vocab.argtypes = [llama_model_p] _lib.llama_n_vocab.argtypes = [llama_model_p]
_lib.llama_n_vocab.restype = c_int _lib.llama_n_vocab.restype = c_int32
# LLAMA_API int llama_n_ctx_train(const struct llama_model * model); # LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
def llama_n_ctx_train(model: llama_model_p) -> int: def llama_n_ctx_train(model: llama_model_p) -> int:
return _lib.llama_n_ctx_train(model) return _lib.llama_n_ctx_train(model)
_lib.llama_n_ctx_train.argtypes = [llama_model_p] _lib.llama_n_ctx_train.argtypes = [llama_model_p]
_lib.llama_n_ctx_train.restype = c_int _lib.llama_n_ctx_train.restype = c_int32
# LLAMA_API int llama_n_embd (const struct llama_model * model); # LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
def llama_n_embd(model: llama_model_p) -> int: def llama_n_embd(model: llama_model_p) -> int:
return _lib.llama_n_embd(model) return _lib.llama_n_embd(model)
_lib.llama_n_embd.argtypes = [llama_model_p] _lib.llama_n_embd.argtypes = [llama_model_p]
_lib.llama_n_embd.restype = c_int _lib.llama_n_embd.restype = c_int32
# // Get the model's RoPE frequency scaling factor # // Get the model's RoPE frequency scaling factor
@ -802,7 +805,7 @@ _lib.llama_rope_freq_scale_train.restype = c_float
# // Get metadata value as a string by key name # // Get metadata value as a string by key name
# LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size); # LLAMA_API int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
def llama_model_meta_val_str( def llama_model_meta_val_str(
model: llama_model_p, key: Union[c_char_p, bytes], buf: bytes, buf_size: int model: llama_model_p, key: Union[c_char_p, bytes], buf: bytes, buf_size: int
) -> int: ) -> int:
@ -811,22 +814,22 @@ def llama_model_meta_val_str(
_lib.llama_model_meta_val_str.argtypes = [llama_model_p, c_char_p, c_char_p, c_size_t] _lib.llama_model_meta_val_str.argtypes = [llama_model_p, c_char_p, c_char_p, c_size_t]
_lib.llama_model_meta_val_str.restype = c_int _lib.llama_model_meta_val_str.restype = c_int32
# // Get the number of metadata key/value pairs # // Get the number of metadata key/value pairs
# LLAMA_API int llama_model_meta_count(const struct llama_model * model); # LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
def llama_model_meta_count(model: llama_model_p) -> int: def llama_model_meta_count(model: llama_model_p) -> int:
"""Get the number of metadata key/value pairs""" """Get the number of metadata key/value pairs"""
return _lib.llama_model_meta_count(model) return _lib.llama_model_meta_count(model)
_lib.llama_model_meta_count.argtypes = [llama_model_p] _lib.llama_model_meta_count.argtypes = [llama_model_p]
_lib.llama_model_meta_count.restype = c_int _lib.llama_model_meta_count.restype = c_int32
# // Get metadata key name by index # // Get metadata key name by index
# LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size); # LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
def llama_model_meta_key_by_index( def llama_model_meta_key_by_index(
model: llama_model_p, i: Union[c_int, int], buf: bytes, buf_size: int model: llama_model_p, i: Union[c_int, int], buf: bytes, buf_size: int
) -> int: ) -> int:
@ -834,12 +837,17 @@ def llama_model_meta_key_by_index(
return _lib.llama_model_meta_key_by_index(model, i, buf, buf_size) return _lib.llama_model_meta_key_by_index(model, i, buf, buf_size)
_lib.llama_model_meta_key_by_index.argtypes = [llama_model_p, c_int, c_char_p, c_size_t] _lib.llama_model_meta_key_by_index.argtypes = [
_lib.llama_model_meta_key_by_index.restype = c_int llama_model_p,
c_int32,
c_char_p,
c_size_t,
]
_lib.llama_model_meta_key_by_index.restype = c_int32
# // Get metadata value as a string by index # // Get metadata value as a string by index
# LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size); # LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
def llama_model_meta_val_str_by_index( def llama_model_meta_val_str_by_index(
model: llama_model_p, i: Union[c_int, int], buf: bytes, buf_size: int model: llama_model_p, i: Union[c_int, int], buf: bytes, buf_size: int
) -> int: ) -> int:
@ -849,15 +857,15 @@ def llama_model_meta_val_str_by_index(
_lib.llama_model_meta_val_str_by_index.argtypes = [ _lib.llama_model_meta_val_str_by_index.argtypes = [
llama_model_p, llama_model_p,
c_int, c_int32,
c_char_p, c_char_p,
c_size_t, c_size_t,
] ]
_lib.llama_model_meta_val_str_by_index.restype = c_int _lib.llama_model_meta_val_str_by_index.restype = c_int32
# // Get a string describing the model type # // Get a string describing the model type
# LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size); # LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
def llama_model_desc( def llama_model_desc(
model: llama_model_p, buf: bytes, buf_size: Union[c_size_t, int] model: llama_model_p, buf: bytes, buf_size: Union[c_size_t, int]
) -> int: ) -> int:
@ -866,7 +874,7 @@ def llama_model_desc(
_lib.llama_model_desc.argtypes = [llama_model_p, c_char_p, c_size_t] _lib.llama_model_desc.argtypes = [llama_model_p, c_char_p, c_size_t]
_lib.llama_model_desc.restype = c_int _lib.llama_model_desc.restype = c_int32
# // Returns the total size of all the tensors in the model in bytes # // Returns the total size of all the tensors in the model in bytes
@ -905,7 +913,7 @@ _lib.llama_get_model_tensor.restype = c_void_p
# // Returns 0 on success # // Returns 0 on success
# LLAMA_API int llama_model_quantize( # LLAMA_API uint32_t llama_model_quantize(
# const char * fname_inp, # const char * fname_inp,
# const char * fname_out, # const char * fname_out,
# const llama_model_quantize_params * params); # const llama_model_quantize_params * params);
@ -923,7 +931,7 @@ _lib.llama_model_quantize.argtypes = [
c_char_p, c_char_p,
POINTER(llama_model_quantize_params), POINTER(llama_model_quantize_params),
] ]
_lib.llama_model_quantize.restype = c_int _lib.llama_model_quantize.restype = c_uint32
# // Apply a LoRA adapter to a loaded model # // Apply a LoRA adapter to a loaded model
@ -932,12 +940,12 @@ _lib.llama_model_quantize.restype = c_int
# // The model needs to be reloaded before applying a new adapter, otherwise the adapter # // The model needs to be reloaded before applying a new adapter, otherwise the adapter
# // will be applied on top of the previous one # // will be applied on top of the previous one
# // Returns 0 on success # // Returns 0 on success
# LLAMA_API DEPRECATED(int llama_apply_lora_from_file( # LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
# struct llama_context * ctx, # struct llama_context * ctx,
# const char * path_lora, # const char * path_lora,
# float scale, # float scale,
# const char * path_base_model, # const char * path_base_model,
# int n_threads), # int32_t n_threads),
# "use llama_model_apply_lora_from_file instead"); # "use llama_model_apply_lora_from_file instead");
def llama_apply_lora_from_file( def llama_apply_lora_from_file(
ctx: llama_context_p, ctx: llama_context_p,
@ -962,17 +970,17 @@ _lib.llama_apply_lora_from_file.argtypes = [
c_char_p, c_char_p,
c_float, c_float,
c_char_p, c_char_p,
c_int, c_int32,
] ]
_lib.llama_apply_lora_from_file.restype = c_int _lib.llama_apply_lora_from_file.restype = c_int32
# LLAMA_API int llama_model_apply_lora_from_file( # LLAMA_API int32_t llama_model_apply_lora_from_file(
# const struct llama_model * model, # const struct llama_model * model,
# const char * path_lora, # const char * path_lora,
# float scale, # float scale,
# const char * path_base_model, # const char * path_base_model,
# int n_threads); # int32_t n_threads);
def llama_model_apply_lora_from_file( def llama_model_apply_lora_from_file(
model: llama_model_p, model: llama_model_p,
path_lora: Union[c_char_p, bytes], path_lora: Union[c_char_p, bytes],
@ -990,9 +998,9 @@ _lib.llama_model_apply_lora_from_file.argtypes = [
c_char_p, c_char_p,
c_float, c_float,
c_char_p, c_char_p,
c_int, c_int32,
] ]
_lib.llama_model_apply_lora_from_file.restype = c_int _lib.llama_model_apply_lora_from_file.restype = c_int32
# // # //
# // KV cache # // KV cache
@ -1094,7 +1102,7 @@ _lib.llama_kv_cache_view_update.restype = None
# // Returns the number of tokens in the KV cache (slow, use only for debug) # // Returns the number of tokens in the KV cache (slow, use only for debug)
# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); # LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int:
"""Returns the number of tokens in the KV cache (slow, use only for debug) """Returns the number of tokens in the KV cache (slow, use only for debug)
If a KV cell has multiple sequences assigned to it, it will be counted multiple times If a KV cell has multiple sequences assigned to it, it will be counted multiple times
@ -1103,18 +1111,18 @@ def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int:
_lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p]
_lib.llama_get_kv_cache_token_count.restype = c_int _lib.llama_get_kv_cache_token_count.restype = c_int32
# // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) # // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
# LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx); # LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
def llama_get_kv_cache_used_cells(ctx: llama_context_p) -> int: def llama_get_kv_cache_used_cells(ctx: llama_context_p) -> int:
"""Returns the number of used KV cells (i.e. have at least one sequence assigned to them)""" """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)"""
return _lib.llama_get_kv_cache_used_cells(ctx) return _lib.llama_get_kv_cache_used_cells(ctx)
_lib.llama_get_kv_cache_used_cells.argtypes = [llama_context_p] _lib.llama_get_kv_cache_used_cells.argtypes = [llama_context_p]
_lib.llama_get_kv_cache_used_cells.restype = c_int _lib.llama_get_kv_cache_used_cells.restype = c_int32
# // Clear the KV cache # // Clear the KV cache
@ -1361,7 +1369,7 @@ _lib.llama_save_session_file.restype = c_size_t
# struct llama_context * ctx, # struct llama_context * ctx,
# llama_token * tokens, # llama_token * tokens,
# int32_t n_tokens, # int32_t n_tokens,
# int n_past), # int32_t n_past),
# "use llama_decode() instead"); # "use llama_decode() instead");
def llama_eval( def llama_eval(
ctx: llama_context_p, ctx: llama_context_p,
@ -1377,7 +1385,7 @@ def llama_eval(
return _lib.llama_eval(ctx, tokens, n_tokens, n_past) return _lib.llama_eval(ctx, tokens, n_tokens, n_past)
_lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int] _lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int32, c_int32]
_lib.llama_eval.restype = c_int _lib.llama_eval.restype = c_int
@ -1387,7 +1395,7 @@ _lib.llama_eval.restype = c_int
# struct llama_context * ctx, # struct llama_context * ctx,
# float * embd, # float * embd,
# int32_t n_tokens, # int32_t n_tokens,
# int n_past), # int32_t n_past),
# "use llama_decode() instead"); # "use llama_decode() instead");
def llama_eval_embd( def llama_eval_embd(
ctx: llama_context_p, ctx: llama_context_p,
@ -1400,7 +1408,7 @@ def llama_eval_embd(
return _lib.llama_eval_embd(ctx, embd, n_tokens, n_past) return _lib.llama_eval_embd(ctx, embd, n_tokens, n_past)
_lib.llama_eval_embd.argtypes = [llama_context_p, c_float_p, c_int, c_int] _lib.llama_eval_embd.argtypes = [llama_context_p, c_float_p, c_int32, c_int32]
_lib.llama_eval_embd.restype = c_int _lib.llama_eval_embd.restype = c_int
@ -1480,7 +1488,7 @@ _lib.llama_batch_free.restype = None
# // 0 - success # // 0 - success
# // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) # // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
# // < 0 - error # // < 0 - error
# LLAMA_API int llama_decode( # LLAMA_API int32_t llama_decode(
# struct llama_context * ctx, # struct llama_context * ctx,
# struct llama_batch batch); # struct llama_batch batch);
def llama_decode(ctx: llama_context_p, batch: llama_batch) -> int: def llama_decode(ctx: llama_context_p, batch: llama_batch) -> int:
@ -1492,7 +1500,7 @@ def llama_decode(ctx: llama_context_p, batch: llama_batch) -> int:
_lib.llama_decode.argtypes = [llama_context_p, llama_batch] _lib.llama_decode.argtypes = [llama_context_p, llama_batch]
_lib.llama_decode.restype = c_int _lib.llama_decode.restype = c_int32
# // Set the number of threads used for decoding # // Set the number of threads used for decoding
@ -1634,25 +1642,25 @@ _lib.llama_token_nl.restype = llama_token
# // Returns -1 if unknown, 1 for true or 0 for false. # // Returns -1 if unknown, 1 for true or 0 for false.
# LLAMA_API int llama_add_bos_token(const struct llama_model * model); # LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
def llama_add_bos_token(model: llama_model_p) -> int: def llama_add_bos_token(model: llama_model_p) -> int:
"""Returns -1 if unknown, 1 for true or 0 for false.""" """Returns -1 if unknown, 1 for true or 0 for false."""
return _lib.llama_add_bos_token(model) return _lib.llama_add_bos_token(model)
_lib.llama_add_bos_token.argtypes = [llama_model_p] _lib.llama_add_bos_token.argtypes = [llama_model_p]
_lib.llama_add_bos_token.restype = c_int _lib.llama_add_bos_token.restype = c_int32
# // Returns -1 if unknown, 1 for true or 0 for false. # // Returns -1 if unknown, 1 for true or 0 for false.
# LLAMA_API int llama_add_eos_token(const struct llama_model * model); # LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
def llama_add_eos_token(model: llama_model_p) -> int: def llama_add_eos_token(model: llama_model_p) -> int:
"""Returns -1 if unknown, 1 for true or 0 for false.""" """Returns -1 if unknown, 1 for true or 0 for false."""
return _lib.llama_add_eos_token(model) return _lib.llama_add_eos_token(model)
_lib.llama_add_eos_token.argtypes = [llama_model_p] _lib.llama_add_eos_token.argtypes = [llama_model_p]
_lib.llama_add_eos_token.restype = c_int _lib.llama_add_eos_token.restype = c_int32
# // codellama infill tokens # // codellama infill tokens
@ -1704,12 +1712,12 @@ _lib.llama_token_eot.restype = llama_token
# /// @return Returns a negative number on failure - the number of tokens that would have been returned # /// @return Returns a negative number on failure - the number of tokens that would have been returned
# /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. # /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
# /// Does not insert a leading space. # /// Does not insert a leading space.
# LLAMA_API int llama_tokenize( # LLAMA_API int32_t llama_tokenize(
# const struct llama_model * model, # const struct llama_model * model,
# const char * text, # const char * text,
# int text_len, # int32_t text_len,
# llama_token * tokens, # llama_token * tokens,
# int n_max_tokens, # int32_t n_max_tokens,
# bool add_bos, # bool add_bos,
# bool special); # bool special);
def llama_tokenize( def llama_tokenize(
@ -1730,24 +1738,24 @@ def llama_tokenize(
_lib.llama_tokenize.argtypes = [ _lib.llama_tokenize.argtypes = [
llama_model_p, llama_model_p,
c_char_p, c_char_p,
c_int, c_int32,
llama_token_p, llama_token_p,
c_int, c_int32,
c_bool, c_bool,
c_bool, c_bool,
] ]
_lib.llama_tokenize.restype = c_int _lib.llama_tokenize.restype = c_int32
# // Token Id -> Piece. # // Token Id -> Piece.
# // Uses the vocabulary in the provided context. # // Uses the vocabulary in the provided context.
# // Does not write null terminator to the buffer. # // Does not write null terminator to the buffer.
# // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens. # // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
# LLAMA_API int llama_token_to_piece( # LLAMA_API int32_t llama_token_to_piece(
# const struct llama_model * model, # const struct llama_model * model,
# llama_token token, # llama_token token,
# char * buf, # char * buf,
# int length); # int32_t length);
def llama_token_to_piece( def llama_token_to_piece(
model: llama_model_p, model: llama_model_p,
token: Union[llama_token, int], token: Union[llama_token, int],
@ -1762,8 +1770,8 @@ def llama_token_to_piece(
return _lib.llama_token_to_piece(model, token, buf, length) return _lib.llama_token_to_piece(model, token, buf, length)
_lib.llama_token_to_piece.argtypes = [llama_model_p, llama_token, c_char_p, c_int] _lib.llama_token_to_piece.argtypes = [llama_model_p, llama_token, c_char_p, c_int32]
_lib.llama_token_to_piece.restype = c_int _lib.llama_token_to_piece.restype = c_int32
# // # //
@ -1924,7 +1932,7 @@ _lib.llama_sample_softmax.restype = None
# LLAMA_API void llama_sample_top_k( # LLAMA_API void llama_sample_top_k(
# struct llama_context * ctx, # struct llama_context * ctx,
# llama_token_data_array * candidates, # llama_token_data_array * candidates,
# int k, # int32_t k,
# size_t min_keep); # size_t min_keep);
def llama_sample_top_k( def llama_sample_top_k(
ctx: llama_context_p, ctx: llama_context_p,
@ -1939,7 +1947,7 @@ def llama_sample_top_k(
_lib.llama_sample_top_k.argtypes = [ _lib.llama_sample_top_k.argtypes = [
llama_context_p, llama_context_p,
llama_token_data_array_p, llama_token_data_array_p,
c_int, c_int32,
c_size_t, c_size_t,
] ]
_lib.llama_sample_top_k.restype = None _lib.llama_sample_top_k.restype = None
@ -2129,7 +2137,7 @@ _lib.llama_sample_grammar.restype = None
# llama_token_data_array * candidates, # llama_token_data_array * candidates,
# float tau, # float tau,
# float eta, # float eta,
# int m, # int32_t m,
# float * mu); # float * mu);
def llama_sample_token_mirostat( def llama_sample_token_mirostat(
ctx: llama_context_p, ctx: llama_context_p,
@ -2155,7 +2163,7 @@ _lib.llama_sample_token_mirostat.argtypes = [
llama_token_data_array_p, llama_token_data_array_p,
c_float, c_float,
c_float, c_float,
c_int, c_int32,
c_float_p, c_float_p,
] ]
_lib.llama_sample_token_mirostat.restype = llama_token _lib.llama_sample_token_mirostat.restype = llama_token
@ -2320,8 +2328,8 @@ llama_beam_search_callback_fn_t = ctypes.CFUNCTYPE(None, c_void_p, llama_beams_s
# llama_beam_search_callback_fn_t callback, # llama_beam_search_callback_fn_t callback,
# void * callback_data, # void * callback_data,
# size_t n_beams, # size_t n_beams,
# int n_past, # int32_t n_past,
# int n_predict); # int32_t n_predict);
def llama_beam_search( def llama_beam_search(
ctx: llama_context_p, ctx: llama_context_p,
callback: "ctypes._CFuncPtr[None, c_void_p, llama_beams_state]", # type: ignore callback: "ctypes._CFuncPtr[None, c_void_p, llama_beams_state]", # type: ignore
@ -2340,8 +2348,8 @@ _lib.llama_beam_search.argtypes = [
llama_beam_search_callback_fn_t, llama_beam_search_callback_fn_t,
c_void_p, c_void_p,
c_size_t, c_size_t,
c_int, c_int32,
c_int, c_int32,
] ]
_lib.llama_beam_search.restype = None _lib.llama_beam_search.restype = None

2
vendor/llama.cpp vendored

@ -1 +1 @@
Subproject commit f6793491b5af6da75edad34d6f503ef86d31b09f Subproject commit b3a7c20b5c035250257d2b62851c379b159c899a