# llama.cpp/llama_cpp/llama_cpp.py

import ctypes

from ctypes import c_int, c_float, c_char_p, c_void_p, c_bool, POINTER, Structure, Array, c_uint8, c_size_t

import pathlib
from itertools import chain

# Load the library
# The shared library is looked up next to this module. Exactly one candidate
# (*.so, *.dylib or *.dll) must be present: with none there is nothing to load,
# and with several there is no way to tell which build was intended.
# TODO: still fragile - the library location is not configurable.
_base_path = pathlib.Path(__file__).parent
_lib_candidates = sorted(
    chain(
        _base_path.glob("*.so"), _base_path.glob("*.dylib"), _base_path.glob("*.dll")
    )
)
if len(_lib_candidates) != 1:
    # ValueError is what the former bare tuple-unpacking raised, so callers
    # guarding the import keep working; only the message becomes actionable.
    raise ValueError(
        "Expected exactly one shared library (*.so, *.dylib, *.dll) in {}, "
        "found {}: {}".format(
            _base_path, len(_lib_candidates), [p.name for p in _lib_candidates]
        )
    )
_lib_path = _lib_candidates[0]
_lib = ctypes.CDLL(str(_lib_path))

# C types

# A token id is a plain C int; token buffers are passed as pointers to it.
llama_token = c_int
llama_token_p = POINTER(llama_token)

# The context is handled as an untyped pointer; no struct layout is declared
# for it in this module.
llama_context_p = c_void_p


class llama_token_data(Structure):
    """ctypes structure pairing a token id with its probability values.

    The order of the entries in ``_fields_`` defines the memory layout of the
    structure, so it must not be rearranged.
    """

    _fields_ = [
        # token id
        ("id", llama_token),
        # probability of the token
        ("p", c_float),
        # log probability of the token
        ("plog", c_float),
    ]


# Pointer type for passing llama_token_data by reference.
llama_token_data_p = POINTER(llama_token_data)

# Progress callback prototype: returns nothing and receives a float progress
# value followed by an untyped user-data pointer.
_llama_progress_callback_argtypes = (c_float, c_void_p)
llama_progress_callback = ctypes.CFUNCTYPE(None, *_llama_progress_callback_argtypes)


class llama_context_params(Structure):
    """ctypes structure with the parameters handed to llama_init_from_file().

    The order of the entries in ``_fields_`` defines the memory layout of the
    structure, so it must not be rearranged.
    """

    _fields_ = [
        # text context
        ("n_ctx", c_int),
        # -1 for default
        ("n_parts", c_int),
        # RNG seed, 0 for random
        ("seed", c_int),
        # use fp16 for KV cache
        ("f16_kv", c_bool),
        # the llama_eval() call computes all logits, not just the last one
        ("logits_all", c_bool),
        # only load the vocabulary, no weights
        ("vocab_only", c_bool),
        # force system to keep model in RAM
        ("use_mlock", c_bool),
        # embedding mode only
        ("embedding", c_bool),
        # called with a progress value between 0 and 1, pass NULL to disable
        ("progress_callback", llama_progress_callback),
        # context pointer passed to the progress callback
        ("progress_callback_user_data", c_void_p),
    ]


# Pointer type for passing llama_context_params by reference.
llama_context_params_p = POINTER(llama_context_params)


# Functions


def llama_context_default_params() -> llama_context_params:
    """Return the ``llama_context_params`` structure produced by the C function
    ``llama_context_default_params`` (the struct is returned by value)."""
    default_params = _lib.llama_context_default_params()
    return default_params


# C prototype: takes no arguments, returns the struct by value.
_lib.llama_context_default_params.restype = llama_context_params
_lib.llama_context_default_params.argtypes = []


def llama_init_from_file(
    path_model: bytes, params: llama_context_params
) -> llama_context_p:
    """Load a ggml llama model from ``path_model`` and return its context handle.

    Allocates (almost) all memory needed for the model. ``params`` is passed
    to the C function by value. The C function returns NULL on failure; with a
    ``c_void_p`` restype ctypes reports a NULL pointer as ``None``.
    """
    ctx = _lib.llama_init_from_file(path_model, params)
    return ctx


# C prototype: (const char *, llama_context_params) -> context pointer.
_lib.llama_init_from_file.restype = llama_context_p
_lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params]


_lib.llama_free.argtypes = [llama_context_p]
_lib.llama_free.restype = None


def llama_free(ctx: llama_context_p):
    """Free all memory allocated for ``ctx``.

    NOTE(review): the handle presumably must not be used after this call -
    confirm against llama.h.
    """
    _lib.llama_free(ctx)


# TODO: not great API - very likely to change
def llama_model_quantize(
    fname_inp: bytes, fname_out: bytes, itype: c_int, qk: c_int
) -> c_int:
    """Forward the four arguments to the C function ``llama_model_quantize``.

    ``fname_inp`` and ``fname_out`` are passed as C strings, ``itype`` and
    ``qk`` as C ints.

    Returns 0 on success.
    """
    status = _lib.llama_model_quantize(fname_inp, fname_out, itype, qk)
    return status


_lib.llama_model_quantize.restype = c_int
_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int]

def llama_get_kv_cache(ctx: llama_context_p):
    """Return the KV cache that will contain the context for the ongoing
    prediction with the model.

    The result is a ctypes ``POINTER(c_uint8)``; the pointer itself carries no
    length (llama_get_kv_cache_size() reports the size).
    """
    kv_cache = _lib.llama_get_kv_cache(ctx)
    return kv_cache

_lib.llama_get_kv_cache.restype = POINTER(c_uint8)
_lib.llama_get_kv_cache.argtypes = [llama_context_p]

def llama_get_kv_cache_size(ctx: llama_context_p) -> c_size_t:
    """Return the size of the KV cache of ``ctx`` (a ``size_t`` on the C side)."""
    kv_cache_size = _lib.llama_get_kv_cache_size(ctx)
    return kv_cache_size

_lib.llama_get_kv_cache_size.restype = c_size_t
_lib.llama_get_kv_cache_size.argtypes = [llama_context_p]

def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int:
    """Return the number of tokens in the KV cache of ``ctx``."""
    token_count = _lib.llama_get_kv_cache_token_count(ctx)
    return token_count

_lib.llama_get_kv_cache_token_count.restype = c_int
_lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p]


def llama_set_kv_cache(
    ctx: llama_context_p,
    kv_cache,
    n_size: c_size_t,
    n_token_count: c_int,
):
    """Set the KV cache containing the current context for the model.

    ``kv_cache`` is passed to C as ``POINTER(c_uint8)``, accompanied by
    ``n_size`` (``size_t``) and ``n_token_count`` (``int``). The C function is
    declared with no return value.
    """
    return _lib.llama_set_kv_cache(
        ctx,
        kv_cache,
        n_size,
        n_token_count,
    )

_lib.llama_set_kv_cache.restype = None
_lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, c_int]


def llama_eval(
    ctx: llama_context_p,
    tokens,  # type: Array[llama_token]
    n_tokens: c_int,
    n_past: c_int,
    n_threads: c_int,
) -> c_int:
    """Run the llama inference to obtain the logits and probabilities for the
    next token.

    ``tokens`` + ``n_tokens`` is the provided batch of new tokens to process.
    ``n_past`` is the number of tokens to use from previous eval calls.
    ``n_threads`` is forwarded unchanged (presumably the number of threads to
    compute with - confirm against llama.h).

    Returns 0 on success.
    """
    status = _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads)
    return status


_lib.llama_eval.restype = c_int
_lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int]


# TODO: not sure if correct
def llama_tokenize(
    ctx: llama_context_p,
    text: bytes,
    tokens,  # type: Array[llama_token]
    n_max_tokens: c_int,
    add_bos: c_bool,
) -> c_int:
    """Convert the provided ``text`` into tokens.

    The ``tokens`` buffer must be large enough to hold the resulting tokens.
    ``add_bos`` is forwarded as a C bool.

    Returns the number of tokens on success, no more than ``n_max_tokens``.
    Returns a negative number on failure - the number of tokens that would
    have been returned.
    """
    n_tokens = _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos)
    return n_tokens


_lib.llama_tokenize.restype = c_int
_lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool]


def llama_n_vocab(ctx: llama_context_p) -> c_int:
    """Return the result of the C function ``llama_n_vocab`` for ``ctx``.

    NOTE(review): presumably the vocabulary size, i.e. the column count of the
    logits from llama_get_logits() - confirm against llama.h.
    """
    n_vocab = _lib.llama_n_vocab(ctx)
    return n_vocab


_lib.llama_n_vocab.restype = c_int
_lib.llama_n_vocab.argtypes = [llama_context_p]


def llama_n_ctx(ctx: llama_context_p) -> c_int:
    """Return the result of the C function ``llama_n_ctx`` for ``ctx``.

    NOTE(review): presumably the text context size configured through
    ``llama_context_params.n_ctx`` - confirm against llama.h.
    """
    n_ctx = _lib.llama_n_ctx(ctx)
    return n_ctx


_lib.llama_n_ctx.restype = c_int
_lib.llama_n_ctx.argtypes = [llama_context_p]


def llama_n_embd(ctx: llama_context_p) -> c_int:
    """Return the result of the C function ``llama_n_embd`` for ``ctx``.

    NOTE(review): presumably the embedding dimension, i.e. the length of the
    array returned by llama_get_embeddings() - confirm against llama.h.
    """
    # Call llama_n_embd itself, not llama_n_ctx: both share the same prototype,
    # so mixing them up silently returns the wrong quantity.
    return _lib.llama_n_embd(ctx)


_lib.llama_n_embd.argtypes = [llama_context_p]
_lib.llama_n_embd.restype = c_int


def llama_get_logits(ctx: llama_context_p):
    """Return the token logits obtained from the last call to llama_eval().

    The logits for the last token are stored in the last row. The values can
    be mutated in order to change the probabilities of the next token.

    Rows: n_tokens
    Cols: n_vocab

    The result is a flat ctypes ``POINTER(c_float)``; the pointer itself
    carries no shape information.
    """
    logits = _lib.llama_get_logits(ctx)
    return logits


_lib.llama_get_logits.restype = POINTER(c_float)
_lib.llama_get_logits.argtypes = [llama_context_p]


def llama_get_embeddings(ctx: llama_context_p):
    """Get the embeddings for the input.

    shape: [n_embd] (1-dimensional), returned as a ctypes ``POINTER(c_float)``.
    """
    embeddings = _lib.llama_get_embeddings(ctx)
    return embeddings


_lib.llama_get_embeddings.restype = POINTER(c_float)
_lib.llama_get_embeddings.argtypes = [llama_context_p]


def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes:
    """Token Id -> String. Uses the vocabulary in the provided context.

    The C function returns a ``char *``, which ctypes hands back as ``bytes``.
    """
    token_text = _lib.llama_token_to_str(ctx, token)
    return token_text


_lib.llama_token_to_str.restype = c_char_p
_lib.llama_token_to_str.argtypes = [llama_context_p, llama_token]

# Special tokens


def llama_token_bos() -> llama_token:
    """Return the token id reported by the C function ``llama_token_bos``.

    NOTE(review): presumably the beginning-of-sequence token (compare the
    ``add_bos`` flag of llama_tokenize) - confirm against llama.h.
    """
    bos_token = _lib.llama_token_bos()
    return bos_token


_lib.llama_token_bos.restype = llama_token
_lib.llama_token_bos.argtypes = []


def llama_token_eos() -> llama_token:
    """Return the token id reported by the C function ``llama_token_eos``.

    NOTE(review): presumably the end-of-sequence token - confirm against
    llama.h.
    """
    eos_token = _lib.llama_token_eos()
    return eos_token


_lib.llama_token_eos.restype = llama_token
_lib.llama_token_eos.argtypes = []


# TODO: improve the last_n_tokens interface ?
def llama_sample_top_p_top_k(
    ctx: llama_context_p,
    last_n_tokens_data,  # type: Array[llama_token]
    last_n_tokens_size: c_int,
    top_k: c_int,
    top_p: c_float,
    temp: c_float,
    repeat_penalty: c_float,
) -> llama_token:
    """Forward to the C function ``llama_sample_top_p_top_k`` and return the
    token it yields.

    ``last_n_tokens_data`` is passed as a pointer to ``llama_token`` together
    with its length ``last_n_tokens_size``; ``top_k`` is passed as a C int and
    ``top_p``, ``temp`` and ``repeat_penalty`` as C floats.
    """
    sampled_token = _lib.llama_sample_top_p_top_k(
        ctx,
        last_n_tokens_data,
        last_n_tokens_size,
        top_k,
        top_p,
        temp,
        repeat_penalty,
    )
    return sampled_token


_lib.llama_sample_top_p_top_k.restype = llama_token
_lib.llama_sample_top_p_top_k.argtypes = [
    llama_context_p,  # ctx
    llama_token_p,  # last_n_tokens_data
    c_int,  # last_n_tokens_size
    c_int,  # top_k
    c_float,  # top_p
    c_float,  # temp
    c_float,  # repeat_penalty
]


# Performance information


_lib.llama_print_timings.argtypes = [llama_context_p]
_lib.llama_print_timings.restype = None


def llama_print_timings(ctx: llama_context_p):
    """Invoke the C function ``llama_print_timings`` for ``ctx``.

    Nothing is returned to Python; any output is produced by the C library.
    """
    _lib.llama_print_timings(ctx)


_lib.llama_reset_timings.argtypes = [llama_context_p]
_lib.llama_reset_timings.restype = None


def llama_reset_timings(ctx: llama_context_p):
    """Invoke the C function ``llama_reset_timings`` for ``ctx``.

    Nothing is returned to Python.
    """
    _lib.llama_reset_timings(ctx)


def llama_print_system_info() -> bytes:
    """Return the system information string provided by the C function
    ``llama_print_system_info``.

    The C function returns a ``char *``, which ctypes hands back as ``bytes``.
    """
    system_info = _lib.llama_print_system_info()
    return system_info


_lib.llama_print_system_info.restype = c_char_p
_lib.llama_print_system_info.argtypes = []
Initial commit 2023-03-23 09:33:06 +00:00			`import ctypes`

Update llama_cpp and add kv_cache api support 2023-04-02 17:33:49 +00:00			`from ctypes import c_int, c_float, c_char_p, c_void_p, c_bool, POINTER, Structure, Array, c_uint8, c_size_t`
Initial commit 2023-03-23 09:33:06 +00:00
			`import pathlib`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`from itertools import chain`
Initial commit 2023-03-23 09:33:06 +00:00
			`# Load the library`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`# TODO: fragile, should fix`
			`_base_path = pathlib.Path(__file__).parent`
			`(_lib_path,) = chain(`
			`_base_path.glob("*.so"), _base_path.glob("*.dylib"), _base_path.glob("*.dll")`
			`)`
			`_lib = ctypes.CDLL(str(_lib_path))`
Initial commit 2023-03-23 09:33:06 +00:00
			`# C types`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00			`llama_context_p = c_void_p`


Initial commit 2023-03-23 09:33:06 +00:00			`llama_token = c_int`
			`llama_token_p = POINTER(llama_token)`

Black formatting 2023-03-24 18:35:41 +00:00
Initial commit 2023-03-23 09:33:06 +00:00			`class llama_token_data(Structure):`
			`_fields_ = [`
Black formatting 2023-03-24 18:35:41 +00:00			`("id", llama_token), # token id`
			`("p", c_float), # probability of the token`
			`("plog", c_float), # log probability of the token`
Initial commit 2023-03-23 09:33:06 +00:00			`]`

Black formatting 2023-03-24 18:35:41 +00:00
Initial commit 2023-03-23 09:33:06 +00:00			`llama_token_data_p = POINTER(llama_token_data)`

Update llama.cpp 2023-03-29 01:10:23 +00:00			`llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)`
Black formatting 2023-03-24 18:35:41 +00:00
Update llama.cpp 2023-03-25 20:26:03 +00:00
Initial commit 2023-03-23 09:33:06 +00:00			`class llama_context_params(Structure):`
			`_fields_ = [`
Black formatting 2023-03-24 18:35:41 +00:00			`("n_ctx", c_int), # text context`
			`("n_parts", c_int), # -1 for default`
			`("seed", c_int), # RNG seed, 0 for random`
			`("f16_kv", c_bool), # use fp16 for KV cache`
			`(`
			`"logits_all",`
			`c_bool,`
			`), # the llama_eval() call computes all logits, not just the last one`
			`("vocab_only", c_bool), # only load the vocabulary, no weights`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00			`("use_mlock", c_bool), # force system to keep model in RAM`
			`("embedding", c_bool), # embedding mode only`
Update llama.cpp 2023-03-25 16:12:09 +00:00			`# called with a progress value between 0 and 1, pass NULL to disable`
			`("progress_callback", llama_progress_callback),`
			`# context pointer passed to the progress callback`
			`("progress_callback_user_data", c_void_p),`
Initial commit 2023-03-23 09:33:06 +00:00			`]`

Black formatting 2023-03-24 18:35:41 +00:00
Initial commit 2023-03-23 09:33:06 +00:00			`llama_context_params_p = POINTER(llama_context_params)`


Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00			`# Functions`
Initial commit 2023-03-23 09:33:06 +00:00
Black formatting 2023-03-24 18:35:41 +00:00
Initial commit 2023-03-23 09:33:06 +00:00			`def llama_context_default_params() -> llama_context_params:`
Update llama.cpp 2023-03-29 01:10:23 +00:00			`return _lib.llama_context_default_params()`
Initial commit 2023-03-23 09:33:06 +00:00
Black formatting 2023-03-24 18:35:41 +00:00
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_context_default_params.argtypes = []`
			`_lib.llama_context_default_params.restype = llama_context_params`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00
Black formatting 2023-03-24 18:59:29 +00:00
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00			`# Various functions for loading a ggml llama model.`
			`# Allocate (almost) all memory needed for the model.`
			`# Return NULL on failure`
Black formatting 2023-03-24 18:35:41 +00:00			`def llama_init_from_file(`
			`path_model: bytes, params: llama_context_params`
			`) -> llama_context_p:`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`return _lib.llama_init_from_file(path_model, params)`
Initial commit 2023-03-23 09:33:06 +00:00
Black formatting 2023-03-24 18:35:41 +00:00
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params]`
			`_lib.llama_init_from_file.restype = llama_context_p`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00
Black formatting 2023-03-24 18:59:29 +00:00
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00			`# Frees all allocated memory`
Initial commit 2023-03-23 09:33:06 +00:00			`def llama_free(ctx: llama_context_p):`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_free(ctx)`
Initial commit 2023-03-23 09:33:06 +00:00
Black formatting 2023-03-24 18:35:41 +00:00
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_free.argtypes = [llama_context_p]`
			`_lib.llama_free.restype = None`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00
Black formatting 2023-03-24 18:59:29 +00:00
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00			`# TODO: not great API - very likely to change`
			`# Returns 0 on success`
Black formatting 2023-03-24 18:35:41 +00:00			`def llama_model_quantize(`
			`fname_inp: bytes, fname_out: bytes, itype: c_int, qk: c_int`
			`) -> c_int:`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`return _lib.llama_model_quantize(fname_inp, fname_out, itype, qk)`
Initial commit 2023-03-23 09:33:06 +00:00
Black formatting 2023-03-24 18:35:41 +00:00
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int]`
			`_lib.llama_model_quantize.restype = c_int`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00
Update llama_cpp and add kv_cache api support 2023-04-02 17:33:49 +00:00			`# Returns the KV cache that will contain the context for the`
			`# ongoing prediction with the model.`
			`def llama_get_kv_cache(ctx: llama_context_p):`
			`return _lib.llama_get_kv_cache(ctx)`

			`_lib.llama_get_kv_cache.argtypes = [llama_context_p]`
			`_lib.llama_get_kv_cache.restype = POINTER(c_uint8)`

			`# Returns the size of the KV cache`
			`def llama_get_kv_cache_size(ctx: llama_context_p) -> c_size_t:`
			`return _lib.llama_get_kv_cache_size(ctx)`

			`_lib.llama_get_kv_cache_size.argtypes = [llama_context_p]`
			`_lib.llama_get_kv_cache_size.restype = c_size_t`

			`# Returns the number of tokens in the KV cache`
			`def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int:`
			`return _lib.llama_get_kv_cache_token_count(ctx)`

			`_lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p]`
			`_lib.llama_get_kv_cache_token_count.restype = c_int`


			`# Sets the KV cache containing the current context for the model`
			`def llama_set_kv_cache(ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int):`
			`return _lib.llama_set_kv_cache(ctx, kv_cache, n_size, n_token_count)`

			`_lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, c_int]`
			`_lib.llama_set_kv_cache.restype = None`

Black formatting 2023-03-24 18:59:29 +00:00
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00			`# Run the llama inference to obtain the logits and probabilities for the next token.`
			`# tokens + n_tokens is the provided batch of new tokens to process`
			`# n_past is the number of tokens to use from previous eval calls`
			`# Returns 0 on success`
Black formatting 2023-03-24 18:35:41 +00:00			`def llama_eval(`
			`ctx: llama_context_p,`
Fix ctypes typing issue for Arrays 2023-03-31 07:20:15 +00:00			`tokens, # type: Array[llama_token]`
Black formatting 2023-03-24 18:35:41 +00:00			`n_tokens: c_int,`
			`n_past: c_int,`
			`n_threads: c_int,`
			`) -> c_int:`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads)`
Initial commit 2023-03-23 09:33:06 +00:00
Black formatting 2023-03-24 18:35:41 +00:00
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int]`
			`_lib.llama_eval.restype = c_int`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00

			`# Convert the provided text into tokens.`
			`# The tokens pointer must be large enough to hold the resulting tokens.`
			`# Returns the number of tokens on success, no more than n_max_tokens`
			`# Returns a negative number on failure - the number of tokens that would have been returned`
			`# TODO: not sure if correct`
Black formatting 2023-03-24 18:35:41 +00:00			`def llama_tokenize(`
			`ctx: llama_context_p,`
			`text: bytes,`
Fix ctypes typing issue for Arrays 2023-03-31 07:20:15 +00:00			`tokens, # type: Array[llama_token]`
Black formatting 2023-03-24 18:35:41 +00:00			`n_max_tokens: c_int,`
			`add_bos: c_bool,`
			`) -> c_int:`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos)`
Initial commit 2023-03-23 09:33:06 +00:00
Black formatting 2023-03-24 18:35:41 +00:00
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool]`
			`_lib.llama_tokenize.restype = c_int`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00

Initial commit 2023-03-23 09:33:06 +00:00			`def llama_n_vocab(ctx: llama_context_p) -> c_int:`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`return _lib.llama_n_vocab(ctx)`
Initial commit 2023-03-23 09:33:06 +00:00
Black formatting 2023-03-24 18:35:41 +00:00
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_n_vocab.argtypes = [llama_context_p]`
			`_lib.llama_n_vocab.restype = c_int`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00

Initial commit 2023-03-23 09:33:06 +00:00			`def llama_n_ctx(ctx: llama_context_p) -> c_int:`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`return _lib.llama_n_ctx(ctx)`
Initial commit 2023-03-23 09:33:06 +00:00
Black formatting 2023-03-24 18:35:41 +00:00
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_n_ctx.argtypes = [llama_context_p]`
			`_lib.llama_n_ctx.restype = c_int`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00
Black formatting 2023-03-24 18:59:29 +00:00
Update llama.cpp 2023-03-25 20:26:03 +00:00			`def llama_n_embd(ctx: llama_context_p) -> c_int:`
			`return _lib.llama_n_ctx(ctx)`


			`_lib.llama_n_embd.argtypes = [llama_context_p]`
			`_lib.llama_n_embd.restype = c_int`


Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00			`# Token logits obtained from the last call to llama_eval()`
			`# The logits for the last token are stored in the last row`
			`# Can be mutated in order to change the probabilities of the next token`
			`# Rows: n_tokens`
			`# Cols: n_vocab`
Fix ctypes typing issue for Arrays 2023-03-31 07:20:15 +00:00			`def llama_get_logits(ctx: llama_context_p):`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`return _lib.llama_get_logits(ctx)`
Initial commit 2023-03-23 09:33:06 +00:00
Black formatting 2023-03-24 18:35:41 +00:00
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_get_logits.argtypes = [llama_context_p]`
			`_lib.llama_get_logits.restype = POINTER(c_float)`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00
Black formatting 2023-03-24 18:59:29 +00:00
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00			`# Get the embeddings for the input`
			`# shape: [n_embd] (1-dimensional)`
Fix ctypes typing issue for Arrays 2023-03-31 07:20:15 +00:00			`def llama_get_embeddings(ctx: llama_context_p):`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`return _lib.llama_get_embeddings(ctx)`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00
Black formatting 2023-03-24 18:59:29 +00:00
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_get_embeddings.argtypes = [llama_context_p]`
			`_lib.llama_get_embeddings.restype = POINTER(c_float)`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00
Black formatting 2023-03-24 18:59:29 +00:00
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00			`# Token Id -> String. Uses the vocabulary in the provided context`
Fix type signature of token_to_str 2023-03-31 07:25:12 +00:00			`def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes:`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`return _lib.llama_token_to_str(ctx, token)`
Initial commit 2023-03-23 09:33:06 +00:00
Black formatting 2023-03-24 18:35:41 +00:00
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_token_to_str.argtypes = [llama_context_p, llama_token]`
			`_lib.llama_token_to_str.restype = c_char_p`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00
			`# Special tokens`

Black formatting 2023-03-24 18:59:29 +00:00
Initial commit 2023-03-23 09:33:06 +00:00			`def llama_token_bos() -> llama_token:`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`return _lib.llama_token_bos()`
Initial commit 2023-03-23 09:33:06 +00:00
Black formatting 2023-03-24 18:35:41 +00:00
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_token_bos.argtypes = []`
			`_lib.llama_token_bos.restype = llama_token`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00

Initial commit 2023-03-23 09:33:06 +00:00			`def llama_token_eos() -> llama_token:`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`return _lib.llama_token_eos()`
Initial commit 2023-03-23 09:33:06 +00:00
Black formatting 2023-03-24 18:35:41 +00:00
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_token_eos.argtypes = []`
			`_lib.llama_token_eos.restype = llama_token`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00

			`# TODO: improve the last_n_tokens interface ?`
Black formatting 2023-03-24 18:35:41 +00:00			`def llama_sample_top_p_top_k(`
			`ctx: llama_context_p,`
Fix ctypes typing issue for Arrays 2023-03-31 07:20:15 +00:00			`last_n_tokens_data, # type: Array[llama_token]`
Black formatting 2023-03-24 18:35:41 +00:00			`last_n_tokens_size: c_int,`
			`top_k: c_int,`
Update llama.cpp 2023-03-29 01:10:23 +00:00			`top_p: c_float,`
			`temp: c_float,`
			`repeat_penalty: c_float,`
Black formatting 2023-03-24 18:35:41 +00:00			`) -> llama_token:`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`return _lib.llama_sample_top_p_top_k(`
Black formatting 2023-03-24 18:35:41 +00:00			`ctx, last_n_tokens_data, last_n_tokens_size, top_k, top_p, temp, repeat_penalty`
			`)`

Initial commit 2023-03-23 09:33:06 +00:00
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_sample_top_p_top_k.argtypes = [`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00			`llama_context_p,`
			`llama_token_p,`
			`c_int,`
			`c_int,`
Update llama.cpp 2023-03-29 01:10:23 +00:00			`c_float,`
			`c_float,`
			`c_float,`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00			`]`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_sample_top_p_top_k.restype = llama_token`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00

			`# Performance information`

Black formatting 2023-03-24 18:59:29 +00:00
Initial commit 2023-03-23 09:33:06 +00:00			`def llama_print_timings(ctx: llama_context_p):`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_print_timings(ctx)`
Initial commit 2023-03-23 09:33:06 +00:00
Black formatting 2023-03-24 18:35:41 +00:00
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_print_timings.argtypes = [llama_context_p]`
			`_lib.llama_print_timings.restype = None`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00

Initial commit 2023-03-23 09:33:06 +00:00			`def llama_reset_timings(ctx: llama_context_p):`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_reset_timings(ctx)`
Initial commit 2023-03-23 09:33:06 +00:00
Black formatting 2023-03-24 18:35:41 +00:00
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_reset_timings.argtypes = [llama_context_p]`
			`_lib.llama_reset_timings.restype = None`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00

			`# Print system information`
Initial commit 2023-03-23 09:33:06 +00:00			`def llama_print_system_info() -> bytes:`
Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`return _lib.llama_print_system_info()`
Update llama.cpp and re-organize low-level api 2023-03-24 18:58:42 +00:00

Bugfix: cross-platform method to find shared lib 2023-03-24 22:43:29 +00:00			`_lib.llama_print_system_info.argtypes = []`
			`_lib.llama_print_system_info.restype = c_char_p`