Merge branch 'main' into v0.2-wip

Commit 77c9f496b0

4 changed files with 98 additions and 7 deletions
CHANGELOG.md

@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
 
+## [0.1.76]
+
+- (llama.cpp) Update llama.cpp; add support for LLaMa 2 70B
+
 ## [0.1.75]
 
 - Update llama.cpp
llama_cpp/llama_cpp.py

@@ -159,11 +159,14 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
 
 
 # struct llama_context_params {
 #     uint32_t seed;         // RNG seed, -1 for random
 #     int32_t  n_ctx;        // text context
 #     int32_t  n_batch;      // prompt processing batch size
-#     int32_t  n_gpu_layers; // number of layers to store in VRAM
-#     int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
+#     int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
+#     float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
+#     int32_t  n_gpu_layers; // number of layers to store in VRAM
+#     int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
+#
 #     const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
 #     // ref: https://github.com/ggerganov/llama.cpp/pull/2054
@@ -190,6 +193,8 @@ class llama_context_params(Structure):
         ("seed", c_uint32),
         ("n_ctx", c_int32),
         ("n_batch", c_int32),
+        ("n_gqa", c_int32),
+        ("rms_norm_eps", c_float),
         ("n_gpu_layers", c_int32),
         ("main_gpu", c_int32),
         ("tensor_split", POINTER(c_float)),
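The two fields added here mirror the temporary hyperparameters llama.cpp introduced for LLaMA 2 70B. A minimal sketch of setting them through the bindings, assuming the module's existing llama_context_default_params binding is in scope; the concrete 70B values shown (n_gqa of 8, rms_norm_eps of 1e-5) are commonly cited for that model but are illustrative assumptions, not part of this diff:

import llama_cpp

# Start from the library defaults, then override the new (temporary) hparams.
params = llama_cpp.llama_context_default_params()
params.n_gqa = 8            # grouped-query attention factor used by LLaMA 2 70B
params.rms_norm_eps = 1e-5  # rms norm epsilon for LLaMA 2 models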
@@ -265,6 +270,57 @@ class llama_model_quantize_params(Structure):
     ]
 
+
+# // grammar types
+# struct llama_grammar;
+llama_grammar_p = c_void_p
+
+# // grammar element type
+# enum llama_gretype {
+#     // end of rule definition
+#     LLAMA_GRETYPE_END = 0,
+
+#     // start of alternate definition for rule
+#     LLAMA_GRETYPE_ALT = 1,
+
+#     // non-terminal element: reference to rule
+#     LLAMA_GRETYPE_RULE_REF = 2,
+
+#     // terminal element: character (code point)
+#     LLAMA_GRETYPE_CHAR = 3,
+
+#     // inverse char(s) ([^a], [^a-b] [^abc])
+#     LLAMA_GRETYPE_CHAR_NOT = 4,
+
+#     // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+#     // be an inclusive range ([a-z])
+#     LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+#     // modifies a preceding LLAMA_GRETYPE_CHAR or
+#     // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+#     LLAMA_GRETYPE_CHAR_ALT = 6,
+# };
+LLAMA_GRETYPE_END = c_int(0)
+LLAMA_GRETYPE_ALT = c_int(1)
+LLAMA_GRETYPE_RULE_REF = c_int(2)
+LLAMA_GRETYPE_CHAR = c_int(3)
+LLAMA_GRETYPE_CHAR_NOT = c_int(4)
+LLAMA_GRETYPE_CHAR_RNG_UPPER = c_int(5)
+LLAMA_GRETYPE_CHAR_ALT = c_int(6)
+
+
+# typedef struct llama_grammar_element {
+#     enum llama_gretype type;
+#     uint32_t           value; // Unicode code point or rule ID
+# } llama_grammar_element;
+class llama_grammar_element(Structure):
+    _fields_ = [
+        ("type", c_int),
+        ("value", c_uint32),
+    ]
+
+
+llama_grammar_element_p = POINTER(llama_grammar_element)
+
 
 # // performance timing information
 # struct llama_timings {
 #     double t_start_ms;
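A quick illustration (not part of the diff) of how these definitions fit together, assuming the names defined in this module are in scope: a rule is a ctypes array of llama_grammar_element values terminated by LLAMA_GRETYPE_END, so a rule matching the single character "a" could be built as:

# Hypothetical rule: match exactly the character "a".
# Each element is (type, value); CHAR carries a Unicode code point
# and END (value 0) terminates the rule.
rule_a = (llama_grammar_element * 2)(
    llama_grammar_element(LLAMA_GRETYPE_CHAR.value, ord("a")),
    llama_grammar_element(LLAMA_GRETYPE_END.value, 0),
)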
@@ -871,6 +927,37 @@ _lib.llama_token_nl.argtypes = []
 _lib.llama_token_nl.restype = llama_token
 
 
+# // Grammar
+# //
+# LLAMA_API struct llama_grammar * llama_grammar_init(
+#         const llama_grammar_element ** rules,
+#         size_t    n_rules,
+#         size_t    start_rule_index);
+def llama_grammar_init(
+    rules,  # type: Array[llama_grammar_element_p] # type: ignore
+    n_rules: c_size_t,
+    start_rule_index: c_size_t,
+) -> llama_grammar_p:
+    return _lib.llama_grammar_init(rules, n_rules, start_rule_index)
+
+
+_lib.llama_grammar_init.argtypes = [
+    POINTER(llama_grammar_element_p),
+    c_size_t,
+    c_size_t,
+]
+_lib.llama_grammar_init.restype = llama_grammar_p
+
+
+# LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+def llama_grammar_free(grammar: llama_grammar_p):
+    return _lib.llama_grammar_free(grammar)
+
+
+_lib.llama_grammar_free.argtypes = [llama_grammar_p]
+_lib.llama_grammar_free.restype = None
+
+
 # Sampling functions
 
 
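Taken together with the element definitions above, a minimal end-to-end sketch of the two new bindings (illustrative, not code from the diff, and assuming this module's names are in scope): wrap one rule in an array of element pointers, as the rules parameter expects, initialize with rule 0 as the start rule, and free the opaque handle afterwards.

from ctypes import cast

# One rule: the single character "a", then end-of-rule.
rule_a = (llama_grammar_element * 2)(
    llama_grammar_element(LLAMA_GRETYPE_CHAR.value, ord("a")),
    llama_grammar_element(LLAMA_GRETYPE_END.value, 0),
)
# llama_grammar_init takes llama_grammar_element ** (one pointer per rule).
rules = (llama_grammar_element_p * 1)(cast(rule_a, llama_grammar_element_p))
grammar = llama_grammar_init(rules, c_size_t(1), c_size_t(0))
llama_grammar_free(grammar)  # release the handle when finished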
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "llama_cpp_python"
-version = "0.1.75"
+version = "0.1.76"
 description = "Python bindings for the llama.cpp library"
 readme = "README.md"
 license = { text = "MIT" }
vendor/llama.cpp (vendored)

@@ -1 +1 @@
-Subproject commit d924522a46c5ef097af4a88087d91673e8e87e4d
+Subproject commit 41c674161fb2459bdf7806d1eebead15bc5d046e