Merge branch 'main' into v0.2-wip

2023-07-24 13:19:54 -04:00 · 2023-07-24 13:19:54 -04:00 · 77c9f496b0
commit 77c9f496b0
parent 436036aa67 4aaaec561d
4 changed files with 98 additions and 7 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.1.76]
+
+- (llama.cpp) Update llama.cpp to add support for LLaMa 2 70B
+
 ## [0.1.75]

 - Update llama.cpp
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -162,8 +162,11 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
 #     uint32_t seed;         // RNG seed, -1 for random
 #     int32_t  n_ctx;        // text context
 #     int32_t  n_batch;      // prompt processing batch size
+#     int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
+#     float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
 #     int32_t  n_gpu_layers; // number of layers to store in VRAM
 #     int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
+#
 #     const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

 #     // ref: https://github.com/ggerganov/llama.cpp/pull/2054
@@ -190,6 +193,8 @@ class llama_context_params(Structure):
        ("seed", c_uint32),
        ("n_ctx", c_int32),
        ("n_batch", c_int32),
+        ("n_gqa", c_int32),
+        ("rms_norm_eps", c_float),
        ("n_gpu_layers", c_int32),
        ("main_gpu", c_int32),
        ("tensor_split", POINTER(c_float)),
@@ -265,6 +270,57 @@ class llama_model_quantize_params(Structure):
    ]


+# // grammar types
+# struct llama_grammar;
+llama_grammar_p = c_void_p  # opaque handle standing in for `struct llama_grammar *`
+
+# // grammar element type
+# enum llama_gretype {
+#     // end of rule definition
+#     LLAMA_GRETYPE_END            = 0,
+
+#     // start of alternate definition for rule
+#     LLAMA_GRETYPE_ALT            = 1,
+
+#     // non-terminal element: reference to rule
+#     LLAMA_GRETYPE_RULE_REF       = 2,
+
+#     // terminal element: character (code point)
+#     LLAMA_GRETYPE_CHAR           = 3,
+
+#     // inverse char(s) ([^a], [^a-b] [^abc])
+#     LLAMA_GRETYPE_CHAR_NOT       = 4,
+
+#     // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+#     // be an inclusive range ([a-z])
+#     LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+#     // modifies a preceding LLAMA_GRETYPE_CHAR or
+#     // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+#     LLAMA_GRETYPE_CHAR_ALT       = 6,
+# };
+LLAMA_GRETYPE_END = c_int(0)  # end of rule definition. NOTE(review): these are c_int instances, not plain ints - presumably callers must read .value; confirm
+LLAMA_GRETYPE_ALT = c_int(1)  # start of alternate definition for rule
+LLAMA_GRETYPE_RULE_REF = c_int(2)  # non-terminal element: reference to rule
+LLAMA_GRETYPE_CHAR = c_int(3)  # terminal element: character (code point)
+LLAMA_GRETYPE_CHAR_NOT = c_int(4)  # inverse char(s) ([^a], [^a-b] [^abc])
+LLAMA_GRETYPE_CHAR_RNG_UPPER = c_int(5)  # modifies a preceding char element to be an inclusive range ([a-z])
+LLAMA_GRETYPE_CHAR_ALT = c_int(6)  # modifies a preceding char element to add an alternate char ([ab], [a-zA])
+
+
+# typedef struct llama_grammar_element {
+#     enum llama_gretype type;
+#     uint32_t           value; // Unicode code point or rule ID
+# } llama_grammar_element;
+class llama_grammar_element(Structure):  # ctypes mirror of the C `llama_grammar_element` typedef quoted above
+    _fields_ = [  # field order follows the C struct: type first, then value
+        ("type", c_int),  # `enum llama_gretype type` in C, i.e. one of the LLAMA_GRETYPE_* values
+        ("value", c_uint32),  # Unicode code point or rule ID
+    ]
+
+
+llama_grammar_element_p = POINTER(llama_grammar_element)  # `llama_grammar_element *`; element type of the `rules` array in llama_grammar_init
+
 # // performance timing information
 # struct llama_timings {
 #     double t_start_ms;
@@ -871,6 +927,37 @@ _lib.llama_token_nl.argtypes = []
 _lib.llama_token_nl.restype = llama_token


+# // Grammar
+# //
+# LLAMA_API struct llama_grammar * llama_grammar_init(
+#         const llama_grammar_element ** rules,
+#                                 size_t    n_rules,
+#                                 size_t    start_rule_index);
+def llama_grammar_init(  # Python wrapper for the C llama_grammar_init whose prototype is quoted above
+    rules,  # type: Array[llama_grammar_element_p] # type: ignore
+    n_rules: c_size_t,  # `size_t n_rules` in C; presumably the length of `rules` - confirm
+    start_rule_index: c_size_t,  # `size_t start_rule_index` in C; forwarded unchanged
+) -> llama_grammar_p:
+    return _lib.llama_grammar_init(rules, n_rules, start_rule_index)  # pure pass-through. NOTE(review): restype is c_void_p, so ctypes presumably returns an int/None rather than a c_void_p instance - confirm
+
+
+_lib.llama_grammar_init.argtypes = [  # one entry per C parameter, in prototype order
+    POINTER(llama_grammar_element_p),  # const llama_grammar_element ** rules
+    c_size_t,  # size_t n_rules
+    c_size_t,  # size_t start_rule_index
+]
+_lib.llama_grammar_init.restype = llama_grammar_p  # struct llama_grammar * (opaque)
+
+
+# LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+def llama_grammar_free(grammar: llama_grammar_p):  # Python wrapper for C `void llama_grammar_free(struct llama_grammar * grammar)`
+    return _lib.llama_grammar_free(grammar)  # pure pass-through; the C function is void and restype is set to None below
+
+
+_lib.llama_grammar_free.argtypes = [llama_grammar_p]  # struct llama_grammar * grammar
+_lib.llama_grammar_free.restype = None  # C return type is void
+
+
 # Sampling functions


--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"

 [project]
 name = "llama_cpp_python"
-version = "0.1.75"
+version = "0.1.76"
 description = "Python bindings for the llama.cpp library"
 readme = "README.md"
 license = { text = "MIT" }
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit d924522a46c5ef097af4a88087d91673e8e87e4d
+Subproject commit 41c674161fb2459bdf7806d1eebead15bc5d046e