From 8be7d67f7e649196b63210408bd7bb54ef1cf791 Mon Sep 17 00:00:00 2001
From: bretello
Date: Mon, 24 Jul 2023 14:42:37 +0200
Subject: [PATCH 1/5] raise exception when `llama_load_model_from_file` fails

---
 llama_cpp/llama_cpp.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index eea26ac..949c6af 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -367,7 +367,10 @@ _lib.llama_backend_free.restype = None
 def llama_load_model_from_file(
     path_model: bytes, params: llama_context_params
 ) -> llama_model_p:
-    return _lib.llama_load_model_from_file(path_model, params)
+    result = _lib.llama_load_model_from_file(path_model, params)
+    if result is None:
+        raise Exception(f"Failed to load model from {path_model}")
+    return result


 _lib.llama_load_model_from_file.argtypes = [c_char_p, llama_context_params]

From 985d559971cf2db595c7947d38074558eb9d893b Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 24 Jul 2023 13:04:34 -0400
Subject: [PATCH 2/5] Update llama.cpp

---
 llama_cpp/llama_cpp.py | 95 +++++++++++++++++++++++++++++++++++++++---
 vendor/llama.cpp       |  2 +-
 2 files changed, 91 insertions(+), 6 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index eea26ac..c9d79b9 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -159,11 +159,13 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)


 # struct llama_context_params {
-# uint32_t seed; // RNG seed, -1 for random
-# int32_t n_ctx; // text context
-# int32_t n_batch; // prompt processing batch size
-# int32_t n_gpu_layers; // number of layers to store in VRAM
-# int32_t main_gpu; // the GPU that is used for scratch and small tensors
+# uint32_t seed; // RNG seed, -1 for random
+# int32_t n_ctx; // text context
+# int32_t n_batch; // prompt processing batch size
+# int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
+# int32_t n_gpu_layers; // number of layers to store in VRAM
+# int32_t main_gpu; // the GPU that is used for scratch and small tensors
+#
 # const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

 # // ref: https://github.com/ggerganov/llama.cpp/pull/2054
@@ -190,6 +192,7 @@ class llama_context_params(Structure):
         ("seed", c_uint32),
         ("n_ctx", c_int32),
         ("n_batch", c_int32),
+        ("n_gqa", c_int32),
         ("n_gpu_layers", c_int32),
         ("main_gpu", c_int32),
         ("tensor_split", POINTER(c_float)),
@@ -265,6 +268,57 @@ class llama_model_quantize_params(Structure):
     ]


+# // grammar types
+# struct llama_grammar;
+llama_grammar_p = c_void_p
+
+# // grammar element type
+# enum llama_gretype {
+# // end of rule definition
+# LLAMA_GRETYPE_END = 0,
+
+# // start of alternate definition for rule
+# LLAMA_GRETYPE_ALT = 1,
+
+# // non-terminal element: reference to rule
+# LLAMA_GRETYPE_RULE_REF = 2,
+
+# // terminal element: character (code point)
+# LLAMA_GRETYPE_CHAR = 3,
+
+# // inverse char(s) ([^a], [^a-b] [^abc])
+# LLAMA_GRETYPE_CHAR_NOT = 4,
+
+# // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+# // be an inclusive range ([a-z])
+# LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+# // modifies a preceding LLAMA_GRETYPE_CHAR or
+# // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+# LLAMA_GRETYPE_CHAR_ALT = 6,
+# };
+LLAMA_GRETYPE_END = c_int(0)
+LLAMA_GRETYPE_ALT = c_int(1)
+LLAMA_GRETYPE_RULE_REF = c_int(2)
+LLAMA_GRETYPE_CHAR = c_int(3)
+LLAMA_GRETYPE_CHAR_NOT = c_int(4)
+LLAMA_GRETYPE_CHAR_RNG_UPPER = c_int(5)
+LLAMA_GRETYPE_CHAR_ALT = c_int(6)
+
+
+# typedef struct llama_grammar_element {
+# enum llama_gretype type;
+# uint32_t value; // Unicode code point or rule ID
+# } llama_grammar_element;
+class llama_grammar_element(Structure):
+    _fields_ = [
+        ("type", c_int),
+        ("value", c_uint32),
+    ]
+
+
+llama_grammar_element_p = POINTER(llama_grammar_element)
+
 # // performance timing information
 # struct llama_timings {
 # double t_start_ms;
@@ -871,6 +925,37 @@ _lib.llama_token_nl.argtypes = []
 _lib.llama_token_nl.restype = llama_token


+# // Grammar
+# //
+# LLAMA_API struct llama_grammar * llama_grammar_init(
+# const llama_grammar_element ** rules,
+# size_t n_rules,
+# size_t start_rule_index);
+def llama_grammar_init(
+    rules,  # type: Array[llama_grammar_element_p] # type: ignore
+    n_rules: c_size_t,
+    start_rule_index: c_size_t,
+) -> llama_grammar_p:
+    return _lib.llama_grammar_init(rules, n_rules, start_rule_index)
+
+
+_lib.llama_grammar_init.argtypes = [
+    POINTER(llama_grammar_element_p),
+    c_size_t,
+    c_size_t,
+]
+_lib.llama_grammar_init.restype = llama_grammar_p
+
+
+# LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+def llama_grammar_free(grammar: llama_grammar_p):
+    return _lib.llama_grammar_free(grammar)
+
+
+_lib.llama_grammar_free.argtypes = [llama_grammar_p]
+_lib.llama_grammar_free.restype = None
+
+
 # Sampling functions


diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index d924522..84e09a7 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit d924522a46c5ef097af4a88087d91673e8e87e4d
+Subproject commit 84e09a7d8bc4ab6d658b5cd81295ac0add60be78

From d8a3ddbb1cf4d3a9051f778351caf44550b9caed Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 24 Jul 2023 13:08:06 -0400
Subject: [PATCH 3/5] Update llama.cpp

---
 llama_cpp/llama_cpp.py | 2 ++
 vendor/llama.cpp       | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index c9d79b9..423a4a0 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -163,6 +163,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
 # int32_t n_ctx; // text context
 # int32_t n_batch; // prompt processing batch size
 # int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
+# float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
 # int32_t n_gpu_layers; // number of layers to store in VRAM
 # int32_t main_gpu; // the GPU that is used for scratch and small tensors
 #
@@ -193,6 +194,7 @@ class llama_context_params(Structure):
         ("n_ctx", c_int32),
         ("n_batch", c_int32),
         ("n_gqa", c_int32),
+        ("rms_norm_eps", c_float),
         ("n_gpu_layers", c_int32),
         ("main_gpu", c_int32),
         ("tensor_split", POINTER(c_float)),
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 84e09a7..41c6741 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 84e09a7d8bc4ab6d658b5cd81295ac0add60be78
+Subproject commit 41c674161fb2459bdf7806d1eebead15bc5d046e

From 401309d11c3eccc1ef491e37dd0cb454874b455f Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 24 Jul 2023 13:11:10 -0400
Subject: [PATCH 4/5] Revert "Merge pull request #521 from bretello/main"

This reverts commit 07f0f3a3860aca25682d1088f0da93b4a894fd1d, reversing changes made to d8a3ddbb1cf4d3a9051f778351caf44550b9caed.

---
 llama_cpp/llama_cpp.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 87a3b9a..423a4a0 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -423,10 +423,7 @@ _lib.llama_backend_free.restype = None
 def llama_load_model_from_file(
     path_model: bytes, params: llama_context_params
 ) -> llama_model_p:
-    result = _lib.llama_load_model_from_file(path_model, params)
-    if result is None:
-        raise Exception(f"Failed to load model from {path_model}")
-    return result
+    return _lib.llama_load_model_from_file(path_model, params)


 _lib.llama_load_model_from_file.argtypes = [c_char_p, llama_context_params]

From 4aaaec561d108438b26319b7c1beeb5d70acb95e Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 24 Jul 2023 13:12:38 -0400
Subject: [PATCH 5/5] Bump version

---
 CHANGELOG.md   | 4 ++++
 pyproject.toml | 2 +-
 setup.py       | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 360b8e8..56db2b3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.1.76]
+
+- (llama.cpp) Update llama.cpp add support for LLaMa 2 70B
+
 ## [0.1.75]

 - Update llama.cpp
diff --git a/pyproject.toml b/pyproject.toml
index 02273b9..5861af8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp_python"
-version = "0.1.75"
+version = "0.1.76"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen "]
 license = "MIT"
diff --git a/setup.py b/setup.py
index 48836df..5ea2781 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
     description="A Python wrapper for llama.cpp",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="0.1.75",
+    version="0.1.76",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",
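
Usage note (illustrative only, not part of the patches above): the sketch below shows how the grammar bindings introduced in PATCH 2/5 (`llama_grammar_element`, `llama_grammar_init`, `llama_grammar_free`) might be exercised once the series is applied. It assumes the module is importable as `llama_cpp.llama_cpp` and follows llama.cpp's convention that a rule is an array of elements terminated by LLAMA_GRETYPE_END; grammar-aware sampling is not bound by these patches, so the grammar handle is only created and released.

# Illustrative sketch only -- assumes the patch series above has been applied
# and that llama_cpp/llama_cpp.py loaded the shared library successfully.
import ctypes

from llama_cpp import llama_cpp

# Rule 0 ("root") matches the single character "a": one CHAR element
# followed by the END marker that terminates the rule.
root_rule = (llama_cpp.llama_grammar_element * 2)(
    llama_cpp.llama_grammar_element(llama_cpp.LLAMA_GRETYPE_CHAR.value, ord("a")),
    llama_cpp.llama_grammar_element(llama_cpp.LLAMA_GRETYPE_END.value, 0),
)

# llama_grammar_init takes an array of per-rule pointers plus the rule count
# and the index of the start rule.
rules = (llama_cpp.llama_grammar_element_p * 1)(
    ctypes.cast(root_rule, llama_cpp.llama_grammar_element_p)
)
grammar = llama_cpp.llama_grammar_init(rules, ctypes.c_size_t(1), ctypes.c_size_t(0))
if not grammar:
    raise RuntimeError("llama_grammar_init returned NULL")

# Grammar-aware sampling is not exposed by this patch series, so the handle
# is simply freed again here.
llama_cpp.llama_grammar_free(grammar)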