From 8be7d67f7e649196b63210408bd7bb54ef1cf791 Mon Sep 17 00:00:00 2001
From: bretello
Date: Mon, 24 Jul 2023 14:42:37 +0200
Subject: [PATCH 1/5] raise exception when `llama_load_model_from_file` fails

---
 llama_cpp/llama_cpp.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index eea26ac..949c6af 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -367,7 +367,10 @@ _lib.llama_backend_free.restype = None
 def llama_load_model_from_file(
     path_model: bytes, params: llama_context_params
 ) -> llama_model_p:
-    return _lib.llama_load_model_from_file(path_model, params)
+    result = _lib.llama_load_model_from_file(path_model, params)
+    if result is None:
+        raise Exception(f"Failed to load model from {path_model}")
+    return result


 _lib.llama_load_model_from_file.argtypes = [c_char_p, llama_context_params]

From 985d559971cf2db595c7947d38074558eb9d893b Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 24 Jul 2023 13:04:34 -0400
Subject: [PATCH 2/5] Update llama.cpp

---
 llama_cpp/llama_cpp.py | 95 +++++++++++++++++++++++++++++++++++++++---
 vendor/llama.cpp       |  2 +-
 2 files changed, 91 insertions(+), 6 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index eea26ac..c9d79b9 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -159,11 +159,13 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)


 # struct llama_context_params {
-# uint32_t seed; // RNG seed, -1 for random
-# int32_t n_ctx; // text context
-# int32_t n_batch; // prompt processing batch size
-# int32_t n_gpu_layers; // number of layers to store in VRAM
-# int32_t main_gpu; // the GPU that is used for scratch and small tensors
+# uint32_t seed; // RNG seed, -1 for random
+# int32_t n_ctx; // text context
+# int32_t n_batch; // prompt processing batch size
+# int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
+# int32_t n_gpu_layers; // number of layers to store in VRAM
+# int32_t main_gpu; // the GPU that is used for scratch and small tensors
+#
 # const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

 # // ref: https://github.com/ggerganov/llama.cpp/pull/2054
@@ -190,6 +192,7 @@ class llama_context_params(Structure):
         ("seed", c_uint32),
         ("n_ctx", c_int32),
         ("n_batch", c_int32),
+        ("n_gqa", c_int32),
         ("n_gpu_layers", c_int32),
         ("main_gpu", c_int32),
         ("tensor_split", POINTER(c_float)),
@@ -265,6 +268,57 @@ class llama_model_quantize_params(Structure):
     ]


+# // grammar types
+# struct llama_grammar;
+llama_grammar_p = c_void_p
+
+# // grammar element type
+# enum llama_gretype {
+# // end of rule definition
+# LLAMA_GRETYPE_END = 0,
+
+# // start of alternate definition for rule
+# LLAMA_GRETYPE_ALT = 1,
+
+# // non-terminal element: reference to rule
+# LLAMA_GRETYPE_RULE_REF = 2,
+
+# // terminal element: character (code point)
+# LLAMA_GRETYPE_CHAR = 3,
+
+# // inverse char(s) ([^a], [^a-b] [^abc])
+# LLAMA_GRETYPE_CHAR_NOT = 4,
+
+# // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+# // be an inclusive range ([a-z])
+# LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+# // modifies a preceding LLAMA_GRETYPE_CHAR or
+# // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+# LLAMA_GRETYPE_CHAR_ALT = 6,
+# };
+LLAMA_GRETYPE_END = c_int(0)
+LLAMA_GRETYPE_ALT = c_int(1)
+LLAMA_GRETYPE_RULE_REF = c_int(2)
+LLAMA_GRETYPE_CHAR = c_int(3)
+LLAMA_GRETYPE_CHAR_NOT = c_int(4)
+LLAMA_GRETYPE_CHAR_RNG_UPPER = c_int(5)
+LLAMA_GRETYPE_CHAR_ALT = c_int(6)
+
+
+# typedef struct llama_grammar_element {
+# enum llama_gretype type;
+# uint32_t value; // Unicode code point or rule ID
+# } llama_grammar_element;
+class llama_grammar_element(Structure):
+    _fields_ = [
+        ("type", c_int),
+        ("value", c_uint32),
+    ]
+
+
+llama_grammar_element_p = POINTER(llama_grammar_element)
+
 # // performance timing information
 # struct llama_timings {
 # double t_start_ms;
@@ -871,6 +925,37 @@ _lib.llama_token_nl.argtypes = []
 _lib.llama_token_nl.restype = llama_token


+# // Grammar
+# //
+# LLAMA_API struct llama_grammar * llama_grammar_init(
+# const llama_grammar_element ** rules,
+# size_t n_rules,
+# size_t start_rule_index);
+def llama_grammar_init(
+    rules,  # type: Array[llama_grammar_element_p] # type: ignore
+    n_rules: c_size_t,
+    start_rule_index: c_size_t,
+) -> llama_grammar_p:
+    return _lib.llama_grammar_init(rules, n_rules, start_rule_index)
+
+
+_lib.llama_grammar_init.argtypes = [
+    POINTER(llama_grammar_element_p),
+    c_size_t,
+    c_size_t,
+]
+_lib.llama_grammar_init.restype = llama_grammar_p
+
+
+# LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+def llama_grammar_free(grammar: llama_grammar_p):
+    return _lib.llama_grammar_free(grammar)
+
+
+_lib.llama_grammar_free.argtypes = [llama_grammar_p]
+_lib.llama_grammar_free.restype = None
+
+
 # Sampling functions


diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index d924522..84e09a7 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit d924522a46c5ef097af4a88087d91673e8e87e4d
+Subproject commit 84e09a7d8bc4ab6d658b5cd81295ac0add60be78

From d8a3ddbb1cf4d3a9051f778351caf44550b9caed Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 24 Jul 2023 13:08:06 -0400
Subject: [PATCH 3/5] Update llama.cpp

---
 llama_cpp/llama_cpp.py | 2 ++
 vendor/llama.cpp       | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index c9d79b9..423a4a0 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -163,6 +163,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
 # int32_t n_ctx; // text context
 # int32_t n_batch; // prompt processing batch size
 # int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
+# float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
 # int32_t n_gpu_layers; // number of layers to store in VRAM
 # int32_t main_gpu; // the GPU that is used for scratch and small tensors
 #
@@ -193,6 +194,7 @@ class llama_context_params(Structure):
         ("n_ctx", c_int32),
         ("n_batch", c_int32),
         ("n_gqa", c_int32),
+        ("rms_norm_eps", c_float),
         ("n_gpu_layers", c_int32),
         ("main_gpu", c_int32),
         ("tensor_split", POINTER(c_float)),
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 84e09a7..41c6741 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 84e09a7d8bc4ab6d658b5cd81295ac0add60be78
+Subproject commit 41c674161fb2459bdf7806d1eebead15bc5d046e

From 401309d11c3eccc1ef491e37dd0cb454874b455f Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 24 Jul 2023 13:11:10 -0400
Subject: [PATCH 4/5] Revert "Merge pull request #521 from bretello/main"

This reverts commit 07f0f3a3860aca25682d1088f0da93b4a894fd1d, reversing changes made to d8a3ddbb1cf4d3a9051f778351caf44550b9caed.

---
 llama_cpp/llama_cpp.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 87a3b9a..423a4a0 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -423,10 +423,7 @@ _lib.llama_backend_free.restype = None
 def llama_load_model_from_file(
     path_model: bytes, params: llama_context_params
 ) -> llama_model_p:
-    result = _lib.llama_load_model_from_file(path_model, params)
-    if result is None:
-        raise Exception(f"Failed to load model from {path_model}")
-    return result
+    return _lib.llama_load_model_from_file(path_model, params)


 _lib.llama_load_model_from_file.argtypes = [c_char_p, llama_context_params]

From 4aaaec561d108438b26319b7c1beeb5d70acb95e Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 24 Jul 2023 13:12:38 -0400
Subject: [PATCH 5/5] Bump version

---
 CHANGELOG.md   | 4 ++++
 pyproject.toml | 2 +-
 setup.py       | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 360b8e8..56db2b3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.1.76]
+
+- (llama.cpp) Update llama.cpp add support for LLaMa 2 70B
+
 ## [0.1.75]

 - Update llama.cpp
diff --git a/pyproject.toml b/pyproject.toml
index 02273b9..5861af8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp_python"
-version = "0.1.75"
+version = "0.1.76"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen "]
 license = "MIT"
diff --git a/setup.py b/setup.py
index 48836df..5ea2781 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
     description="A Python wrapper for llama.cpp",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="0.1.75",
+    version="0.1.76",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",
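
Usage note (illustrative only, not part of the patches above): the sketch below shows how the grammar bindings introduced in PATCH 2/5 (`llama_grammar_element`, `llama_grammar_init`, `llama_grammar_free`) might be exercised once the series is applied. It assumes the module is importable as `llama_cpp.llama_cpp` and follows llama.cpp's convention that a rule is an array of elements terminated by LLAMA_GRETYPE_END; grammar-aware sampling is not bound by these patches, so the grammar handle is only created and released.

# Illustrative sketch only -- assumes the patch series above has been applied
# and that llama_cpp/llama_cpp.py loaded the shared library successfully.
import ctypes

from llama_cpp import llama_cpp

# Rule 0 ("root") matches the single character "a": one CHAR element
# followed by the END marker that terminates the rule.
root_rule = (llama_cpp.llama_grammar_element * 2)(
    llama_cpp.llama_grammar_element(llama_cpp.LLAMA_GRETYPE_CHAR.value, ord("a")),
    llama_cpp.llama_grammar_element(llama_cpp.LLAMA_GRETYPE_END.value, 0),
)

# llama_grammar_init takes an array of per-rule pointers plus the rule count
# and the index of the start rule.
rules = (llama_cpp.llama_grammar_element_p * 1)(
    ctypes.cast(root_rule, llama_cpp.llama_grammar_element_p)
)
grammar = llama_cpp.llama_grammar_init(rules, ctypes.c_size_t(1), ctypes.c_size_t(0))
if not grammar:
    raise RuntimeError("llama_grammar_init returned NULL")

# Grammar-aware sampling is not exposed by this patch series, so the handle
# is simply freed again here.
llama_cpp.llama_grammar_free(grammar)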