From b994296c7576067d0862247c284dbf6eae96a46f Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 5 Jul 2023 01:00:14 -0400
Subject: [PATCH] Update llama.cpp

---
 llama_cpp/llama_cpp.py | 28 ++++++++++++++++------------
 vendor/llama.cpp       |  2 +-
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 52fc14e..c68fb18 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -5,6 +5,8 @@ from ctypes import (
     c_int,
     c_float,
     c_char_p,
+    c_int32,
+    c_uint32,
     c_void_p,
     c_bool,
     POINTER,
@@ -105,6 +107,9 @@ LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_VERSION = c_int(1)
 
+# #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
+LLAMA_DEFAULT_SEED = c_int(0xFFFFFFFF)
+
 # struct llama_model;
 llama_model_p = c_void_p
 
@@ -153,18 +158,17 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
 
 
 # struct llama_context_params {
-#     int seed; // RNG seed, -1 for random
-#     int n_ctx; // text context
-#     int n_batch; // prompt processing batch size
-#     int n_gpu_layers; // number of layers to store in VRAM
-#     int main_gpu; // the GPU that is used for scratch and small tensors
+#     uint32_t seed; // RNG seed, -1 for random
+#     int32_t n_ctx; // text context
+#     int32_t n_batch; // prompt processing batch size
+#     int32_t n_gpu_layers; // number of layers to store in VRAM
+#     int32_t main_gpu; // the GPU that is used for scratch and small tensors
 #     float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
 #     // called with a progress value between 0 and 1, pass NULL to disable
 #     llama_progress_callback progress_callback;
 #     // context pointer passed to the progress callback
 #     void * progress_callback_user_data;
-
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool low_vram; // if true, reduce VRAM usage at the cost of performance
 #     bool f16_kv; // use fp16 for KV cache
@@ -176,11 +180,11 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
 # };
 class llama_context_params(Structure):
     _fields_ = [
-        ("seed", c_int),
-        ("n_ctx", c_int),
-        ("n_batch", c_int),
-        ("n_gpu_layers", c_int),
-        ("main_gpu", c_int),
+        ("seed", c_uint32),
+        ("n_ctx", c_int32),
+        ("n_batch", c_int32),
+        ("n_gpu_layers", c_int32),
+        ("main_gpu", c_int32),
         ("tensor_split", c_float * LLAMA_MAX_DEVICES.value),
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", c_void_p),
@@ -453,7 +457,7 @@ _lib.llama_get_kv_cache_token_count.restype = c_int
 
 # Sets the current rng seed.
 # LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
-def llama_set_rng_seed(ctx: llama_context_p, seed: c_int):
+def llama_set_rng_seed(ctx: llama_context_p, seed: c_uint32):
     return _lib.llama_set_rng_seed(ctx, seed)
 
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 96a712c..7f0e9a7 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 96a712ca1b7f427e3bd7ffc0c70b2105cfc7fbf1
+Subproject commit 7f0e9a775ecc4c6ade271c217f63d6dc93e79eaa
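
Note (not part of the patch): a minimal sketch of why the seed field's move from c_int to c_uint32 matters. The sentinel 0xFFFFFFFF (LLAMA_DEFAULT_SEED in this patch) does not fit in a signed 32-bit int, so the old field type silently wrapped it to -1, while the new unsigned type preserves it; the upstream comment "RNG seed, -1 for random" refers to that same 32-bit pattern.

    # Standalone ctypes demonstration; does not require llama.cpp to be loaded.
    from ctypes import c_int, c_uint32

    sentinel = 0xFFFFFFFF  # value assigned to LLAMA_DEFAULT_SEED in this patch

    print(c_int(sentinel).value)     # -1          (old signed field type wraps around)
    print(c_uint32(sentinel).value)  # 4294967295  (new unsigned field type preserves it)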