diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index c9d79b9..423a4a0 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -163,6 +163,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
 #     int32_t n_ctx;        // text context
 #     int32_t n_batch;      // prompt processing batch size
 #     int32_t n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
+#     float   rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
 #     int32_t n_gpu_layers; // number of layers to store in VRAM
 #     int32_t main_gpu;     // the GPU that is used for scratch and small tensors
 #
@@ -193,6 +194,7 @@ class llama_context_params(Structure):
         ("n_ctx", c_int32),
         ("n_batch", c_int32),
         ("n_gqa", c_int32),
+        ("rms_norm_eps", c_float),
         ("n_gpu_layers", c_int32),
         ("main_gpu", c_int32),
         ("tensor_split", POINTER(c_float)),
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 84e09a7..41c6741 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 84e09a7d8bc4ab6d658b5cd81295ac0add60be78
+Subproject commit 41c674161fb2459bdf7806d1eebead15bc5d046e
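
For context, a minimal sketch of how the new field becomes reachable from the Python side once this patch is applied. `llama_context_default_params()` is the module's existing helper that returns a populated `llama_context_params` struct; the epsilon value shown is only illustrative, not a default introduced by this change.

```python
# Minimal sketch (not part of the patch): setting the new rms_norm_eps
# field through the ctypes struct added above.
import llama_cpp

params = llama_cpp.llama_context_default_params()
params.rms_norm_eps = 1e-5  # stored as c_float; e.g. value used by some LLaMA variants

# The struct is then passed to the usual model-loading call, e.g.:
# model = llama_cpp.llama_load_model_from_file(b"/path/to/model.bin", params)
```

Note that because the field sits between `n_gqa` and `n_gpu_layers` in `_fields_`, the Python struct layout must match the C struct in the pinned llama.cpp submodule exactly, which is why the submodule bump to 41c6741 ships in the same commit.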