From 39978ccaf5b8ca85bc6b72d719e746ea305ad37f Mon Sep 17 00:00:00 2001
From: bretello <bretello@distruzione.org>
Date: Thu, 3 Aug 2023 18:22:52 +0200
Subject: [PATCH] add `mul_mat_q` parameter

This also fixes a crash when loading the Llama 2 70B model on macOS with
Metal and `n_gpu_layers=1`.
---
 llama_cpp/llama_cpp.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 423a4a0..bbb2a1e 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -181,6 +181,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
 
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool low_vram;   // if true, reduce VRAM usage at the cost of performance
+#     bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
 #     bool f16_kv;     // use fp16 for KV cache
 #     bool logits_all; // the llama_eval() call computes all logits, not just the last one
 #     bool vocab_only; // only load the vocabulary, no weights
@@ -203,6 +204,7 @@ class llama_context_params(Structure):
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", c_void_p),
         ("low_vram", c_bool),
+        ("mul_mat_q", c_bool),
         ("f16_kv", c_bool),
         ("logits_all", c_bool),
         ("vocab_only", c_bool),