add mul_mat_q parameter
This also fixes a crash when loading the 70B Llama 2 model on macOS with Metal and `n_gpu_layers=1`
parent 91bf8fac1b
commit 39978ccaf5
1 changed file with 2 additions and 0 deletions
@@ -181,6 +181,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
 
 # // Keep the booleans together to avoid misalignment during copy-by-value.
 # bool low_vram; // if true, reduce VRAM usage at the cost of performance
+# bool mul_mat_q; // if true, use experimental mul_mat_q kernels
 # bool f16_kv; // use fp16 for KV cache
 # bool logits_all; // the llama_eval() call computes all logits, not just the last one
 # bool vocab_only; // only load the vocabulary, no weights
@@ -203,6 +204,7 @@ class llama_context_params(Structure):
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", c_void_p),
         ("low_vram", c_bool),
+        ("mul_mat_q", c_bool),
         ("f16_kv", c_bool),
         ("logits_all", c_bool),
         ("vocab_only", c_bool),
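For reference, a minimal sketch of how the new field can be set from the Python side. It assumes the `llama_context_default_params()` binding and the `n_gpu_layers` field that the C API provides and this module exposes elsewhere; it is not part of this commit.

import llama_cpp

# Fetch default context parameters (binding assumed to mirror the C API).
params = llama_cpp.llama_context_default_params()
params.n_gpu_layers = 1   # offload one layer to Metal, as in the crash scenario above
params.mul_mat_q = True   # opt in to the experimental mul_mat_q kernels (new field)
params.f16_kv = True      # neighbouring boolean; the booleans stay adjacent in the struct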