diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 04de046..aef4f65 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -165,12 +165,16 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) # int32_t n_gpu_layers; // number of layers to store in VRAM # int32_t main_gpu; // the GPU that is used for scratch and small tensors # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs + +# // ref: https://github.com/ggerganov/llama.cpp/pull/2054 +# float rope_freq_base; // RoPE base frequency +# float rope_freq_scale; // RoPE frequency scaling factor + # // called with a progress value between 0 and 1, pass NULL to disable # llama_progress_callback progress_callback; # // context pointer passed to the progress callback # void * progress_callback_user_data; - # // Keep the booleans together to avoid misalignment during copy-by-value. # bool low_vram; // if true, reduce VRAM usage at the cost of performance # bool f16_kv; // use fp16 for KV cache @@ -188,6 +192,8 @@ class llama_context_params(Structure): ("n_gpu_layers", c_int32), ("main_gpu", c_int32), ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), + ("rope_freq_base", c_float), + ("rope_freq_scale", c_float), ("progress_callback", llama_progress_callback), ("progress_callback_user_data", c_void_p), ("low_vram", c_bool), diff --git a/vendor/llama.cpp b/vendor/llama.cpp index a6803ca..6e7cca4 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit a6803cab946c817fb7aaf2a40b317f5d3e373bd1 +Subproject commit 6e7cca404748dd4b1a3affd0d1296e37f4ac0a6f