From 71e3e4c435826677ff671f1d82748fe1dd4d64e1 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Wed, 31 Jan 2024 10:41:42 -0500
Subject: [PATCH] Update llama.cpp

---
 llama_cpp/llama_cpp.py | 41 ++++++++++++++++++++++++++++++++++-------
 vendor/llama.cpp       |  2 +-
 2 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 2168579..431a99f 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -98,7 +98,7 @@ ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE(c_bool, c_void_p, c_bool, c_
 # llama.h bindings
 
 _lib.llama_max_devices.argtypes = []
-_lib.llama_max_devices.restype = ctypes.c_int32
+_lib.llama_max_devices.restype = ctypes.c_size_t
 
 LLAMA_MAX_DEVICES = _lib.llama_max_devices()
 
@@ -390,7 +390,7 @@ class llama_model_kv_override(Structure):
 #     // LLAMA_SPLIT_LAYER: ignored
 #     int32_t main_gpu;
 
-#     // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+#     // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
 #     const float * tensor_split;
 
 #     // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
@@ -417,7 +417,7 @@ class llama_model_params(Structure):
         n_gpu_layers (int): number of layers to store in VRAM
         split_mode (int): how to split the model across multiple GPUs
         main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
-        tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
         progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
         kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
@@ -760,16 +760,43 @@ _lib.llama_time_us.argtypes = []
 _lib.llama_time_us.restype = ctypes.c_int64
 
 
-# LLAMA_API int32_t llama_max_devices(void);
+# LLAMA_API size_t llama_max_devices(void);
 def llama_max_devices() -> int:
     return _lib.llama_max_devices()
 
 
 _lib.llama_max_devices.argtypes = []
-_lib.llama_max_devices.restype = ctypes.c_int32
+_lib.llama_max_devices.restype = ctypes.c_size_t
 
 
-# LLAMA_API bool llama_mmap_supported (void);
+# LLAMA_API bool llama_supports_mmap (void);
+def llama_supports_mmap() -> bool:
+    return _lib.llama_supports_mmap()
+
+
+_lib.llama_supports_mmap.argtypes = []
+_lib.llama_supports_mmap.restype = c_bool
+
+
+# LLAMA_API bool llama_supports_mlock (void);
+def llama_supports_mlock() -> bool:
+    return _lib.llama_supports_mlock()
+
+
+_lib.llama_supports_mlock.argtypes = []
+_lib.llama_supports_mlock.restype = c_bool
+
+
+# LLAMA_API bool llama_supports_gpu_offload(void);
+def llama_supports_gpu_offload() -> bool:
+    return _lib.llama_supports_gpu_offload()
+
+
+_lib.llama_supports_gpu_offload.argtypes = []
+_lib.llama_supports_gpu_offload.restype = c_bool
+
+
+# LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
 def llama_mmap_supported() -> bool:
     return _lib.llama_mmap_supported()
 
@@ -778,7 +805,7 @@ _lib.llama_mmap_supported.argtypes = []
 _lib.llama_mmap_supported.restype = c_bool
 
 
-# LLAMA_API bool llama_mlock_supported(void);
+# LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
 def llama_mlock_supported() -> bool:
     return _lib.llama_mlock_supported()
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 1560630..5cb04db 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 15606309a05ccf7fadbaad5538cb7c32acb1e06b
+Subproject commit 5cb04dbc16d1da38c8fdcc0111b40e67d00dd1c3
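
Usage note (not part of the patch): a minimal sketch of calling the capability helpers this change exposes. It assumes the low-level bindings module is importable as llama_cpp.llama_cpp (the file patched above); the function names are taken directly from the diff, everything else is illustrative.

# sketch.py -- illustrative only, not part of the patch
# Assumes the low-level bindings module is importable as llama_cpp.llama_cpp.
import llama_cpp.llama_cpp as llama_cpp

if __name__ == "__main__":
    # llama_max_devices() is now typed as size_t on the C side (was int32_t);
    # the binding still returns a plain Python int.
    print("max devices:    ", llama_cpp.llama_max_devices())

    # New capability checks that supersede the deprecated
    # llama_mmap_supported() / llama_mlock_supported() helpers.
    print("mmap supported: ", llama_cpp.llama_supports_mmap())
    print("mlock supported:", llama_cpp.llama_supports_mlock())
    print("GPU offload:    ", llama_cpp.llama_supports_gpu_offload())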