Update llama.cpp

parent 2b37d8e438
commit 71e3e4c435

2 changed files with 35 additions and 8 deletions
@@ -98,7 +98,7 @@ ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE(c_bool, c_void_p, c_bool, c_
 # llama.h bindings

 _lib.llama_max_devices.argtypes = []
-_lib.llama_max_devices.restype = ctypes.c_int32
+_lib.llama_max_devices.restype = ctypes.c_size_t

 LLAMA_MAX_DEVICES = _lib.llama_max_devices()

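The restype change above matters because ctypes assumes a C `int` return for any function whose restype is not declared; now that llama_max_devices() returns `size_t` in llama.h, the binding must declare `ctypes.c_size_t` or the value can be misread where `int` and `size_t` differ in width. Below is a minimal, self-contained sketch of the same pattern using libc's `strlen` (which also returns `size_t`) on a typical Linux or macOS Python; it is purely illustrative and not part of this diff.

import ctypes
import ctypes.util

# Illustration only: bind libc's strlen the same way llama_cpp binds
# llama_max_devices(). strlen returns size_t, so restype must be c_size_t;
# leaving the ctypes default (c_int) risks misreading the value on platforms
# where int is narrower than size_t.
libc = ctypes.CDLL(ctypes.util.find_library("c"))
libc.strlen.argtypes = [ctypes.c_char_p]
libc.strlen.restype = ctypes.c_size_t

assert libc.strlen(b"llama") == 5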
@@ -390,7 +390,7 @@ class llama_model_kv_override(Structure):
 # // LLAMA_SPLIT_LAYER: ignored
 # int32_t main_gpu;

-# // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+# // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
 # const float * tensor_split;

 # // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
@@ -417,7 +417,7 @@ class llama_model_params(Structure):
     n_gpu_layers (int): number of layers to store in VRAM
     split_mode (int): how to split the model across multiple GPUs
     main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
-    tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+    tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
     progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
     progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
     kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
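Tying the docstring above together, here is a hedged usage sketch of llama_model_params. It assumes this version of the bindings is importable as llama_cpp and that llama_model_default_params() is available; the concrete values are placeholders only.

import ctypes
import llama_cpp

# Size the tensor_split array from llama_max_devices(), as the updated
# docstring says; splitting evenly across devices is just an example policy.
n_devices = llama_cpp.llama_max_devices()
split = (ctypes.c_float * n_devices)(*([1.0 / n_devices] * n_devices))

params = llama_cpp.llama_model_default_params()
params.n_gpu_layers = 35      # placeholder value
params.main_gpu = 0           # ignored when layers are split per device (see docstring)
params.tensor_split = split   # array length must be llama_max_devices()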
@@ -760,16 +760,43 @@ _lib.llama_time_us.argtypes = []
 _lib.llama_time_us.restype = ctypes.c_int64


-# LLAMA_API int32_t llama_max_devices(void);
+# LLAMA_API size_t llama_max_devices(void);
 def llama_max_devices() -> int:
     return _lib.llama_max_devices()


 _lib.llama_max_devices.argtypes = []
-_lib.llama_max_devices.restype = ctypes.c_int32
+_lib.llama_max_devices.restype = ctypes.c_size_t


-# LLAMA_API bool llama_mmap_supported (void);
+# LLAMA_API bool llama_supports_mmap (void);
+def llama_supports_mmap() -> bool:
+    return _lib.llama_supports_mmap()
+
+
+_lib.llama_supports_mmap.argtypes = []
+_lib.llama_supports_mmap.restype = c_bool
+
+
+# LLAMA_API bool llama_supports_mlock (void);
+def llama_supports_mlock() -> bool:
+    return _lib.llama_supports_mlock()
+
+
+_lib.llama_supports_mlock.argtypes = []
+_lib.llama_supports_mlock.restype = c_bool
+
+
+# LLAMA_API bool llama_supports_gpu_offload(void);
+def llama_supports_gpu_offload() -> bool:
+    return _lib.llama_supports_gpu_offload()
+
+
+_lib.llama_supports_gpu_offload.argtypes = []
+_lib.llama_supports_gpu_offload.restype = c_bool
+
+
+# LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
 def llama_mmap_supported() -> bool:
     return _lib.llama_mmap_supported()

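The three llama_supports_* helpers added above are plain capability probes for the underlying llama.cpp build. A short usage sketch, assuming the updated bindings are installed as llama_cpp; the offload policy is an example only.

import llama_cpp

# Decide a loading strategy from build-time capabilities. The probes simply
# forward to the C functions bound in the diff above.
n_gpu_layers = -1 if llama_cpp.llama_supports_gpu_offload() else 0  # -1: offload all layers (example policy)
use_mmap = llama_cpp.llama_supports_mmap()
use_mlock = llama_cpp.llama_supports_mlock()

print(f"gpu_layers={n_gpu_layers} mmap={use_mmap} mlock={use_mlock}")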
@@ -778,7 +805,7 @@ _lib.llama_mmap_supported.argtypes = []
 _lib.llama_mmap_supported.restype = c_bool


-# LLAMA_API bool llama_mlock_supported(void);
+# LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
 def llama_mlock_supported() -> bool:
     return _lib.llama_mlock_supported()

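With llama_mmap_supported() and llama_mlock_supported() deprecated upstream in favor of the llama_supports_* names, callers migrate by renaming. Below is an illustrative compatibility shim, not part of the bindings, that prefers the new names but still runs against older wheels.

import llama_cpp

def supports_mlock() -> bool:
    # Prefer the new probe; fall back to the deprecated spelling when the
    # installed bindings predate this commit. Illustrative helper only.
    if hasattr(llama_cpp, "llama_supports_mlock"):
        return llama_cpp.llama_supports_mlock()
    return llama_cpp.llama_mlock_supported()

print("mlock available:", supports_mlock())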
vendor/llama.cpp (vendored)
@@ -1 +1 @@
-Subproject commit 15606309a05ccf7fadbaad5538cb7c32acb1e06b
+Subproject commit 5cb04dbc16d1da38c8fdcc0111b40e67d00dd1c3