Update llama.cpp

This commit is contained in:
Andrei Betlen 2024-01-13 22:37:49 -05:00
parent bb610b9428
commit 7c898d5684
2 changed files with 26 additions and 8 deletions

View file

@ -112,8 +112,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E
# define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
# define LLAMA_SESSION_VERSION 3
LLAMA_SESSION_VERSION = 3
# define LLAMA_SESSION_VERSION 4
LLAMA_SESSION_VERSION = 4
# struct llama_model;
@ -220,6 +220,14 @@ LLAMA_ROPE_SCALING_LINEAR = 1
LLAMA_ROPE_SCALING_YARN = 2
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN
# enum llama_split_mode {
# LLAMA_SPLIT_NONE = 0, // single GPU
# LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
# LLAMA_SPLIT_ROW = 2, // split rows across GPUs
# };
LLAMA_SPLIT_NONE = 0
LLAMA_SPLIT_LAYER = 1
LLAMA_SPLIT_ROW = 2
# typedef struct llama_token_data {
# llama_token id; // token id
@ -365,20 +373,28 @@ class llama_model_kv_override(Structure):
# struct llama_model_params {
# int32_t n_gpu_layers; // number of layers to store in VRAM
# int32_t main_gpu; // the GPU that is used for scratch and small tensors
# const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
# enum llama_split_mode split_mode; // how to split the model across multiple GPUs
# // main_gpu interpretation depends on split_mode:
# // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
# // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
# // LLAMA_SPLIT_LAYER: ignored
# int32_t main_gpu;
# // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
# const float * tensor_split;
# // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
# // If the provided progress_callback returns true, model loading continues.
# // If it returns false, model loading is immediately aborted.
# llama_progress_callback progress_callback;
# // context pointer passed to the progress callback
# void * progress_callback_user_data;
# // override key-value pairs of the model meta data
# const struct llama_model_kv_override * kv_overrides;
# // Keep the booleans together to avoid misalignment during copy-by-value.
# bool vocab_only; // only load the vocabulary, no weights
# bool use_mmap; // use mmap if possible
@ -389,8 +405,9 @@ class llama_model_params(Structure):
Attributes:
n_gpu_layers (int): number of layers to store in VRAM
main_gpu (int): the GPU that is used for scratch and small tensors
tensor_split (ctypes.Array[ctypes.c_float]): how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
split_mode (int): how to split the model across multiple GPUs
main_gpu (int): the GPU whose role depends on split_mode. LLAMA_SPLIT_NONE: the GPU that is used for the entire model; LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results; LLAMA_SPLIT_LAYER: ignored.
tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
@ -400,6 +417,7 @@ class llama_model_params(Structure):
_fields_ = [
("n_gpu_layers", c_int32),
("split_mode", c_int),
("main_gpu", c_int32),
("tensor_split", c_float_p),
("progress_callback", llama_progress_callback),

2
vendor/llama.cpp vendored

@ -1 +1 @@
Subproject commit 1d118386fea031f01550f8cd47a5c86296e5333f
Subproject commit 76484fbfd355df388f71d6edaa98e1692a74de7e