feat: Update llama.cpp
parent 221edb9ef1
commit 2292af5796
4 changed files with 41 additions and 37 deletions
llama_cpp/llama.py

@@ -65,7 +65,7 @@ class Llama:
         *,
         # Model Params
         n_gpu_layers: int = 0,
-        split_mode: int = llama_cpp.LLAMA_SPLIT_LAYER,
+        split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
         vocab_only: bool = False,
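For reference, a minimal sketch of constructing a model with the renamed constant; the model path and layer count are placeholders, not part of this commit:

import llama_cpp

llm = llama_cpp.Llama(
    model_path="models/llama-2-7b.Q4_K_M.gguf",   # placeholder path
    n_gpu_layers=-1,                              # illustrative: offload all layers
    split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,  # was LLAMA_SPLIT_LAYER before this commit
)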
@@ -78,7 +78,7 @@ class Llama:
         n_batch: int = 512,
         n_threads: Optional[int] = None,
         n_threads_batch: Optional[int] = None,
-        rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED,
+        rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
        rope_freq_base: float = 0.0,
        rope_freq_scale: float = 0.0,
        yarn_ext_factor: float = -1.0,
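The RoPE-scaling rename works the same way at the call site; a hedged sketch with illustrative values:

import llama_cpp

llm = llama_cpp.Llama(
    model_path="models/llama-2-7b.Q4_K_M.gguf",                 # placeholder path
    rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_TYPE_YARN,   # was LLAMA_ROPE_SCALING_YARN
    rope_freq_scale=0.5,                                        # illustrative scaling factor
)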
@@ -238,13 +238,13 @@ class Llama:
             for i, (k, v) in enumerate(kv_overrides.items()):
                 self._kv_overrides_array[i].key = k.encode("utf-8")
                 if isinstance(v, bool):
-                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
                     self._kv_overrides_array[i].value.bool_value = v
                 elif isinstance(v, int):
-                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT
                     self._kv_overrides_array[i].value.int_value = v
                 elif isinstance(v, float):
-                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_FLOAT
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
                     self._kv_overrides_array[i].value.float_value = v
                 else:
                     raise ValueError(f"Unknown value type for {k}: {v}")
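The branch order above matters: bool must be tested before int, because isinstance(True, int) is True in Python. A standalone sketch of the same dispatch:

import llama_cpp

def kv_override_tag(v):
    # Mirrors the isinstance chain above; bool first, since bool subclasses int.
    if isinstance(v, bool):
        return llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
    elif isinstance(v, int):
        return llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT
    elif isinstance(v, float):
        return llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
    raise ValueError(f"Unknown value type: {v}")

assert kv_override_tag(True) == llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL  # not ..._TYPE_INT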
@@ -270,7 +270,7 @@ class Llama:
         self.context_params.rope_scaling_type = (
             rope_scaling_type
             if rope_scaling_type is not None
-            else llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED
+            else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
         )
         self.context_params.rope_freq_base = (
             rope_freq_base if rope_freq_base != 0.0 else 0
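The None-to-sentinel fallback in isolation, as a sketch; -1 is the unspecified value defined in the constants below:

import llama_cpp

rope_scaling_type = None  # caller left it unset
resolved = (
    rope_scaling_type
    if rope_scaling_type is not None
    else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
)
assert resolved == -1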
llama_cpp/llama_cpp.py

@@ -279,35 +279,35 @@ LLAMA_FTYPE_MOSTLY_IQ3_M = 27
 LLAMA_FTYPE_GUESSED = 1024

 # enum llama_rope_scaling_type {
-#   LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
-#   LLAMA_ROPE_SCALING_NONE = 0,
-#   LLAMA_ROPE_SCALING_LINEAR = 1,
-#   LLAMA_ROPE_SCALING_YARN = 2,
-#   LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
+#   LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
+#   LLAMA_ROPE_SCALING_TYPE_NONE = 0,
+#   LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
+#   LLAMA_ROPE_SCALING_TYPE_YARN = 2,
+#   LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
 # };
-LLAMA_ROPE_SCALING_UNSPECIFIED = -1
-LLAMA_ROPE_SCALING_NONE = 0
-LLAMA_ROPE_SCALING_LINEAR = 1
-LLAMA_ROPE_SCALING_YARN = 2
-LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN
+LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1
+LLAMA_ROPE_SCALING_TYPE_NONE = 0
+LLAMA_ROPE_SCALING_TYPE_LINEAR = 1
+LLAMA_ROPE_SCALING_TYPE_YARN = 2
+LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN

 # enum llama_pooling_type {
-#   LLAMA_POOLING_NONE = 0,
-#   LLAMA_POOLING_MEAN = 1,
-#   LLAMA_POOLING_CLS = 2,
+#   LLAMA_POOLING_TYPE_NONE = 0,
+#   LLAMA_POOLING_TYPE_MEAN = 1,
+#   LLAMA_POOLING_TYPE_CLS = 2,
 # };
-LLAMA_POOLING_NONE = 0
-LLAMA_POOLING_MEAN = 1
-LLAMA_POOLING_CLS = 2
+LLAMA_POOLING_TYPE_NONE = 0
+LLAMA_POOLING_TYPE_MEAN = 1
+LLAMA_POOLING_TYPE_CLS = 2

 # enum llama_split_mode {
-#   LLAMA_SPLIT_NONE = 0, // single GPU
-#   LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
-#   LLAMA_SPLIT_ROW = 2, // split rows across GPUs
+#   LLAMA_SPLIT_MODE_NONE = 0, // single GPU
+#   LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
+#   LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
 # };
-LLAMA_SPLIT_NONE = 0
-LLAMA_SPLIT_LAYER = 1
-LLAMA_SPLIT_ROW = 2
+LLAMA_SPLIT_MODE_NONE = 0
+LLAMA_SPLIT_MODE_LAYER = 1
+LLAMA_SPLIT_MODE_ROW = 2


 # typedef struct llama_token_data {
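These renames track upstream llama.cpp, which prefixed its enum values with the enum name; the old spellings are removed outright, so downstream code that imported them breaks. A hypothetical compatibility shim (not shipped in this commit) could alias a few of them:

import llama_cpp

# Old name -> new name; a representative subset, not exhaustive.
_RENAMED = {
    "LLAMA_ROPE_SCALING_UNSPECIFIED": "LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED",
    "LLAMA_ROPE_SCALING_YARN": "LLAMA_ROPE_SCALING_TYPE_YARN",
    "LLAMA_POOLING_CLS": "LLAMA_POOLING_TYPE_CLS",
    "LLAMA_SPLIT_LAYER": "LLAMA_SPLIT_MODE_LAYER",
}
for _old, _new in _RENAMED.items():
    globals()[_old] = getattr(llama_cpp, _new)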
@@ -420,13 +420,13 @@ class llama_batch(ctypes.Structure):


 # enum llama_model_kv_override_type {
-#   LLAMA_KV_OVERRIDE_INT,
-#   LLAMA_KV_OVERRIDE_FLOAT,
-#   LLAMA_KV_OVERRIDE_BOOL,
+#   LLAMA_KV_OVERRIDE_TYPE_INT,
+#   LLAMA_KV_OVERRIDE_TYPE_FLOAT,
+#   LLAMA_KV_OVERRIDE_TYPE_BOOL,
 # };
-LLAMA_KV_OVERRIDE_INT = 0
-LLAMA_KV_OVERRIDE_FLOAT = 1
-LLAMA_KV_OVERRIDE_BOOL = 2
+LLAMA_KV_OVERRIDE_TYPE_INT = 0
+LLAMA_KV_OVERRIDE_TYPE_FLOAT = 1
+LLAMA_KV_OVERRIDE_TYPE_BOOL = 2


 # struct llama_model_kv_override {
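These tags are what the kv_overrides loop in class Llama (shown earlier) writes into the ctypes array: the Python type of each override value selects the tag. A sketch with illustrative keys, not taken from this commit:

import llama_cpp

llm = llama_cpp.Llama(
    model_path="models/llama-2-7b.Q4_K_M.gguf",  # placeholder path
    kv_overrides={
        "llama.context_length": 4096,  # int   -> LLAMA_KV_OVERRIDE_TYPE_INT
        "some.bool.key": True,         # bool  -> LLAMA_KV_OVERRIDE_TYPE_BOOL (hypothetical key)
        "some.float.key": 10000.0,     # float -> LLAMA_KV_OVERRIDE_TYPE_FLOAT (hypothetical key)
    },
)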
llama_cpp/server/settings.py

@@ -29,7 +29,7 @@ class ModelSettings(BaseSettings):
         description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.",
     )
     split_mode: int = Field(
-        default=llama_cpp.LLAMA_SPLIT_LAYER,
+        default=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         description="The split mode to use.",
     )
     main_gpu: int = Field(
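A sketch of the server-settings side, assuming the usual import path for this module; the model path is a placeholder:

import llama_cpp
from llama_cpp.server.settings import ModelSettings  # import path assumed

settings = ModelSettings(
    model="models/llama-2-7b.Q4_K_M.gguf",       # placeholder path
    split_mode=llama_cpp.LLAMA_SPLIT_MODE_NONE,  # override the new default
)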
@@ -74,7 +74,7 @@ class ModelSettings(BaseSettings):
         ge=0,
         description="The number of threads to use when batch processing.",
     )
-    rope_scaling_type: int = Field(default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED)
+    rope_scaling_type: int = Field(default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED)
     rope_freq_base: float = Field(default=0.0, description="RoPE base frequency")
     rope_freq_scale: float = Field(
         default=0.0, description="RoPE frequency scaling factor"
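Because ModelSettings is a pydantic BaseSettings, these fields can also be supplied through the environment (field names are matched case-insensitively by pydantic's default settings behavior); a sketch:

import os
import llama_cpp
from llama_cpp.server.settings import ModelSettings  # import path assumed

os.environ["ROPE_SCALING_TYPE"] = str(llama_cpp.LLAMA_ROPE_SCALING_TYPE_LINEAR)
os.environ["ROPE_FREQ_SCALE"] = "0.5"
settings = ModelSettings(model="models/llama-2-7b.Q4_K_M.gguf")  # placeholder path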
@@ -143,6 +143,10 @@ class ModelSettings(BaseSettings):
         default=None,
         description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().",
     )
+    hf_model_repo_id: Optional[str] = Field(
+        default=None,
+        description="The HuggingFace repo_id to use to load model files from",
+    )
     # Speculative Decoding
     draft_model: Optional[str] = Field(
         default=None,
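Going by its description, the new hf_model_repo_id field lets the server load model files from a HuggingFace repo instead of a local path; repo and filename below are illustrative, not from this commit:

from llama_cpp.server.settings import ModelSettings  # import path assumed

settings = ModelSettings(
    model="llama-2-7b.Q4_K_M.gguf",               # illustrative filename within the repo
    hf_model_repo_id="TheBloke/Llama-2-7B-GGUF",  # illustrative repo_id
)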
vendor/llama.cpp (vendored submodule)

@@ -1 +1 @@
-Subproject commit 9e359a4f47c1b2dceb99e29706c9f7403d32ab5e
+Subproject commit f7625019c51ca437a5840576d92362cfa710e4a2