diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 01a2e70..81bfce4 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -65,7 +65,7 @@ class Llama:
         *,
         # Model Params
         n_gpu_layers: int = 0,
-        split_mode: int = llama_cpp.LLAMA_SPLIT_LAYER,
+        split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
         vocab_only: bool = False,
@@ -78,7 +78,7 @@ class Llama:
         n_batch: int = 512,
         n_threads: Optional[int] = None,
         n_threads_batch: Optional[int] = None,
-        rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED,
+        rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         rope_freq_base: float = 0.0,
         rope_freq_scale: float = 0.0,
         yarn_ext_factor: float = -1.0,
@@ -238,13 +238,13 @@ class Llama:
             for i, (k, v) in enumerate(kv_overrides.items()):
                 self._kv_overrides_array[i].key = k.encode("utf-8")
                 if isinstance(v, bool):
-                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
                     self._kv_overrides_array[i].value.bool_value = v
                 elif isinstance(v, int):
-                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT
                     self._kv_overrides_array[i].value.int_value = v
                 elif isinstance(v, float):
-                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_FLOAT
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
                     self._kv_overrides_array[i].value.float_value = v
                 else:
                     raise ValueError(f"Unknown value type for {k}: {v}")
@@ -270,7 +270,7 @@ class Llama:
         self.context_params.rope_scaling_type = (
             rope_scaling_type
             if rope_scaling_type is not None
-            else llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED
+            else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
         )
         self.context_params.rope_freq_base = (
             rope_freq_base if rope_freq_base != 0.0 else 0
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index b8e74d7..37d4637 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -279,35 +279,35 @@ LLAMA_FTYPE_MOSTLY_IQ3_M = 27
 LLAMA_FTYPE_GUESSED = 1024
 
 # enum llama_rope_scaling_type {
-#     LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
-#     LLAMA_ROPE_SCALING_NONE = 0,
-#     LLAMA_ROPE_SCALING_LINEAR = 1,
-#     LLAMA_ROPE_SCALING_YARN = 2,
-#     LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
+#     LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
+#     LLAMA_ROPE_SCALING_TYPE_NONE = 0,
+#     LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
+#     LLAMA_ROPE_SCALING_TYPE_YARN = 2,
+#     LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
 # };
-LLAMA_ROPE_SCALING_UNSPECIFIED = -1
-LLAMA_ROPE_SCALING_NONE = 0
-LLAMA_ROPE_SCALING_LINEAR = 1
-LLAMA_ROPE_SCALING_YARN = 2
-LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN
+LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1
+LLAMA_ROPE_SCALING_TYPE_NONE = 0
+LLAMA_ROPE_SCALING_TYPE_LINEAR = 1
+LLAMA_ROPE_SCALING_TYPE_YARN = 2
+LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN
 
 # enum llama_pooling_type {
-#     LLAMA_POOLING_NONE = 0,
-#     LLAMA_POOLING_MEAN = 1,
-#     LLAMA_POOLING_CLS = 2,
+#     LLAMA_POOLING_TYPE_NONE = 0,
+#     LLAMA_POOLING_TYPE_MEAN = 1,
+#     LLAMA_POOLING_TYPE_CLS = 2,
 # };
-LLAMA_POOLING_NONE = 0
-LLAMA_POOLING_MEAN = 1
-LLAMA_POOLING_CLS = 2
+LLAMA_POOLING_TYPE_NONE = 0
+LLAMA_POOLING_TYPE_MEAN = 1
+LLAMA_POOLING_TYPE_CLS = 2
 
 # enum llama_split_mode {
-#     LLAMA_SPLIT_NONE = 0,  // single GPU
-#     LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
-#     LLAMA_SPLIT_ROW = 2,   // split rows across GPUs
+#     LLAMA_SPLIT_MODE_NONE = 0,  // single GPU
+#     LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
+#     LLAMA_SPLIT_MODE_ROW = 2,   // split rows across GPUs
 # };
-LLAMA_SPLIT_NONE = 0
-LLAMA_SPLIT_LAYER = 1
-LLAMA_SPLIT_ROW = 2
+LLAMA_SPLIT_MODE_NONE = 0
+LLAMA_SPLIT_MODE_LAYER = 1
+LLAMA_SPLIT_MODE_ROW = 2
 
 
 # typedef struct llama_token_data {
@@ -420,13 +420,13 @@ class llama_batch(ctypes.Structure):
 
 
 # enum llama_model_kv_override_type {
-#     LLAMA_KV_OVERRIDE_INT,
-#     LLAMA_KV_OVERRIDE_FLOAT,
-#     LLAMA_KV_OVERRIDE_BOOL,
+#     LLAMA_KV_OVERRIDE_TYPE_INT,
+#     LLAMA_KV_OVERRIDE_TYPE_FLOAT,
+#     LLAMA_KV_OVERRIDE_TYPE_BOOL,
 # };
-LLAMA_KV_OVERRIDE_INT = 0
-LLAMA_KV_OVERRIDE_FLOAT = 1
-LLAMA_KV_OVERRIDE_BOOL = 2
+LLAMA_KV_OVERRIDE_TYPE_INT = 0
+LLAMA_KV_OVERRIDE_TYPE_FLOAT = 1
+LLAMA_KV_OVERRIDE_TYPE_BOOL = 2
 
 
 # struct llama_model_kv_override {
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index 790c6b1..8989ffa 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -29,7 +29,7 @@ class ModelSettings(BaseSettings):
         description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.",
     )
     split_mode: int = Field(
-        default=llama_cpp.LLAMA_SPLIT_LAYER,
+        default=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         description="The split mode to use.",
     )
     main_gpu: int = Field(
@@ -74,7 +74,7 @@ class ModelSettings(BaseSettings):
         ge=0,
         description="The number of threads to use when batch processing.",
     )
-    rope_scaling_type: int = Field(default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED)
+    rope_scaling_type: int = Field(default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED)
     rope_freq_base: float = Field(default=0.0, description="RoPE base frequency")
     rope_freq_scale: float = Field(
         default=0.0, description="RoPE frequency scaling factor"
     )
@@ -143,6 +143,10 @@ class ModelSettings(BaseSettings):
         default=None,
         description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().",
     )
+    hf_model_repo_id: Optional[str] = Field(
+        default=None,
+        description="The HuggingFace repo_id to use to load model files from",
+    )
     # Speculative Decoding
     draft_model: Optional[str] = Field(
         default=None,
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 9e359a4..f762501 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 9e359a4f47c1b2dceb99e29706c9f7403d32ab5e
+Subproject commit f7625019c51ca437a5840576d92362cfa710e4a2
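
A minimal usage sketch (not taken from the diff) of how a caller would pass the renamed constants to Llama after this change; the model path and the kv_overrides key are placeholders chosen for illustration.

# Sketch only: assumes a local GGUF file at a placeholder path.
import llama_cpp
from llama_cpp import Llama

llm = Llama(
    model_path="./models/model.gguf",             # placeholder path
    n_gpu_layers=-1,                              # offload all layers to the GPU
    split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,  # was LLAMA_SPLIT_LAYER
    rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,  # was LLAMA_ROPE_SCALING_UNSPECIFIED
    # Each kv_overrides value is tagged internally with
    # LLAMA_KV_OVERRIDE_TYPE_BOOL / _INT / _FLOAT based on its Python type.
    kv_overrides={"tokenizer.ggml.add_bos_token": True},
)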