Reorder init params to match llama.cpp order
commit 6a20293fc2
parent c8f9b8a734

1 changed file with 23 additions and 28 deletions
@@ -214,10 +214,16 @@ class Llama:
         model_path: str,
         *,
         # NOTE: These parameters are likely to change in the future.
+        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
         n_ctx: int = 512,
-        n_parts: int = -1,
+        n_batch: int = 512,
         n_gpu_layers: int = 0,
-        seed: int = 1337,
+        main_gpu: int = 0,
+        tensor_split: Optional[List[float]] = None,
+        rope_freq_base: float = 10000.0,
+        rope_freq_scale: float = 1.0,
+        low_vram: bool = False,
+        mul_mat_q: bool = True,
         f16_kv: bool = True,
         logits_all: bool = False,
         vocab_only: bool = False,
@@ -225,17 +231,9 @@ class Llama:
         use_mlock: bool = False,
         embedding: bool = False,
         n_threads: Optional[int] = None,
-        n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
-        low_vram: bool = False,
-        tensor_split: Optional[List[float]] = None,
-        rope_freq_base: float = 10000.0,
-        rope_freq_scale: float = 1.0,
-        n_gqa: Optional[int] = None,  # (TEMPORARY) must be 8 for llama2 70b
-        rms_norm_eps: Optional[float] = None,  # (TEMPORARY)
-        mul_mat_q: Optional[bool] = None,
         verbose: bool = True,
         **kwargs  # type: ignore
     ):
@@ -243,10 +241,16 @@ class Llama:

         Args:
             model_path: Path to the model.
-            n_ctx: Maximum context size.
-            n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
             seed: Random seed. -1 for random.
+            n_ctx: Maximum context size.
+            n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
+            main_gpu: Main GPU to use.
+            tensor_split: Optional list of floats to split the model across multiple GPUs. If None, the model is not split.
+            rope_freq_base: Base frequency for rope sampling.
+            rope_freq_scale: Scale factor for rope sampling.
+            low_vram: Use low VRAM mode.
+            mul_mat_q: if true, use experimental mul_mat_q kernels
             f16_kv: Use half-precision for key/value cache.
             logits_all: Return logits for all tokens, not just the last token.
             vocab_only: Only load the vocabulary no weights.
@@ -254,14 +258,11 @@ class Llama:
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
             n_threads: Number of threads to use. If None, the number of threads is automatically determined.
-            n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
-            tensor_split: List of floats to split the model across multiple GPUs. If None, the model is not split.
-            rope_freq_base: Base frequency for rope sampling.
-            rope_freq_scale: Scale factor for rope sampling.
             verbose: Print verbose output to stderr.
+            kwargs: Unused keyword arguments (for additional backwards compatibility).

         Raises:
             ValueError: If the model path does not exist.
@@ -274,16 +275,20 @@ class Llama:
         self.model_path = model_path

         self.params = llama_cpp.llama_context_default_params()
+        self.params.seed = seed
         self.params.n_ctx = n_ctx
         self.params.n_gpu_layers = 0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
-        self.params.seed = seed
+        self.params.main_gpu = main_gpu
+        self.params.rope_freq_base = rope_freq_base
+        self.params.rope_freq_scale = rope_freq_scale
+        self.params.low_vram = low_vram
+        self.params.mul_mat_q = mul_mat_q
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
         self.params.vocab_only = vocab_only
         self.params.use_mmap = use_mmap if lora_path is None else False
         self.params.use_mlock = use_mlock
         self.params.embedding = embedding
-        self.params.low_vram = low_vram

         self.tensor_split = tensor_split
         self._p_tensor_split = None
@@ -296,12 +301,6 @@ class Llama:
             )  # keep a reference to the array so it is not gc'd
             self.params.tensor_split = self._c_tensor_split

-        self.params.rope_freq_base = rope_freq_base
-        self.params.rope_freq_scale = rope_freq_scale
-

-        if mul_mat_q is not None:
-            self.params.mul_mat_q = mul_mat_q
-
         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
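The unchanged context in the hunk above carries the reasoning behind the tensor_split handling: the ctypes array is kept as an attribute "so it is not gc'd". A minimal sketch of that pattern, assuming only standard ctypes; the helper name and exact array construction are illustrative, not the repository's code:

import ctypes
from typing import List, Optional

def make_tensor_split_array(tensor_split: Optional[List[float]]):
    """Illustrative helper: build a C float array from a per-GPU split list."""
    if tensor_split is None:
        return None
    FloatArray = ctypes.c_float * len(tensor_split)  # array type sized to the list
    c_array = FloatArray(*tensor_split)              # Python-owned C float array
    # As the diff's comment notes, the caller keeps a reference to this array
    # (for example on self) so it is not garbage collected while the C library
    # may still read the memory it points to.
    return c_array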
@@ -313,10 +312,6 @@ class Llama:
         self.lora_base = lora_base
         self.lora_path = lora_path

-        ### DEPRECATED ###
-        self.n_parts = n_parts
-        ### DEPRECATED ###
-
         if not os.path.exists(model_path):
             raise ValueError(f"Model path does not exist: {model_path}")

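Because every reordered parameter sits after the bare *, they are all keyword-only, so this reorder does not change how callers construct the class. A minimal usage sketch under that assumption; the model path is a placeholder and only a few of the keyword arguments from the new signature are shown:

from llama_cpp import Llama

llm = Llama(
    model_path="./models/7B/ggml-model.bin",  # placeholder path, not from this repository
    seed=1234,
    n_ctx=2048,
    n_batch=512,
    n_gpu_layers=-1,          # -1 offloads all layers, per the docstring above
    rope_freq_base=10000.0,
    rope_freq_scale=1.0,
    verbose=False,
)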