Update server params. Added lora_base, lora_path, low_vram, and main_gpu. Removed rms_norm_eps and n_gqa (deprecated in llama.cpp)
parent 6a20293fc2
commit 2920c4bf7e
1 changed file with 46 additions and 39 deletions
@@ -34,12 +34,21 @@ class Settings(BaseSettings):
         default=None,
         description="The alias of the model to use for generating completions.",
     )
+    seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.")
     n_ctx: int = Field(default=2048, ge=1, description="The context size.")
+    n_batch: int = Field(
+        default=512, ge=1, description="The batch size to use per eval."
+    )
     n_gpu_layers: int = Field(
         default=0,
         ge=0,
         description="The number of layers to put on the GPU. The rest will be on the CPU.",
     )
+    main_gpu: int = Field(
+        default=0,
+        ge=0,
+        description="Main GPU to use.",
+    )
     tensor_split: Optional[List[float]] = Field(
         default=None,
         description="Split layers across multiple GPUs in proportion.",
@@ -50,35 +59,45 @@ class Settings(BaseSettings):
     rope_freq_scale: float = Field(
         default=1.0, description="RoPE frequency scaling factor"
     )
-    seed: int = Field(default=1337, description="Random seed. -1 for random.")
-    n_batch: int = Field(
-        default=512, ge=1, description="The batch size to use per eval."
+    low_vram: bool = Field(
+        default=False,
+        description="Whether to use less VRAM. This will reduce performance.",
     )
-    n_threads: int = Field(
-        default=max(multiprocessing.cpu_count() // 2, 1),
-        ge=1,
-        description="The number of threads to use.",
+    mul_mat_q: bool = Field(
+        default=True, description="if true, use experimental mul_mat_q kernels"
     )
     f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.")
-    use_mlock: bool = Field(
-        default=llama_cpp.llama_mlock_supported(),
-        description="Use mlock.",
+    logits_all: bool = Field(default=True, description="Whether to return logits.")
+    vocab_only: bool = Field(
+        default=False, description="Whether to only return the vocabulary."
     )
     use_mmap: bool = Field(
         default=llama_cpp.llama_mmap_supported(),
         description="Use mmap.",
     )
+    use_mlock: bool = Field(
+        default=llama_cpp.llama_mlock_supported(),
+        description="Use mlock.",
+    )
     embedding: bool = Field(default=True, description="Whether to use embeddings.")
-    low_vram: bool = Field(
-        default=False,
-        description="Whether to use less VRAM. This will reduce performance.",
+    n_threads: int = Field(
+        default=max(multiprocessing.cpu_count() // 2, 1),
+        ge=1,
+        description="The number of threads to use.",
     )
     last_n_tokens_size: int = Field(
         default=64,
         ge=0,
         description="Last n tokens to keep for repeat penalty calculation.",
     )
-    logits_all: bool = Field(default=True, description="Whether to return logits.")
+    lora_base: Optional[str] = Field(
+        default=None,
+        description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model."
+    )
+    lora_path: Optional[str] = Field(
+        default=None,
+        description="Path to a LoRA file to apply to the model.",
+    )
     cache: bool = Field(
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",
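Since Settings extends pydantic's BaseSettings, the new fields can also be set programmatically rather than through the environment. Below is a minimal usage sketch, not part of this commit: the import path and file paths are assumptions, and only the field names come from the diff above.

    # Hypothetical sketch: configure the new server params programmatically.
    # The import path is an assumption; all paths are placeholders.
    from llama_cpp.server.app import Settings

    settings = Settings(
        model="./models/llama-2-13b.q4_0.bin",      # placeholder model path
        main_gpu=0,       # new field: index of the primary GPU
        low_vram=True,    # new field: reduce VRAM use at some cost in speed
        lora_base="./models/llama-2-13b-f16.bin",   # new field: f16 base model for LoRA
        lora_path="./loras/my-adapter.bin",         # new field: LoRA adapter to apply
    )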
@@ -91,9 +110,6 @@ class Settings(BaseSettings):
         default=2 << 30,
         description="The size of the cache in bytes. Only used if cache is True.",
     )
-    vocab_only: bool = Field(
-        default=False, description="Whether to only return the vocabulary."
-    )
     verbose: bool = Field(
         default=True, description="Whether to print debug information."
     )
@@ -103,18 +119,6 @@ class Settings(BaseSettings):
         default=True,
         description="Whether to interrupt requests when a new request is received.",
     )
-    n_gqa: Optional[int] = Field(
-        default=None,
-        description="TEMPORARY: Set to 8 for Llama2 70B",
-    )
-    rms_norm_eps: Optional[float] = Field(
-        default=None,
-        description="TEMPORARY",
-    )
-    mul_mat_q: Optional[bool] = Field(
-        default=None,
-        description="TEMPORARY",
-    )
 
 
 class ErrorResponse(TypedDict):
@@ -334,24 +338,27 @@ def create_app(settings: Optional[Settings] = None):
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
+        seed=settings.seed,
+        n_ctx=settings.n_ctx,
+        n_batch=settings.n_batch,
         n_gpu_layers=settings.n_gpu_layers,
+        main_gpu=settings.main_gpu,
         tensor_split=settings.tensor_split,
         rope_freq_base=settings.rope_freq_base,
         rope_freq_scale=settings.rope_freq_scale,
-        seed=settings.seed,
+        low_vram=settings.low_vram,
+        mul_mat_q=settings.mul_mat_q,
         f16_kv=settings.f16_kv,
-        use_mlock=settings.use_mlock,
-        use_mmap=settings.use_mmap,
-        embedding=settings.embedding,
         logits_all=settings.logits_all,
-        n_threads=settings.n_threads,
-        n_batch=settings.n_batch,
-        n_ctx=settings.n_ctx,
-        last_n_tokens_size=settings.last_n_tokens_size,
         vocab_only=settings.vocab_only,
+        use_mmap=settings.use_mmap,
+        use_mlock=settings.use_mlock,
+        embedding=settings.embedding,
+        n_threads=settings.n_threads,
+        last_n_tokens_size=settings.last_n_tokens_size,
+        lora_base=settings.lora_base,
+        lora_path=settings.lora_path,
         verbose=settings.verbose,
-        n_gqa=settings.n_gqa,
-        rms_norm_eps=settings.rms_norm_eps,
     )
     if settings.cache:
        if settings.cache_type == "disk":
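For completeness, a sketch of wiring such settings into a running server, assuming the create_app signature shown in the hunk above and that uvicorn is installed (host and port are placeholders):

    # Hypothetical sketch: build the app from explicit settings and serve it.
    import uvicorn
    from llama_cpp.server.app import create_app, Settings  # import path is an assumption

    app = create_app(settings=Settings(model="./models/llama-2-13b.q4_0.bin"))
    uvicorn.run(app, host="127.0.0.1", port=8000)  # placeholder host/port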