# llama.cpp/llama_cpp/server/settings.py

from __future__ import annotations

import multiprocessing

from typing import Optional, List, Literal, Union, Dict, cast
from typing_extensions import Self

from pydantic import Field, model_validator
from pydantic_settings import BaseSettings

import llama_cpp

# Disable the pydantic namespace warning for the `model` and `model_alias`
# settings declared below by clearing the protected namespaces.
# NOTE(review): this writes into BaseSettings.model_config itself rather than
# a subclass config, so it presumably applies to every settings class created
# afterwards in this process -- confirm that is intended.
BaseSettings.model_config["protected_namespaces"] = ()


class ModelSettings(BaseSettings):
    """Model settings used to load a Llama model.

    Every option is a pydantic field with a default, except ``model`` which
    is required. ``n_threads`` and ``n_threads_batch`` accept the sentinel
    ``-1``; it is replaced by ``multiprocessing.cpu_count()`` in
    ``set_dynamic_defaults`` before the per-field bounds are checked.
    """

    model: str = Field(
        description="The path to the model to use for generating completions."
    )
    model_alias: Optional[str] = Field(
        default=None,
        description="The alias of the model to use for generating completions.",
    )
    # Model Params
    n_gpu_layers: int = Field(
        default=0,
        ge=-1,
        description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.",
    )
    split_mode: int = Field(
        default=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
        description="The split mode to use.",
    )
    main_gpu: int = Field(
        default=0,
        ge=0,
        description="Main GPU to use.",
    )
    tensor_split: Optional[List[float]] = Field(
        default=None,
        description="Split layers across multiple GPUs in proportion.",
    )
    vocab_only: bool = Field(
        default=False, description="Whether to only return the vocabulary."
    )
    # The mmap/mlock defaults are queried from llama_cpp once, at import time.
    use_mmap: bool = Field(
        default=llama_cpp.llama_supports_mmap(),
        description="Use mmap.",
    )
    use_mlock: bool = Field(
        default=llama_cpp.llama_supports_mlock(),
        description="Use mlock.",
    )
    kv_overrides: Optional[List[str]] = Field(
        default=None,
        description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.",
    )
    # Context Params
    seed: int = Field(
        default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."
    )
    n_ctx: int = Field(default=2048, ge=0, description="The context size.")
    n_batch: int = Field(
        default=512, ge=1, description="The batch size to use per eval."
    )
    # The thread-count defaults are computed once, at import time. A value of
    # -1 never reaches the `ge` bounds below: set_dynamic_defaults rewrites it
    # to the CPU count before field validation runs.
    n_threads: int = Field(
        default=max(multiprocessing.cpu_count() // 2, 1),
        ge=1,
        description="The number of threads to use. Use -1 for max cpu threads",
    )
    n_threads_batch: int = Field(
        default=max(multiprocessing.cpu_count(), 1),
        ge=0,
        description="The number of threads to use when batch processing. Use -1 for max cpu threads",
    )
    rope_scaling_type: int = Field(
        default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
    )
    rope_freq_base: float = Field(default=0.0, description="RoPE base frequency")
    rope_freq_scale: float = Field(
        default=0.0, description="RoPE frequency scaling factor"
    )
    # NOTE(review): the yarn_* fields carry no description; presumably they
    # are YaRN context-extension parameters passed through to llama_cpp --
    # confirm against the Llama constructor.
    yarn_ext_factor: float = Field(default=-1.0)
    yarn_attn_factor: float = Field(default=1.0)
    yarn_beta_fast: float = Field(default=32.0)
    yarn_beta_slow: float = Field(default=1.0)
    yarn_orig_ctx: int = Field(default=0)
    mul_mat_q: bool = Field(
        default=True, description="if true, use experimental mul_mat_q kernels"
    )
    logits_all: bool = Field(default=True, description="Whether to return logits.")
    embedding: bool = Field(default=True, description="Whether to use embeddings.")
    offload_kqv: bool = Field(
        default=True, description="Whether to offload kqv to the GPU."
    )
    # Sampling Params
    last_n_tokens_size: int = Field(
        default=64,
        ge=0,
        description="Last n tokens to keep for repeat penalty calculation.",
    )
    # LoRA Params
    lora_base: Optional[str] = Field(
        default=None,
        description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.",
    )
    lora_path: Optional[str] = Field(
        default=None,
        description="Path to a LoRA file to apply to the model.",
    )
    # Backend Params
    numa: Union[bool, int] = Field(
        default=False,
        description="Enable NUMA support.",
    )
    # Chat Format Params
    chat_format: Optional[str] = Field(
        default=None,
        description="Chat format to use.",
    )
    clip_model_path: Optional[str] = Field(
        default=None,
        description="Path to a CLIP model to use for multi-modal chat completion.",
    )
    # Cache Params
    cache: bool = Field(
        default=False,
        description="Use a cache to reduce processing times for evaluated prompts.",
    )
    cache_type: Literal["ram", "disk"] = Field(
        default="ram",
        description="The type of cache to use. Only used if cache is True.",
    )
    cache_size: int = Field(
        default=2 << 30,  # 2 GiB
        description="The size of the cache in bytes. Only used if cache is True.",
    )
    # Tokenizer Options
    hf_tokenizer_config_path: Optional[str] = Field(
        default=None,
        description="The path to a HuggingFace tokenizer_config.json file.",
    )
    hf_pretrained_model_name_or_path: Optional[str] = Field(
        default=None,
        description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().",
    )
    # Loading from HuggingFace Model Hub
    hf_model_repo_id: Optional[str] = Field(
        default=None,
        description="The model repo id to use for the HuggingFace tokenizer model.",
    )
    # Speculative Decoding
    draft_model: Optional[str] = Field(
        default=None,
        description="Method to use for speculative decoding. One of (prompt-lookup-decoding).",
    )
    draft_model_num_pred_tokens: int = Field(
        default=10,
        description="Number of tokens to predict using the draft model.",
    )
    # KV Cache Quantization
    type_k: Optional[int] = Field(
        default=None,
        description="Type of the key cache quantization.",
    )
    type_v: Optional[int] = Field(
        default=None,
        description="Type of the value cache quantization.",
    )
    # Misc
    verbose: bool = Field(
        default=True, description="Whether to print debug information."
    )

    # mode="before": runs on the raw input, ahead of any field validation.
    @model_validator(mode="before")
    @classmethod
    def set_dynamic_defaults(cls, values: object) -> object:
        """Replace the ``-1`` thread-count sentinel with the CPU count.

        Args:
            values: The raw, not yet validated input for the model. Only
                ``dict`` input is inspected; anything else is returned
                untouched so that normal validation can deal with it.

        Returns:
            A shallow copy of ``values`` in which ``n_threads`` and
            ``n_threads_batch`` are set to ``multiprocessing.cpu_count()``
            wherever they were ``-1`` (or the string ``"-1"``), or the
            original object when it is not a ``dict``.
        """
        if not isinstance(values, dict):
            return values

        cpu_count = multiprocessing.cpu_count()
        # Work on a copy so the caller's mapping is never modified.
        resolved = dict(values)
        for name in ("n_threads", "n_threads_batch"):
            requested = resolved.get(name)
            if isinstance(requested, str):
                # Settings read from text sources such as environment
                # variables are still strings at this stage.
                wants_all_cpus = requested.strip() == "-1"
            else:
                wants_all_cpus = requested == -1
            if wants_all_cpus:
                resolved[name] = cpu_count
        return resolved


class ServerSettings(BaseSettings):
    """Options for the HTTP layer: the Uvicorn listener and the FastAPI app."""

    # --- Uvicorn: where and how to listen ---
    host: str = Field(
        default="localhost",
        description="Listen address",
    )
    port: int = Field(
        default=8000,
        description="Listen port",
    )
    ssl_keyfile: Optional[str] = Field(
        default=None,
        description="SSL key file for HTTPS",
    )
    ssl_certfile: Optional[str] = Field(
        default=None,
        description="SSL certificate file for HTTPS",
    )

    # --- FastAPI: request handling ---
    api_key: Optional[str] = Field(
        description="API key for authentication. If set all requests need to be authenticated.",
        default=None,
    )
    interrupt_requests: bool = Field(
        description="Whether to interrupt requests when a new request is received.",
        default=True,
    )
    disable_ping_events: bool = Field(
        description="Disable EventSource pings (may be needed for some clients).",
        default=False,
    )


class Settings(ServerSettings, ModelSettings):
    """Single flat settings object combining ServerSettings and ModelSettings."""


class ConfigFileSettings(ServerSettings):
    """Layout of a configuration file: server options plus a list of models."""

    # Each entry is validated as a complete ModelSettings; defaults to no models.
    models: List[ModelSettings] = Field(
        default=[],
        description="Model configs",
    )