docs: add server config docs

Andrei Betlen 2023-12-22 14:37:24 -05:00
parent 6473796343
commit 522aecb868
2 changed files with 102 additions and 2 deletions

docs/server.md

@@ -32,6 +32,12 @@ python3 -m llama_cpp.server --help
NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable.
Check out the server options reference below for more information on the available options.
CLI arguments and environment variables are available for all of the fields defined in [`ServerSettings`](#llama_cpp.server.settings.ServerSettings) and [`ModelSettings`](#llama_cpp.server.settings.ModelSettings).
Additionally, the server supports configuration via a config file; check out the [configuration section](#configuration-and-multi-model-support) for more information and examples.
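For example, assuming a hypothetical model path, these two invocations are equivalent:
```bash
# Pass the model path as a CLI argument
python3 -m llama_cpp.server --model models/llama-2-7b.Q4_K_M.gguf

# Pass the same path via the MODEL environment variable
MODEL=models/llama-2-7b.Q4_K_M.gguf python3 -m llama_cpp.server
```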
## Guides
### Code Completion
@@ -122,3 +128,91 @@ response = client.chat.completions.create(
)
print(response)
```
## Configuration and Multi-Model Support
The server supports configuration via a JSON config file that can be passed using the `--config_file` parameter or the `CONFIG_FILE` environment variable.
```bash
python3 -m llama_cpp.server --config_file <config_file>
```
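Equivalently, via the `CONFIG_FILE` environment variable:
```bash
CONFIG_FILE=<config_file> python3 -m llama_cpp.server
```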
Config files support all of the server and model options available via the CLI and environment variables; however, instead of a single model, a config file can specify multiple models.
The server supports routing requests to multiple models based on the `model` parameter in the request, which is matched against the `model_alias` in the config file.
At the moment only a single model is loaded into memory at a time; the server will automatically load and unload models as needed.
```json
{
"host": "0.0.0.0",
"port": 8080,
"models": [
{
"model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
"model_alias": "gpt-3.5-turbo",
"chat_format": "chatml",
"n_gpu_layers": -1,
"offload_kqv": true,
"n_threads": 12,
"n_batch": 512,
"n_ctx": 2048
},
{
"model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
"model_alias": "gpt-4",
"chat_format": "chatml",
"n_gpu_layers": -1,
"offload_kqv": true,
"n_threads": 12,
"n_batch": 512,
"n_ctx": 2048
},
{
"model": "models/ggml_llava-v1.5-7b/ggml-model-q4_k.gguf",
"model_alias": "gpt-4-vision-preview",
"chat_format": "llava-1-5",
"clip_model_path": "models/ggml_llava-v1.5-7b/mmproj-model-f16.gguf",
"n_gpu_layers": -1,
"offload_kqv": true,
"n_threads": 12,
"n_batch": 512,
"n_ctx": 2048
},
{
"model": "models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf",
"model_alias": "text-davinci-003",
"n_gpu_layers": -1,
"offload_kqv": true,
"n_threads": 12,
"n_batch": 512,
"n_ctx": 2048
},
{
"model": "models/replit-code-v1_5-3b-GGUF/replit-code-v1_5-3b.Q4_0.gguf",
"model_alias": "copilot-codex",
"n_gpu_layers": -1,
"offload_kqv": true,
"n_threads": 12,
"n_batch": 1024,
"n_ctx": 9216
}
]
}
```
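With a config like the one above, clients select a model through the standard `model` parameter. Here is a minimal sketch using the `openai` Python client (the base URL assumes the `host`/`port` from the config above; the API key value is an arbitrary placeholder):
```python
from openai import OpenAI

# Point the client at the local llama-cpp-python server.
client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

# "gpt-3.5-turbo" is matched against model_alias, so this request is routed
# to the OpenHermes model defined in the config file above.
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Write a haiku about llamas."}],
)
print(response.choices[0].message.content)
```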
The config file format is defined by the [`ConfigFileSettings`](#llama_cpp.server.settings.ConfigFileSettings) class.
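As a rough sketch of how a config file maps onto this class (assuming pydantic v2's `model_validate_json`; not necessarily how the server itself parses the file):
```python
from llama_cpp.server.settings import ConfigFileSettings

# Validate a JSON config file against the ConfigFileSettings schema.
with open("config.json") as f:
    config = ConfigFileSettings.model_validate_json(f.read())

print(config.host, config.port)
print([m.model_alias for m in config.models])
```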
## Server Options Reference
::: llama_cpp.server.settings.ConfigFileSettings
options:
show_if_no_docstring: true
::: llama_cpp.server.settings.ServerSettings
options:
show_if_no_docstring: true
::: llama_cpp.server.settings.ModelSettings
options:
show_if_no_docstring: true

llama_cpp/server/settings.py

@@ -13,6 +13,8 @@ BaseSettings.model_config["protected_namespaces"] = ()
class ModelSettings(BaseSettings):
"""Model settings used to load a Llama model."""
model: str = Field(
description="The path to the model to use for generating completions."
)
@@ -131,6 +133,8 @@ class ModelSettings(BaseSettings):
class ServerSettings(BaseSettings):
"""Server settings used to configure the FastAPI and Uvicorn server."""
# Uvicorn Settings
host: str = Field(default="localhost", description="Listen address")
port: int = Field(default=8000, description="Listen port")
@@ -156,6 +160,8 @@ class Settings(ServerSettings, ModelSettings):
class ConfigFileSettings(ServerSettings):
"""Configuration file format settings."""
models: List[ModelSettings] = Field(
default=[], description="Model configs, overwrites default config"
default=[], description="Model configs"
)