diff --git a/docs/server.md b/docs/server.md
index 4e1e562..cd351ba 100644
--- a/docs/server.md
+++ b/docs/server.md
@@ -32,6 +32,12 @@ python3 -m llama_cpp.server --help
 NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable.
 
+Check out the server options reference below for more information on the available options.
+CLI arguments and environment variables are available for all of the fields defined in [`ServerSettings`](#llama_cpp.server.settings.ServerSettings) and [`ModelSettings`](#llama_cpp.server.settings.ModelSettings).
+
+Additionally, the server supports configuration via a JSON config file; check out the [configuration section](#configuration-and-multi-model-support) for more information and examples.
+
+
 ## Guides
 
 ### Code Completion
@@ -121,4 +127,92 @@ response = client.chat.completions.create(
     ],
 )
 print(response)
-```
\ No newline at end of file
+```
+
+## Configuration and Multi-Model Support
+
+The server supports configuration via a JSON config file that can be passed using the `--config_file` parameter or the `CONFIG_FILE` environment variable.
+
+```bash
+python3 -m llama_cpp.server --config_file <config_file>
+```
+
+Config files support all of the server and model options supported by the CLI and environment variables; however, instead of only a single model, a config file can specify multiple models.
+
+The server supports routing requests to multiple models based on the `model` parameter in the request, which matches against the `model_alias` in the config file.
+
+At the moment only a single model is loaded into memory at a time; the server automatically loads and unloads models as needed.
+
+```json
+{
+    "host": "0.0.0.0",
+    "port": 8080,
+    "models": [
+        {
+            "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
+            "model_alias": "gpt-3.5-turbo",
+            "chat_format": "chatml",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 2048
+        },
+        {
+            "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
+            "model_alias": "gpt-4",
+            "chat_format": "chatml",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 2048
+        },
+        {
+            "model": "models/ggml_llava-v1.5-7b/ggml-model-q4_k.gguf",
+            "model_alias": "gpt-4-vision-preview",
+            "chat_format": "llava-1-5",
+            "clip_model_path": "models/ggml_llava-v1.5-7b/mmproj-model-f16.gguf",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 2048
+        },
+        {
+            "model": "models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf",
+            "model_alias": "text-davinci-003",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 2048
+        },
+        {
+            "model": "models/replit-code-v1_5-3b-GGUF/replit-code-v1_5-3b.Q4_0.gguf",
+            "model_alias": "copilot-codex",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 1024,
+            "n_ctx": 9216
+        }
+    ]
+}
+```
+
+The config file format is defined by the [`ConfigFileSettings`](#llama_cpp.server.settings.ConfigFileSettings) class.
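+
+For example, with the server started using the config above, a client selects a model per request simply by setting the `model` field to one of the configured aliases. The following is a minimal sketch using the `openai` Python client; the port and aliases come from the example config, while the base URL and dummy API key are illustrative assumptions:
+
+```python
+# Sketch: assumes the server was started with the example config above,
+# e.g. `python3 -m llama_cpp.server --config_file config.json`.
+import openai
+
+client = openai.OpenAI(
+    base_url="http://localhost:8080/v1",  # port taken from the example config
+    api_key="sk-no-key-required",         # dummy value; the example config sets no key
+)
+
+# Matches model_alias "gpt-3.5-turbo", so OpenHermes serves this request.
+chat = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": "Hello!"}],
+)
+print(chat.choices[0].message.content)
+
+# Matches model_alias "copilot-codex"; since only one model is resident at a
+# time, the server unloads the previous model and loads the Replit model.
+completion = client.completions.create(
+    model="copilot-codex",
+    prompt="def fib(n):",
+)
+print(completion.choices[0].text)
+```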
+
+## Server Options Reference
+
+::: llama_cpp.server.settings.ConfigFileSettings
+    options:
+        show_if_no_docstring: true
+
+::: llama_cpp.server.settings.ServerSettings
+    options:
+        show_if_no_docstring: true
+
+::: llama_cpp.server.settings.ModelSettings
+    options:
+        show_if_no_docstring: true
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index 53ead74..346b463 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -13,6 +13,8 @@ BaseSettings.model_config["protected_namespaces"] = ()
 
 
 class ModelSettings(BaseSettings):
+    """Model settings used to load a Llama model."""
+
     model: str = Field(
         description="The path to the model to use for generating completions."
     )
@@ -131,6 +133,8 @@ class ModelSettings(BaseSettings):
 
 
 class ServerSettings(BaseSettings):
+    """Server settings used to configure the FastAPI and Uvicorn server."""
+
     # Uvicorn Settings
     host: str = Field(default="localhost", description="Listen address")
     port: int = Field(default=8000, description="Listen port")
@@ -156,6 +160,8 @@ class Settings(ServerSettings, ModelSettings):
 
 
 class ConfigFileSettings(ServerSettings):
+    """Configuration file format settings."""
+
     models: List[ModelSettings] = Field(
-        default=[], description="Model configs, overwrites default config"
+        default=[], description="Model configs"
     )
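
As a usage note, `ConfigFileSettings` is an ordinary Pydantic model, so a config file like the one documented above can be parsed and validated directly. A minimal sketch, not the server's actual loading code; the `config.json` path is a hypothetical example:

```python
# Sketch: validate a multi-model config file against ConfigFileSettings.
from llama_cpp.server.settings import ConfigFileSettings

with open("config.json", "rb") as f:
    config = ConfigFileSettings.model_validate_json(f.read())

print(config.host, config.port)
for model in config.models:
    # model_alias is what a request's `model` field is matched against.
    print(model.model_alias, "->", model.model)
```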