diff --git a/README.md b/README.md
index e3d7643..8d0980e 100644
--- a/README.md
+++ b/README.md
@@ -577,6 +577,12 @@ python3 -m llama_cpp.server --model models/7B/llama-model.gguf --chat_format cha
 That will format the prompt according to how model expects it. You can find the prompt format in the model card.
 For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_format.py) and look for lines starting with "@register_chat_format".
 
+If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub. In this case, `--model` is interpreted as a filename or glob pattern matched against the files in the repo.
+
+```bash
+python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen1.5-0.5B-Chat-GGUF --model '*q8_0.gguf'
+```
+
 ### Web Server Features
 
 - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index 5308dc2..816d089 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -120,9 +120,23 @@ class LlamaProxy:
                         kv_overrides[key] = float(value)
                     else:
                         raise ValueError(f"Unknown value type {value_type}")
+
+        import functools
 
-        _model = llama_cpp.Llama(
-            model_path=settings.model,
+        kwargs = {}
+
+        if settings.hf_model_repo_id is not None:
+            create_fn = functools.partial(
+                llama_cpp.Llama.from_pretrained,
+                repo_id=settings.hf_model_repo_id,
+                filename=settings.model,
+            )
+        else:
+            create_fn = llama_cpp.Llama
+            kwargs["model_path"] = settings.model
+
+        _model = create_fn(
+            **kwargs,
             # Model Params
             n_gpu_layers=settings.n_gpu_layers,
             main_gpu=settings.main_gpu,
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index 8ac8df8..bd80674 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -143,6 +143,11 @@ class ModelSettings(BaseSettings):
         default=None,
         description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().",
     )
+    # Loading from HuggingFace Model Hub
+    hf_model_repo_id: Optional[str] = Field(
+        default=None,
+        description="The HuggingFace Model Hub repo id to download the model from. When set, the model field is used as the filename or glob pattern to match within the repo.",
+    )
     # Speculative Decoding
     draft_model: Optional[str] = Field(
         default=None,
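For reference, when `hf_model_repo_id` is set, the new branch in `model.py` is equivalent to calling `Llama.from_pretrained` directly, with the server's `model` setting passed as the `filename` glob. Below is a minimal sketch of that call, using the repo id and pattern from the README example above; the variable name and comments are illustrative only.

```python
# Sketch of the code path behind --hf_model_repo_id: download the repo file
# matching the glob via huggingface-hub, then construct a Llama instance.
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",  # --hf_model_repo_id
    filename="*q8_0.gguf",  # --model, matched as a glob against repo files
)
```

Any further keyword arguments (`n_gpu_layers`, `main_gpu`, and so on) are forwarded to the `Llama` constructor, which is what the `functools.partial` in `model.py` arranges for the server settings. This is also why `kwargs["model_path"]` is only set in the non-Hub branch, where `model` is still a local path.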