feat(server): Add support for pulling models from Huggingface Hub (#1222)

* Basic support for hf pull on server

* Add hf_model_repo_id setting

* Update README
Andrei 2024-02-26 14:35:08 -05:00 committed by GitHub
parent b3e358dee4
commit 4d574bd765
3 changed files with 24 additions and 2 deletions

README.md

@@ -577,6 +577,12 @@ python3 -m llama_cpp.server --model models/7B/llama-model.gguf --chat_format cha
 That will format the prompt according to how model expects it. You can find the prompt format in the model card.
 For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_format.py) and look for lines starting with "@register_chat_format".
 
+If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub.
+
+```bash
+python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen1.5-0.5B-Chat-GGUF --model '*q8_0.gguf'
+```
+
 ### Web Server Features
 - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
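
For reference, the flag maps onto `Llama.from_pretrained`, where the `--model` value acts as a filename pattern matched against the files in the repo. A minimal sketch of the equivalent programmatic call, assuming `huggingface-hub` is installed and the repo contains a matching GGUF file:

```python
from llama_cpp import Llama

# Roughly what the server does with the flags above: resolve the glob
# against the repo's file list, download the matching file via
# huggingface-hub (cached locally), then load it.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q8_0.gguf",  # pattern matched against filenames in the repo
)
```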

llama_cpp/server/model.py

@@ -120,9 +120,20 @@ class LlamaProxy:
                     kv_overrides[key] = float(value)
                 else:
                     raise ValueError(f"Unknown value type {value_type}")
 
-        _model = llama_cpp.Llama(
-            model_path=settings.model,
+        import functools
+
+        kwargs = {}
+
+        if settings.hf_model_repo_id is not None:
+            create_fn = functools.partial(llama_cpp.Llama.from_pretrained, repo_id=settings.hf_model_repo_id, filename=settings.model)
+        else:
+            create_fn = llama_cpp.Llama
+            kwargs["model_path"] = settings.model
+
+        _model = create_fn(
+            **kwargs,
             # Model Params
             n_gpu_layers=settings.n_gpu_layers,
             main_gpu=settings.main_gpu,
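
The diff funnels both load paths through a single call site: `functools.partial` pre-binds the Hub-specific arguments so that `create_fn` takes the same remaining keyword arguments either way. A standalone sketch of that dispatch pattern, using stand-in loader functions rather than the real `Llama` API:

```python
import functools

# Stand-ins for llama_cpp.Llama.from_pretrained and llama_cpp.Llama,
# here only to illustrate the dispatch.
def from_pretrained(*, repo_id, filename, **common):
    return f"hub:{repo_id}/{filename} {common}"

def local_ctor(*, model_path, **common):
    return f"local:{model_path} {common}"

def build(hf_model_repo_id, model):
    kwargs = {}
    if hf_model_repo_id is not None:
        # Pre-bind the Hub arguments; the shared call below no longer
        # needs to know which constructor was chosen.
        create_fn = functools.partial(
            from_pretrained, repo_id=hf_model_repo_id, filename=model
        )
    else:
        create_fn = local_ctor
        kwargs["model_path"] = model
    return create_fn(**kwargs, n_gpu_layers=0)  # single shared call site

print(build("Qwen/Qwen1.5-0.5B-Chat-GGUF", "*q8_0.gguf"))
print(build(None, "models/7B/llama-model.gguf"))
```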

llama_cpp/server/settings.py

@@ -143,6 +143,11 @@ class ModelSettings(BaseSettings):
         default=None,
         description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().",
     )
+    # Loading from HuggingFace Model Hub
+    hf_model_repo_id: Optional[str] = Field(
+        default=None,
+        description="The model repo id to use for the HuggingFace tokenizer model.",
+    )
     # Speculative Decoding
     draft_model: Optional[str] = Field(
         default=None,
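
Because `ModelSettings` extends pydantic's `BaseSettings`, the new field can also come from the environment rather than a CLI flag. A minimal sketch of that behavior, trimmed to the fields involved here (the real class has many more, and the exact CLI wiring is the server's concern):

```python
import os
from typing import Optional

from pydantic import Field
from pydantic_settings import BaseSettings

class ModelSettings(BaseSettings):
    # Trimmed to the two fields relevant to this diff.
    model: str = Field(
        description="Model path, or a filename pattern when pulling from the Hub."
    )
    hf_model_repo_id: Optional[str] = Field(
        default=None,
        description="The model repo id to use for the HuggingFace tokenizer model.",
    )

# BaseSettings reads environment variables case-insensitively by field name,
# so this mirrors passing --model / --hf_model_repo_id on the command line.
os.environ["MODEL"] = "*q8_0.gguf"
os.environ["HF_MODEL_REPO_ID"] = "Qwen/Qwen1.5-0.5B-Chat-GGUF"

settings = ModelSettings()
print(settings.hf_model_repo_id, settings.model)
```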