feat(server): Add support for pulling models from Huggingface Hub (#1222)

* Basic support for hf pull on server
* Add hf_model_repo_id setting
* Update README

parent b3e358dee4
commit 4d574bd765

3 changed files with 24 additions and 2 deletions
@@ -577,6 +577,12 @@ python3 -m llama_cpp.server --model models/7B/llama-model.gguf --chat_format cha
 That will format the prompt according to how model expects it. You can find the prompt format in the model card.
 For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_format.py) and look for lines starting with "@register_chat_format".
 
+If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub.
+
+```bash
+python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen1.5-0.5B-Chat-GGUF --model '*q8_0.gguf'
+```
+
 ### Web Server Features
 
 - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
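If you load models from Python rather than through the server, the new flag corresponds roughly to the `Llama.from_pretrained` constructor. A minimal sketch, assuming `llama-cpp-python` and `huggingface-hub` are installed; the repo id and filename glob are just the ones from the README example above:

```python
# Sketch: pull a GGUF file from the Hugging Face Hub via the Python API.
# The filename is a glob matched against the GGUF files in the repo.
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q8_0.gguf",
    verbose=False,
)

print(llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hello!"}],
)["choices"][0]["message"]["content"])
```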
@@ -120,9 +120,20 @@ class LlamaProxy:
                         kv_overrides[key] = float(value)
                     else:
                         raise ValueError(f"Unknown value type {value_type}")
 
-        _model = llama_cpp.Llama(
-            model_path=settings.model,
+        import functools
+
+        kwargs = {}
+
+        if settings.hf_model_repo_id is not None:
+            create_fn = functools.partial(llama_cpp.Llama.from_pretrained, repo_id=settings.hf_model_repo_id, filename=settings.model)
+        else:
+            create_fn = llama_cpp.Llama
+            kwargs["model_path"] = settings.model
+
+
+        _model = create_fn(
+            **kwargs,
             # Model Params
             n_gpu_layers=settings.n_gpu_layers,
             main_gpu=settings.main_gpu,
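The interesting part of the hunk above is the `functools.partial` indirection: the Hub coordinates (`repo_id`, `filename`) are bound early, while the shared model parameters are still passed at the single `create_fn(**kwargs, ...)` call site, so both load paths go through the same construction code. A toy sketch of that pattern (`build`, `hf_repo`, and `local_path` are made-up stand-ins, not `llama_cpp` names):

```python
# Toy illustration of binding source-specific arguments with functools.partial
# while keeping one call site for the shared parameters.
import functools


def build(source: str, *, n_gpu_layers: int = 0, verbose: bool = True) -> dict:
    """Stand-in for a model constructor."""
    return {"source": source, "n_gpu_layers": n_gpu_layers, "verbose": verbose}


hf_repo = "Qwen/Qwen1.5-0.5B-Chat-GGUF"  # pretend this came from settings
local_path = "models/7B/llama-model.gguf"

kwargs = {}
if hf_repo is not None:
    # Hub path: bind the source-specific argument up front.
    create_fn = functools.partial(build, f"hf://{hf_repo}")
else:
    # Local path: use the plain constructor and pass the path later.
    create_fn = build
    kwargs["source"] = local_path

# Either way, the shared parameters are supplied exactly once.
model = create_fn(**kwargs, n_gpu_layers=35, verbose=False)
print(model)
```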
@@ -143,6 +143,11 @@ class ModelSettings(BaseSettings):
         default=None,
         description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().",
     )
+    # Loading from HuggingFace Model Hub
+    hf_model_repo_id: Optional[str] = Field(
+        default=None,
+        description="The model repo id to use for the HuggingFace tokenizer model.",
+    )
     # Speculative Decoding
     draft_model: Optional[str] = Field(
         default=None,
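For context, `ModelSettings` extends pydantic's `BaseSettings`, so the new field sits alongside the existing ones. A minimal, self-contained sketch of that declaration shape, assuming the pydantic-settings package; only the field names mirror the real class, and `TinyModelSettings` is hypothetical:

```python
# Minimal sketch of declaring an optional Hub repo id on a settings model.
from typing import Optional

from pydantic import Field
from pydantic_settings import BaseSettings


class TinyModelSettings(BaseSettings):
    model: str = Field(
        description="Path to the model, or a filename glob when pulling from the Hub."
    )
    hf_model_repo_id: Optional[str] = Field(
        default=None,
        description="The model repo id to pull the model from.",
    )


# Local file vs. Hub pull: the same `model` field doubles as the filename glob.
local = TinyModelSettings(model="models/7B/llama-model.gguf")
hub = TinyModelSettings(model="*q8_0.gguf", hf_model_repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF")
print(local.hf_model_repo_id, hub.hf_model_repo_id)
```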