Merge pull request #522 from bretello/llama2-70b-support
Llama2 70b support
commit e4431a6ade
2 changed files with 11 additions and 0 deletions
README.md
@@ -135,6 +135,14 @@ For instance, if you want to work with larger contexts, you can expand the context window
 llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
 ```
 
+### Loading llama-2 70b
+
+Llama2 70b models must be loaded with the `n_gqa` parameter (grouped-query attention factor) set to 8:
+
+```python
+llm = Llama(model_path="./models/70B/ggml-model.bin", n_gqa=8)
+```
+
 ## Web Server
 
 `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API.
llama_cpp/llama.py
@@ -216,6 +216,7 @@ class Llama:
         embedding: bool = False,
         n_threads: Optional[int] = None,
         n_batch: int = 512,
+        n_gqa: Optional[int] = None,  # must be 8 for llama2 70b
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
@@ -260,6 +261,8 @@ class Llama:
 
         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
+        if n_gqa is not None:
+            self.params.n_gqa = n_gqa
         self.params.n_gpu_layers = n_gpu_layers
         self.params.seed = seed
         self.params.f16_kv = f16_kv