diff --git a/README.md b/README.md index 363339e..0e6f218 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,14 @@ For instance, if you want to work with larger contexts, you can expand the conte llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048) ``` +### Loading llama-2 70b + +When loading Llama-2 70b you must set the `n_gqa` parameter (grouped-query attention factor) to 8: + +```python +llm = Llama(model_path="./models/70B/ggml-model.bin", n_gqa=8) +``` + ## Web Server `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 5439b1d..251d064 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -216,6 +216,7 @@ class Llama: embedding: bool = False, n_threads: Optional[int] = None, n_batch: int = 512, + n_gqa: Optional[int] = None, # must be 8 for llama2 70b last_n_tokens_size: int = 64, lora_base: Optional[str] = None, lora_path: Optional[str] = None, @@ -260,6 +261,8 @@ class Llama: self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx + if n_gqa is not None: + self.params.n_gqa = n_gqa self.params.n_gpu_layers = n_gpu_layers self.params.seed = seed self.params.f16_kv = f16_kv