From 0f09f10e8c30a39dc25beda589ab81f016d686a4 Mon Sep 17 00:00:00 2001
From: bretello
Date: Mon, 24 Jul 2023 15:51:19 +0200
Subject: [PATCH 1/4] add support for llama2 70b

---
 README.md          | 8 ++++++++
 llama_cpp/llama.py | 3 +++
 2 files changed, 11 insertions(+)

diff --git a/README.md b/README.md
index 363339e..0e6f218 100644
--- a/README.md
+++ b/README.md
@@ -135,6 +135,14 @@ For instance, if you want to work with larger contexts, you can expand the conte
 llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
 ```
 
+### Loading Llama 2 70B
+
+Llama 2 70B models require the `n_gqa` parameter (grouped-query attention factor) to be set to 8 when loading:
+
+```python
+llm = Llama(model_path="./models/70B/ggml-model.bin", n_gqa=8)
+```
+
 ## Web Server
 
 `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API.
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 5439b1d..251d064 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -216,6 +216,7 @@ class Llama:
         embedding: bool = False,
         n_threads: Optional[int] = None,
         n_batch: int = 512,
+        n_gqa: Optional[int] = None, # must be 8 for llama2 70b
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
@@ -260,6 +261,8 @@ class Llama:
 
         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
+        if n_gqa is not None:
+            self.params.n_gqa = n_gqa
         self.params.n_gpu_layers = n_gpu_layers
         self.params.seed = seed
         self.params.f16_kv = f16_kv

From 8cd64d4ac30b147d8e831a09f781491008f33f82 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 24 Jul 2023 13:52:12 -0400
Subject: [PATCH 2/4] Add rms_eps_norm

---
 llama_cpp/llama.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 251d064..7ca7af0 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -216,7 +216,6 @@ class Llama:
         embedding: bool = False,
         n_threads: Optional[int] = None,
         n_batch: int = 512,
-        n_gqa: Optional[int] = None, # must be 8 for llama2 70b
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
@@ -224,6 +223,8 @@ class Llama:
         tensor_split: Optional[List[float]] = None,
         rope_freq_base: float = 10000.0,
         rope_freq_scale: float = 1.0,
+        n_gqa: Optional[int] = None, # (TEMPORARY) must be 8 for llama2 70b
+        rms_eps_norm: Optional[float] = None, # (TEMPORARY)
         verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.
@@ -261,8 +262,6 @@ class Llama:
 
         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
-        if n_gqa is not None:
-            self.params.n_gqa = n_gqa
         self.params.n_gpu_layers = n_gpu_layers
         self.params.seed = seed
         self.params.f16_kv = f16_kv
@@ -285,6 +284,12 @@ class Llama:
         self.params.rope_freq_base = rope_freq_base
         self.params.rope_freq_scale = rope_freq_scale
 
+        if n_gqa is not None:
+            self.params.n_gqa = n_gqa
+
+        if rms_eps_norm is not None:
+            self.params.rms_eps_norm = rms_eps_norm
+
         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
 
@@ -1526,6 +1531,10 @@ class Llama:
             lora_base=self.lora_base,
             lora_path=self.lora_path,
             tensor_split=self.tensor_split,
+            ### TEMPORARY ###
+            n_gqa=self.params.n_gqa,
+            rms_eps_norm=self.params.rms_eps_norm,
+            ### TEMPORARY ###
             ### DEPRECATED ###
             n_parts=self.n_parts,
             ### DEPRECATED ###
@@ -1535,7 +1544,6 @@ class Llama:
         self.__init__(
             model_path=state["model_path"],
             n_ctx=state["n_ctx"],
-            n_parts=state["n_parts"],
             n_gpu_layers=state["n_gpu_layers"],
             seed=state["seed"],
             f16_kv=state["f16_kv"],
@@ -1551,7 +1559,14 @@ class Llama:
             lora_base=state["lora_base"],
             lora_path=state["lora_path"],
             tensor_split=state["tensor_split"],
+            n_gqa=state["n_gqa"],
+            ### TEMPORARY ###
+            rms_eps_norm=state["rms_eps_norm"],
             verbose=state["verbose"],
+            ### TEMPORARY ###
+            ### DEPRECATED ###
+            n_parts=state["n_parts"],
+            ### DEPRECATED ###
         )
 
     def save_state(self) -> LlamaState:

From 11dd2bf3829896b00d7af1121d19e60c03385987 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 24 Jul 2023 14:09:24 -0400
Subject: [PATCH 3/4] Add temporary rms_norm_eps parameter

---
 llama_cpp/llama.py      | 14 +++++++-------
 llama_cpp/server/app.py | 10 ++++++++++
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 7ca7af0..9679b2e 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -224,7 +224,7 @@ class Llama:
         rope_freq_base: float = 10000.0,
         rope_freq_scale: float = 1.0,
         n_gqa: Optional[int] = None, # (TEMPORARY) must be 8 for llama2 70b
-        rms_eps_norm: Optional[float] = None, # (TEMPORARY)
+        rms_norm_eps: Optional[float] = None, # (TEMPORARY)
         verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.
@@ -287,8 +287,8 @@ class Llama:
         if n_gqa is not None:
             self.params.n_gqa = n_gqa
 
-        if rms_eps_norm is not None:
-            self.params.rms_eps_norm = rms_eps_norm
+        if rms_norm_eps is not None:
+            self.params.rms_norm_eps = rms_norm_eps
 
         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
@@ -1533,7 +1533,7 @@ class Llama:
             tensor_split=self.tensor_split,
             ### TEMPORARY ###
             n_gqa=self.params.n_gqa,
-            rms_eps_norm=self.params.rms_eps_norm,
+            rms_norm_eps=self.params.rms_norm_eps,
             ### TEMPORARY ###
             ### DEPRECATED ###
             n_parts=self.n_parts,
@@ -1559,11 +1559,11 @@ class Llama:
             lora_base=state["lora_base"],
             lora_path=state["lora_path"],
             tensor_split=state["tensor_split"],
-            n_gqa=state["n_gqa"],
-            ### TEMPORARY ###
-            rms_eps_norm=state["rms_eps_norm"],
             verbose=state["verbose"],
             ### TEMPORARY ###
+            n_gqa=state["n_gqa"],
+            rms_norm_eps=state["rms_norm_eps"],
+            ### TEMPORARY ###
             ### DEPRECATED ###
             n_parts=state["n_parts"],
             ### DEPRECATED ###
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index ba68ba8..4afcfd5 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -95,6 +95,14 @@ class Settings(BaseSettings):
         default=True,
         description="Whether to interrupt requests when a new request is received.",
     )
+    n_gqa: Optional[int] = Field(
+        default=None,
+        description="TEMPORARY: Set to 8 for Llama2 70B",
+    )
+    rms_norm_eps: Optional[float] = Field(
+        default=None,
+        description="TEMPORARY",
+    )
 
 
 class ErrorResponse(TypedDict):
@@ -320,6 +328,8 @@ def create_app(settings: Optional[Settings] = None):
         last_n_tokens_size=settings.last_n_tokens_size,
         vocab_only=settings.vocab_only,
         verbose=settings.verbose,
+        n_gqa=settings.n_gqa,
+        rms_norm_eps=settings.rms_norm_eps,
     )
     if settings.cache:
         if settings.cache_type == "disk":

From c7c700b0d4900f22f5990d565adc67d0b0af3b22 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 24 Jul 2023 14:11:21 -0400
Subject: [PATCH 4/4] Bump version

---
 CHANGELOG.md   | 5 +++++
 pyproject.toml | 2 +-
 setup.py       | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 56db2b3..9ca220e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.77]
+
+- (llama.cpp) Update llama.cpp to add support for LLaMa 2 70B
+- (server) Add temporary n_gqa and rms_norm_eps parameters required for LLaMa 2 70B
+
 ## [0.1.76]
 
 - (llama.cpp) Update llama.cpp add support for LLaMa 2 70B
diff --git a/pyproject.toml b/pyproject.toml
index 5861af8..a7a3b35 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp_python"
-version = "0.1.76"
+version = "0.1.77"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen "]
 license = "MIT"
diff --git a/setup.py b/setup.py
index 5ea2781..8e6139d 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
     description="A Python wrapper for llama.cpp",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="0.1.76",
+    version="0.1.77",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",
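
Note: below is a minimal usage sketch of the parameters introduced by this series. The model path is illustrative, and the `rms_norm_eps` override is optional (the value shown is only an example, not a setting required by these patches):

```python
from llama_cpp import Llama

# Llama 2 70B uses grouped-query attention, so the temporary n_gqa
# parameter must be set to 8. rms_norm_eps may optionally be overridden;
# the value below is illustrative only.
llm = Llama(
    model_path="./models/70B/ggml-model.bin",  # illustrative path
    n_gqa=8,
    rms_norm_eps=5e-6,
)

output = llm("Q: Name the planets in the solar system. A: ", max_tokens=32)
print(output["choices"][0]["text"])
```

Both parameters are marked TEMPORARY in the patches above, so they may change or be removed in later releases.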