diff --git a/CHANGELOG.md b/CHANGELOG.md
index 56db2b3..9ca220e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.77]
+
+- (llama.cpp) Update llama.cpp to add support for LLaMa 2 70B
+- (server) Add temporary n_gqa and rms_norm_eps parameters required for LLaMa 2 70B
+
 ## [0.1.76]
 
 - (llama.cpp) Update llama.cpp add support for LLaMa 2 70B
diff --git a/README.md b/README.md
index f61be2f..639d261 100644
--- a/README.md
+++ b/README.md
@@ -135,6 +135,14 @@ For instance, if you want to work with larger contexts, you can expand the conte
 llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
 ```
 
+### Loading Llama 2 70B
+
+Llama 2 70B requires the `n_gqa` parameter (grouped-query attention factor) to be set to 8 when loading:
+
+```python
+llm = Llama(model_path="./models/70B/ggml-model.bin", n_gqa=8)
+```
+
 ## Web Server
 
 `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API.
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 0178458..94ab8c5 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -230,6 +230,8 @@ class Llama:
         tensor_split: Optional[List[float]] = None,
         rope_freq_base: float = 10000.0,
         rope_freq_scale: float = 1.0,
+        n_gqa: Optional[int] = None,  # (TEMPORARY) must be 8 for llama2 70b
+        rms_norm_eps: Optional[float] = None,  # (TEMPORARY)
         verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.
@@ -291,6 +293,12 @@
         self.params.rope_freq_base = rope_freq_base
         self.params.rope_freq_scale = rope_freq_scale
 
+        if n_gqa is not None:
+            self.params.n_gqa = n_gqa
+
+        if rms_norm_eps is not None:
+            self.params.rms_norm_eps = rms_norm_eps
+
         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
 
@@ -1530,6 +1538,10 @@
             lora_base=self.lora_base,
             lora_path=self.lora_path,
             tensor_split=self.tensor_split,
+            ### TEMPORARY ###
+            n_gqa=self.params.n_gqa,
+            rms_norm_eps=self.params.rms_norm_eps,
+            ### TEMPORARY ###
             ### DEPRECATED ###
             n_parts=self.n_parts,
             ### DEPRECATED ###
@@ -1539,7 +1551,6 @@
         self.__init__(
             model_path=state["model_path"],
             n_ctx=state["n_ctx"],
-            n_parts=state["n_parts"],
             n_gpu_layers=state["n_gpu_layers"],
             seed=state["seed"],
             f16_kv=state["f16_kv"],
@@ -1556,6 +1567,13 @@
             lora_path=state["lora_path"],
             tensor_split=state["tensor_split"],
             verbose=state["verbose"],
+            ### TEMPORARY ###
+            n_gqa=state["n_gqa"],
+            rms_norm_eps=state["rms_norm_eps"],
+            ### TEMPORARY ###
+            ### DEPRECATED ###
+            n_parts=state["n_parts"],
+            ### DEPRECATED ###
         )
 
     def save_state(self) -> LlamaState:
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 283a9ad..58b5551 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -100,6 +100,14 @@ class Settings(BaseSettings):
         default=True,
         description="Whether to interrupt requests when a new request is received.",
     )
+    n_gqa: Optional[int] = Field(
+        default=None,
+        description="TEMPORARY: Set to 8 for Llama2 70B",
+    )
+    rms_norm_eps: Optional[float] = Field(
+        default=None,
+        description="TEMPORARY",
+    )
 
 
 class ErrorResponse(TypedDict):
@@ -325,6 +333,8 @@ def create_app(settings: Optional[Settings] = None):
         last_n_tokens_size=settings.last_n_tokens_size,
         vocab_only=settings.vocab_only,
         verbose=settings.verbose,
+        n_gqa=settings.n_gqa,
+        rms_norm_eps=settings.rms_norm_eps,
     )
     if settings.cache:
         if settings.cache_type == "disk":
diff --git a/pyproject.toml b/pyproject.toml
index 672a6f6..6272bb9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "llama_cpp_python"
-version = "0.1.76"
+version = "0.1.77"
 description = "Python bindings for the llama.cpp library"
 readme = "README.md"
 license = { text = "MIT" }
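For reference, a minimal usage sketch of the two temporary parameters this patch introduces. The model path is a placeholder, and the `rms_norm_eps` value shown is only illustrative; it can be omitted entirely to keep llama.cpp's default.

```python
# Sketch only: loading LLaMa 2 70B with the temporary parameters added in 0.1.77.
from llama_cpp import Llama

llm = Llama(
    model_path="./models/70B/ggml-model.bin",  # placeholder path, adjust to your local ggml file
    n_ctx=2048,
    n_gqa=8,            # grouped-query attention factor; must be 8 for LLaMa 2 70B
    rms_norm_eps=1e-5,  # illustrative override; omit to use llama.cpp's default
)

output = llm("Q: Name the planets in the solar system. A:", max_tokens=32)
print(output["choices"][0]["text"])
```

On the server side, the same values are exposed through the new `n_gqa` and `rms_norm_eps` fields of `Settings` in `llama_cpp/server/app.py`, which `create_app` passes straight into the `Llama` constructor.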