Merge branch 'main' into v0.2-wip
Commit 343480364f
5 changed files with 43 additions and 2 deletions
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.77]
+
+- (llama.cpp) Update llama.cpp add support for LLaMa 2 70B
+- (server) Add temporary n_gqa and rms_norm_eps parameters required for LLaMa 2 70B
+
 ## [0.1.76]
 
 - (llama.cpp) Update llama.cpp add support for LLaMa 2 70B
@@ -135,6 +135,14 @@ For instance, if you want to work with larger contexts, you can expand the conte
 llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
 ```
 
+### Loading llama-2 70b
+
+Llama2 70b must set the `n_gqa` parameter (grouped-query attention factor) to 8 when loading:
+
+```python
+llm = Llama(model_path="./models/7B/ggml-model.bin", n_gqa=8)
+```
+
 ## Web Server
 
 `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API.
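The changelog above lists `rms_norm_eps` alongside `n_gqa` as temporary parameters required for LLaMa 2 70B, while the README snippet only sets `n_gqa`. A minimal sketch of passing both through the Python API follows; the model path and the `1e-5` value are placeholder assumptions, not something this diff specifies:

```python
from llama_cpp import Llama

# Both temporary parameters together; only n_gqa=8 is explicitly
# documented in this change, the rest are illustrative placeholders.
llm = Llama(
    model_path="./models/70B/ggml-model.bin",  # hypothetical 70B model path
    n_gqa=8,            # grouped-query attention factor for LLaMa 2 70B
    rms_norm_eps=1e-5,  # assumed value; the diff only adds the parameter
)
```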
@@ -230,6 +230,8 @@ class Llama:
         tensor_split: Optional[List[float]] = None,
         rope_freq_base: float = 10000.0,
         rope_freq_scale: float = 1.0,
+        n_gqa: Optional[int] = None,  # (TEMPORARY) must be 8 for llama2 70b
+        rms_norm_eps: Optional[float] = None,  # (TEMPORARY)
         verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.
@@ -291,6 +293,12 @@ class Llama:
         self.params.rope_freq_base = rope_freq_base
         self.params.rope_freq_scale = rope_freq_scale
 
+        if n_gqa is not None:
+            self.params.n_gqa = n_gqa
+
+        if rms_norm_eps is not None:
+            self.params.rms_norm_eps = rms_norm_eps
+
         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
 
@@ -1530,6 +1538,10 @@ class Llama:
             lora_base=self.lora_base,
             lora_path=self.lora_path,
             tensor_split=self.tensor_split,
+            ### TEMPORARY ###
+            n_gqa=self.params.n_gqa,
+            rms_norm_eps=self.params.rms_norm_eps,
+            ### TEMPORARY ###
             ### DEPRECATED ###
             n_parts=self.n_parts,
             ### DEPRECATED ###
@@ -1539,7 +1551,6 @@ class Llama:
         self.__init__(
             model_path=state["model_path"],
             n_ctx=state["n_ctx"],
-            n_parts=state["n_parts"],
             n_gpu_layers=state["n_gpu_layers"],
             seed=state["seed"],
             f16_kv=state["f16_kv"],
@@ -1556,6 +1567,13 @@ class Llama:
             lora_path=state["lora_path"],
             tensor_split=state["tensor_split"],
             verbose=state["verbose"],
+            ### TEMPORARY ###
+            n_gqa=state["n_gqa"],
+            rms_norm_eps=state["rms_norm_eps"],
+            ### TEMPORARY ###
+            ### DEPRECATED ###
+            n_parts=state["n_parts"],
+            ### DEPRECATED ###
         )
 
     def save_state(self) -> LlamaState:
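The `__getstate__`/`__setstate__` hunks above forward the temporary parameters through the saved state, so a serialized `Llama` instance keeps them when it is rebuilt. A minimal round-trip sketch, assuming a local GGML model and enough memory to load it twice, since restoring the state re-runs `__init__`:

```python
import pickle

from llama_cpp import Llama

# Hypothetical model path; n_gqa=8 is the only value this change documents.
llm = Llama(model_path="./models/70B/ggml-model.bin", n_gqa=8)

# pickle.loads goes through __setstate__, which now passes n_gqa and
# rms_norm_eps from the saved state dict back into __init__.
restored = pickle.loads(pickle.dumps(llm))
```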
@@ -100,6 +100,14 @@ class Settings(BaseSettings):
         default=True,
         description="Whether to interrupt requests when a new request is received.",
     )
+    n_gqa: Optional[int] = Field(
+        default=None,
+        description="TEMPORARY: Set to 8 for Llama2 70B",
+    )
+    rms_norm_eps: Optional[float] = Field(
+        default=None,
+        description="TEMPORARY",
+    )
 
 
 class ErrorResponse(TypedDict):
@@ -325,6 +333,8 @@ def create_app(settings: Optional[Settings] = None):
         last_n_tokens_size=settings.last_n_tokens_size,
         vocab_only=settings.vocab_only,
         verbose=settings.verbose,
+        n_gqa=settings.n_gqa,
+        rms_norm_eps=settings.rms_norm_eps,
     )
     if settings.cache:
         if settings.cache_type == "disk":
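On the server side, the new `Settings` fields are passed straight into the `Llama` constructor inside `create_app`. A minimal sketch of embedding the server with them set, assuming `uvicorn` is available and that `Settings` exposes a `model` path field as in `llama_cpp.server.app`; all values are placeholders:

```python
import uvicorn

from llama_cpp.server.app import Settings, create_app

settings = Settings(
    model="./models/70B/ggml-model.bin",  # hypothetical model path
    n_gqa=8,            # TEMPORARY: set to 8 for LLaMa 2 70B
    rms_norm_eps=1e-5,  # TEMPORARY: assumed value, not specified by this diff
)

app = create_app(settings=settings)
uvicorn.run(app, host="127.0.0.1", port=8000)
```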
@@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "llama_cpp_python"
-version = "0.1.76"
+version = "0.1.77"
 description = "Python bindings for the llama.cpp library"
 readme = "README.md"
 license = { text = "MIT" }