Offload KQV by default

2024-01-18 11:08:57 -05:00 · 2024-01-18 11:08:57 -05:00 · 48c3b77e6f
commit 48c3b77e6f
parent 6bfe98bd80
2 changed files with 2 additions and 2 deletions
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -77,7 +77,7 @@ class Llama:
        mul_mat_q: bool = True,
        logits_all: bool = False,
        embedding: bool = False,
-        offload_kqv: bool = False,
+        offload_kqv: bool = True,
        # Sampling Params
        last_n_tokens_size: int = 64,
        # LoRA Params
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -90,7 +90,7 @@ class ModelSettings(BaseSettings):
    logits_all: bool = Field(default=True, description="Whether to return logits.")
    embedding: bool = Field(default=True, description="Whether to use embeddings.")
    offload_kqv: bool = Field(
-        default=False, description="Whether to offload kqv to the GPU."
+        default=True, description="Whether to offload kqv to the GPU."
    )
    # Sampling Params
    last_n_tokens_size: int = Field(