From 48c3b77e6f558a9899de0e1155c7dc0c7958d8e8 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Thu, 18 Jan 2024 11:08:57 -0500
Subject: [PATCH] Offload KQV by default

---
 llama_cpp/llama.py           | 2 +-
 llama_cpp/server/settings.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 25abf36..6cdc1eb 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -77,7 +77,7 @@ class Llama:
         mul_mat_q: bool = True,
         logits_all: bool = False,
         embedding: bool = False,
-        offload_kqv: bool = False,
+        offload_kqv: bool = True,
         # Sampling Params
         last_n_tokens_size: int = 64,
         # LoRA Params
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index a10390c..dc5be20 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -90,7 +90,7 @@ class ModelSettings(BaseSettings):
     logits_all: bool = Field(default=True, description="Whether to return logits.")
     embedding: bool = Field(default=True, description="Whether to use embeddings.")
     offload_kqv: bool = Field(
-        default=False, description="Whether to offload kqv to the GPU."
+        default=True, description="Whether to offload kqv to the GPU."
     )
     # Sampling Params
     last_n_tokens_size: int = Field(
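
Note for downstream users: this patch flips offload_kqv so the K/Q/V attention
tensors (the KV cache) are kept on the GPU by default when layers are offloaded.
A minimal sketch of opting back out after upgrading, using the public Llama
constructor from the patched file above; the model path is a hypothetical
placeholder:

    from llama_cpp import Llama

    # offload_kqv now defaults to True; pass False explicitly to keep
    # the KV cache in system RAM (the pre-patch behavior).
    llm = Llama(
        model_path="./models/model.gguf",  # hypothetical path
        n_gpu_layers=-1,                   # offload all layers to the GPU
        offload_kqv=False,                 # restore the old default
    )

The server-side ModelSettings field changed in the same way, so the equivalent
override there is setting offload_kqv to false through the usual settings
mechanism (pydantic BaseSettings, e.g. an environment variable or config entry).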