Add offload_kqv option to llama and server

This commit is contained in:
Andrei Betlen 2023-12-18 15:36:09 -05:00
parent 472b344ae3
commit 095c650006
2 changed files with 7 additions and 0 deletions

View file

@@ -752,6 +752,7 @@ class Llama:
mul_mat_q: bool = True, mul_mat_q: bool = True,
logits_all: bool = False, logits_all: bool = False,
embedding: bool = False, embedding: bool = False,
offload_kqv: bool = False,
# Sampling Params # Sampling Params
last_n_tokens_size: int = 64, last_n_tokens_size: int = 64,
# LoRA Params # LoRA Params
@@ -817,6 +818,7 @@ class Llama:
yarn_orig_ctx: YaRN original context size yarn_orig_ctx: YaRN original context size
logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs. logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
embedding: Embedding mode only. embedding: Embedding mode only.
offload_kqv: Offload K, Q, V to GPU.
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
lora_path: Path to a LoRA file to apply to the model. lora_path: Path to a LoRA file to apply to the model.
@@ -903,6 +905,7 @@ class Llama:
self.context_params.mul_mat_q = mul_mat_q self.context_params.mul_mat_q = mul_mat_q
self.context_params.logits_all = logits_all self.context_params.logits_all = logits_all
self.context_params.embedding = embedding self.context_params.embedding = embedding
self.context_params.offload_kqv = offload_kqv
# Sampling Params # Sampling Params
self.last_n_tokens_size = last_n_tokens_size self.last_n_tokens_size = last_n_tokens_size

View file

@@ -100,6 +100,9 @@ class Settings(BaseSettings):
) )
logits_all: bool = Field(default=True, description="Whether to return logits.") logits_all: bool = Field(default=True, description="Whether to return logits.")
embedding: bool = Field(default=True, description="Whether to use embeddings.") embedding: bool = Field(default=True, description="Whether to use embeddings.")
offload_kqv: bool = Field(
default=False, description="Whether to offload kqv to the GPU."
)
# Sampling Params # Sampling Params
last_n_tokens_size: int = Field( last_n_tokens_size: int = Field(
default=64, default=64,
@@ -409,6 +412,7 @@ def create_app(settings: Optional[Settings] = None):
mul_mat_q=settings.mul_mat_q, mul_mat_q=settings.mul_mat_q,
logits_all=settings.logits_all, logits_all=settings.logits_all,
embedding=settings.embedding, embedding=settings.embedding,
offload_kqv=settings.offload_kqv,
# Sampling Params # Sampling Params
last_n_tokens_size=settings.last_n_tokens_size, last_n_tokens_size=settings.last_n_tokens_size,
# LoRA Params # LoRA Params