From 2138561fab5e60672c63b6c446b62a8bc26e17c4 Mon Sep 17 00:00:00 2001
From: Daniel Thuerck
Date: Fri, 3 May 2024 18:17:07 +0200
Subject: [PATCH] fix(server): Propagate `flash_attn` to model load. (#1424)

---
 llama_cpp/server/model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index e102fad..f002924 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -242,6 +242,7 @@ class LlamaProxy:
             logits_all=settings.logits_all,
             embedding=settings.embedding,
             offload_kqv=settings.offload_kqv,
+            flash_attn=settings.flash_attn,
             # Sampling Params
             last_n_tokens_size=settings.last_n_tokens_size,
             # LoRA Params
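
Note (not part of the patch): the one-line change forwards the server's
flash_attn setting into the model constructor, so flash attention actually
takes effect when the server loads a model. Below is a minimal sketch of the
equivalent direct usage of llama_cpp.Llama with flash attention enabled; the
model path and parameter values are illustrative assumptions, not taken from
the patch.

    from llama_cpp import Llama

    # Hypothetical model path; offload settings chosen for illustration.
    llm = Llama(
        model_path="./models/model.gguf",
        n_gpu_layers=-1,      # offload all layers to the GPU
        offload_kqv=True,     # keep the KV cache on the GPU
        flash_attn=True,      # the setting this patch propagates server-side
    )

    output = llm("Q: What does flash attention change? A:", max_tokens=64)
    print(output["choices"][0]["text"])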