From a50a87a7b87edc9529c9b235f33b37a7cfa30daa Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Thu, 30 May 2024 16:58:01 -0700 Subject: [PATCH] partial offloading: allow flash attention and disable mmap (#4734) * partial offloading: allow flash attention and disable mmap * allow mmap with num_gpu=0 --- llm/server.go | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/llm/server.go b/llm/server.go index 9b5d0f06..97aa2a15 100644 --- a/llm/server.go +++ b/llm/server.go @@ -191,35 +191,38 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr params = append(params, "--memory-f32") } - if opts.UseMLock { - params = append(params, "--mlock") + flashAttnEnabled := envconfig.FlashAttention + + for _, g := range gpus { + // only cuda (compute capability 7+) and metal support flash attention + if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) { + flashAttnEnabled = false + } + + // mmap has issues with partial offloading on metal + if g.Library == "metal" && + uint64(opts.NumGPU) > 0 && + uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 { + opts.UseMMap = false + } + } + + if flashAttnEnabled { + params = append(params, "--flash-attn") } if !opts.UseMMap { params = append(params, "--no-mmap") } + if opts.UseMLock { + params = append(params, "--mlock") + } + if opts.UseNUMA { params = append(params, "--numa") } - flashAttnEnabled := envconfig.FlashAttention - - // partial offloading does not support flash attention - if uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 { - flashAttnEnabled = false - } - - // only cuda (compute capability 7+) and metal support flash attention - for _, g := range gpus { - if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) { - flashAttnEnabled = false - } - } - if flashAttnEnabled { - params = append(params, "--flash-attn") - } - numParallel := envconfig.NumParallel // TODO (jmorganca): multimodal models don't support parallel yet