From a50a87a7b87edc9529c9b235f33b37a7cfa30daa Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Thu, 30 May 2024 16:58:01 -0700
Subject: [PATCH] partial offloading: allow flash attention and disable mmap
 (#4734)

* partial offloading: allow flash attention and disable mmap

* allow mmap with num_gpu=0
---
 llm/server.go | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/llm/server.go b/llm/server.go
index 9b5d0f06..97aa2a15 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -191,35 +191,38 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--memory-f32")
 	}
 
-	if opts.UseMLock {
-		params = append(params, "--mlock")
+	flashAttnEnabled := envconfig.FlashAttention
+
+	for _, g := range gpus {
+		// only metal and cuda support flash attention; cuda is gated on g.DriverMajor >= 7 (NOTE(review): presumably meant as compute capability 7+ — confirm the field)
+		if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
+			flashAttnEnabled = false
+		}
+
+		// mmap has issues with partial offloading on metal
+		if g.Library == "metal" &&
+			uint64(opts.NumGPU) > 0 &&
+			uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
+			opts.UseMMap = false
+		}
+	}
+
+	if flashAttnEnabled {
+		params = append(params, "--flash-attn")
 	}
 
 	if !opts.UseMMap {
 		params = append(params, "--no-mmap")
 	}
 
+	if opts.UseMLock {
+		params = append(params, "--mlock")
+	}
+
 	if opts.UseNUMA {
 		params = append(params, "--numa")
 	}
 
-	flashAttnEnabled := envconfig.FlashAttention
-
-	// partial offloading does not support flash attention
-	if uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
-		flashAttnEnabled = false
-	}
-
-	// only cuda (compute capability 7+) and metal support flash attention
-	for _, g := range gpus {
-		if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
-			flashAttnEnabled = false
-		}
-	}
-	if flashAttnEnabled {
-		params = append(params, "--flash-attn")
-	}
-
 	numParallel := envconfig.NumParallel
 
 	// TODO (jmorganca): multimodal models don't support parallel yet