partial offloading: allow flash attention and disable mmap (#4734)

* partial offloading: allow flash attention and disable mmap

* allow mmap with num_gpu=0
Author: Jeffrey Morgan, 2024-05-30 16:58:01 -07:00 (committed by GitHub)
Parent: e91d0ef737
Commit: a50a87a7b8

llm/server.go

@@ -191,35 +191,38 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         params = append(params, "--memory-f32")
     }

-    if opts.UseMLock {
-        params = append(params, "--mlock")
+    flashAttnEnabled := envconfig.FlashAttention
+
+    for _, g := range gpus {
+        // only cuda (compute capability 7+) and metal support flash attention
+        if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
+            flashAttnEnabled = false
+        }
+
+        // mmap has issues with partial offloading on metal
+        if g.Library == "metal" &&
+            uint64(opts.NumGPU) > 0 &&
+            uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
+            opts.UseMMap = false
+        }
+    }
+
+    if flashAttnEnabled {
+        params = append(params, "--flash-attn")
     }

     if !opts.UseMMap {
         params = append(params, "--no-mmap")
     }

+    if opts.UseMLock {
+        params = append(params, "--mlock")
+    }
+
     if opts.UseNUMA {
         params = append(params, "--numa")
     }

-    flashAttnEnabled := envconfig.FlashAttention
-
-    // partial offloading does not support flash attention
-    if uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
-        flashAttnEnabled = false
-    }
-
-    // only cuda (compute capability 7+) and metal support flash attention
-    for _, g := range gpus {
-        if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
-            flashAttnEnabled = false
-        }
-    }
-
-    if flashAttnEnabled {
-        params = append(params, "--flash-attn")
-    }
-
     numParallel := envconfig.NumParallel

     // TODO (jmorganca): multimodal models don't support parallel yet
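
Read together, the diff does three things: it moves the flash-attention check ahead of the mmap/mlock flags, drops the old rule that any partial offload disables flash attention, and instead disables mmap on metal only when a nonzero number of layers short of the full model is offloaded. Below is a minimal, self-contained Go sketch of the two checks under simplified assumptions: the stand-in types and the helper names supportsFlashAttn and disableMMapForPartialMetal are hypothetical (in the real code this logic runs inline in NewLlamaServer over gpu.GpuInfoList and api.Options), and blockCount stands in for ggml.KV().BlockCount().

    package main

    import "fmt"

    // Hypothetical, simplified stand-ins for ollama's gpu.GpuInfo and
    // api.Options; the real structs carry many more fields.
    type GpuInfo struct {
        Library     string // e.g. "cuda", "metal", "rocm"
        DriverMajor int
    }

    type Options struct {
        NumGPU  int // number of layers to offload; 0 means CPU-only
        UseMMap bool
    }

    // supportsFlashAttn mirrors the per-GPU check in the diff: flash
    // attention stays enabled only if every GPU is metal, or cuda with
    // driver major version 7+.
    func supportsFlashAttn(gpus []GpuInfo) bool {
        for _, g := range gpus {
            if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
                return false
            }
        }
        return true
    }

    // disableMMapForPartialMetal mirrors the new mmap rule: on metal,
    // mmap is turned off only when some but not all layers are offloaded.
    func disableMMapForPartialMetal(gpus []GpuInfo, opts *Options, blockCount uint64) {
        for _, g := range gpus {
            if g.Library == "metal" &&
                uint64(opts.NumGPU) > 0 &&
                uint64(opts.NumGPU) < blockCount+1 {
                opts.UseMMap = false
            }
        }
    }

    func main() {
        metal := []GpuInfo{{Library: "metal"}}
        fmt.Println(supportsFlashAttn(metal)) // true

        // A 32-block model: full offload is 33 layers (blocks + output).
        full := Options{NumGPU: 33, UseMMap: true}
        disableMMapForPartialMetal(metal, &full, 32)
        fmt.Println(full.UseMMap) // true: fully offloaded, mmap stays on

        partial := Options{NumGPU: 10, UseMMap: true}
        disableMMapForPartialMetal(metal, &partial, 32)
        fmt.Println(partial.UseMMap) // false: partial offload disables mmap

        cpuOnly := Options{NumGPU: 0, UseMMap: true}
        disableMMapForPartialMetal(metal, &cpuOnly, 32)
        fmt.Println(cpuOnly.UseMMap) // true: num_gpu=0 keeps mmap enabled
    }

The blockCount+1 threshold treats a model as fully offloaded only when every repeating block plus one more layer (the non-repeating output layers) fits on the GPU; any NumGPU between 1 and that bound is a partial offload, while num_gpu=0, per the second commit, now leaves mmap untouched.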