partial offloading: allow flash attention and disable mmap (#4734)

* partial offloading: allow flash attention and disable mmap

* allow mmap with num_gpu=0
Author: Jeffrey Morgan, 2024-05-30 16:58:01 -07:00 (committed by GitHub)
Parent: e91d0ef737
Commit: a50a87a7b8

llm/server.go

@@ -191,35 +191,38 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         params = append(params, "--memory-f32")
     }

-    if opts.UseMLock {
-        params = append(params, "--mlock")
+    flashAttnEnabled := envconfig.FlashAttention
+
+    for _, g := range gpus {
+        // only cuda (compute capability 7+) and metal support flash attention
+        if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
+            flashAttnEnabled = false
+        }
+
+        // mmap has issues with partial offloading on metal
+        if g.Library == "metal" &&
+            uint64(opts.NumGPU) > 0 &&
+            uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
+            opts.UseMMap = false
+        }
+    }
+
+    if flashAttnEnabled {
+        params = append(params, "--flash-attn")
     }

     if !opts.UseMMap {
         params = append(params, "--no-mmap")
     }

+    if opts.UseMLock {
+        params = append(params, "--mlock")
+    }
+
     if opts.UseNUMA {
         params = append(params, "--numa")
     }

-    flashAttnEnabled := envconfig.FlashAttention
-
-    // partial offloading does not support flash attention
-    if uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
-        flashAttnEnabled = false
-    }
-
-    // only cuda (compute capability 7+) and metal support flash attention
-    for _, g := range gpus {
-        if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
-            flashAttnEnabled = false
-        }
-    }
-
-    if flashAttnEnabled {
-        params = append(params, "--flash-attn")
-    }
-
     numParallel := envconfig.NumParallel

     // TODO (jmorganca): multimodal models don't support parallel yet
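
Read together, the diff does three things: it moves the flash-attention check ahead of the mmap/mlock flags, drops the old rule that any partial offload disables flash attention, and instead disables mmap on metal only when a nonzero number of layers short of the full model is offloaded. Below is a minimal, self-contained Go sketch of the two checks under simplified assumptions: the stand-in types and the helper names supportsFlashAttn and disableMMapForPartialMetal are hypothetical (in the real code this logic runs inline in NewLlamaServer over gpu.GpuInfoList and api.Options), and blockCount stands in for ggml.KV().BlockCount().

    package main

    import "fmt"

    // Hypothetical, simplified stand-ins for ollama's gpu.GpuInfo and
    // api.Options; the real structs carry many more fields.
    type GpuInfo struct {
        Library     string // e.g. "cuda", "metal", "rocm"
        DriverMajor int
    }

    type Options struct {
        NumGPU  int // number of layers to offload; 0 means CPU-only
        UseMMap bool
    }

    // supportsFlashAttn mirrors the per-GPU check in the diff: flash
    // attention stays enabled only if every GPU is metal, or cuda with
    // driver major version 7+.
    func supportsFlashAttn(gpus []GpuInfo) bool {
        for _, g := range gpus {
            if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
                return false
            }
        }
        return true
    }

    // disableMMapForPartialMetal mirrors the new mmap rule: on metal,
    // mmap is turned off only when some but not all layers are offloaded.
    func disableMMapForPartialMetal(gpus []GpuInfo, opts *Options, blockCount uint64) {
        for _, g := range gpus {
            if g.Library == "metal" &&
                uint64(opts.NumGPU) > 0 &&
                uint64(opts.NumGPU) < blockCount+1 {
                opts.UseMMap = false
            }
        }
    }

    func main() {
        metal := []GpuInfo{{Library: "metal"}}
        fmt.Println(supportsFlashAttn(metal)) // true

        // A 32-block model: full offload is 33 layers (blocks + output).
        full := Options{NumGPU: 33, UseMMap: true}
        disableMMapForPartialMetal(metal, &full, 32)
        fmt.Println(full.UseMMap) // true: fully offloaded, mmap stays on

        partial := Options{NumGPU: 10, UseMMap: true}
        disableMMapForPartialMetal(metal, &partial, 32)
        fmt.Println(partial.UseMMap) // false: partial offload disables mmap

        cpuOnly := Options{NumGPU: 0, UseMMap: true}
        disableMMapForPartialMetal(metal, &cpuOnly, 32)
        fmt.Println(cpuOnly.UseMMap) // true: num_gpu=0 keeps mmap enabled
    }

The blockCount+1 threshold treats a model as fully offloaded only when every repeating block plus one more layer (the non-repeating output layers) fits on the GPU; any NumGPU between 1 and that bound is a partial offload, while num_gpu=0, per the second commit, now leaves mmap untouched.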