partial offloading: allow flash attention and disable mmap (#4734)
* partial offloading: allow flash attention and disable mmap
* allow mmap with num_gpu=0
This commit is contained in:
parent
e91d0ef737
commit
a50a87a7b8
1 changed file with 22 additions and 19 deletions
|
@@ -191,35 +191,38 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||||
params = append(params, "--memory-f32")
|
params = append(params, "--memory-f32")
|
||||||
}
|
}
|
||||||
|
|
||||||
if opts.UseMLock {
|
flashAttnEnabled := envconfig.FlashAttention
|
||||||
params = append(params, "--mlock")
|
|
||||||
|
for _, g := range gpus {
|
||||||
|
// only cuda (compute capability 7+) and metal support flash attention
|
||||||
|
if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
|
||||||
|
flashAttnEnabled = false
|
||||||
|
}
|
||||||
|
|
||||||
|
// mmap has issues with partial offloading on metal
|
||||||
|
if g.Library == "metal" &&
|
||||||
|
uint64(opts.NumGPU) > 0 &&
|
||||||
|
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
|
||||||
|
opts.UseMMap = false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if flashAttnEnabled {
|
||||||
|
params = append(params, "--flash-attn")
|
||||||
}
|
}
|
||||||
|
|
||||||
if !opts.UseMMap {
|
if !opts.UseMMap {
|
||||||
params = append(params, "--no-mmap")
|
params = append(params, "--no-mmap")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if opts.UseMLock {
|
||||||
|
params = append(params, "--mlock")
|
||||||
|
}
|
||||||
|
|
||||||
if opts.UseNUMA {
|
if opts.UseNUMA {
|
||||||
params = append(params, "--numa")
|
params = append(params, "--numa")
|
||||||
}
|
}
|
||||||
|
|
||||||
flashAttnEnabled := envconfig.FlashAttention
|
|
||||||
|
|
||||||
// partial offloading does not support flash attention
|
|
||||||
if uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
|
|
||||||
flashAttnEnabled = false
|
|
||||||
}
|
|
||||||
|
|
||||||
// only cuda (compute capability 7+) and metal support flash attention
|
|
||||||
for _, g := range gpus {
|
|
||||||
if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
|
|
||||||
flashAttnEnabled = false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if flashAttnEnabled {
|
|
||||||
params = append(params, "--flash-attn")
|
|
||||||
}
|
|
||||||
|
|
||||||
numParallel := envconfig.NumParallel
|
numParallel := envconfig.NumParallel
|
||||||
|
|
||||||
// TODO (jmorganca): multimodal models don't support parallel yet
|
// TODO (jmorganca): multimodal models don't support parallel yet
|
||||||
|
|
Loading…
Reference in a new issue