use runner if cuda alloc won't fit

This commit is contained in:
parent 6566387ae3
commit f387e9631b

1 changed file with 3 additions and 6 deletions
@@ -100,10 +100,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 			break
 		}
 
-		// This handles two cases:
-		// 1. overhead + tensors are always loaded into scratch memory even with num_gpu 0
-		// 2. it seems llama.cpp always tries to allocate the entire kv cache (even if later split into layers) into vram or crashes
-		if requiredAlloc > available || requiredKv > available {
+		// requiredAlloc is always loaded for the CUDA runner, so don't load it if it won't fit
+		if requiredAlloc > available {
 			log.Printf("not enough vram available, falling back to CPU only")
			library = "cpu"
 			opts.NumGPU = 0
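For readers skimming the first hunk: below is a minimal, self-contained sketch of the fallback check as it reads after this change. The pickLibrary helper, its parameters, and the example values are hypothetical illustrations; only the requiredAlloc-versus-available comparison, the log message, and the cpu/NumGPU fallback come from the diff above.

package main

import (
	"fmt"
	"log"
)

// pickLibrary is a hypothetical helper that mirrors the check in the hunk
// above: after this commit only requiredAlloc (the allocation the CUDA
// runner always makes) is compared against available VRAM; if it does not
// fit, the load falls back to the CPU runner with zero GPU layers.
func pickLibrary(library string, requiredAlloc, available uint64, numGPU *int) string {
	if library == "cuda" && requiredAlloc > available {
		log.Printf("not enough vram available, falling back to CPU only")
		library = "cpu"
		*numGPU = 0
	}
	return library
}

func main() {
	numGPU := 35
	// Hypothetical example: ~6 GiB required but only ~4 GiB of VRAM free.
	lib := pickLibrary("cuda", 6<<30, 4<<30, &numGPU)
	fmt.Println(lib, numGPU) // prints: cpu 0
}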
@@ -127,8 +125,7 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 	opts.NumGQA = 0
 	opts.RopeFrequencyBase = 0.0
 	opts.RopeFrequencyScale = 0.0
-	gpuInfo := gpu.GetGPUInfo()
-	return newLlmServer(gpuInfo.Library, model, adapters, projectors, opts)
+	return newLlmServer(library, model, adapters, projectors, opts)
 }
 
 // Give any native cgo implementations an opportunity to initialize
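The second hunk is what lets the fallback take effect: the deleted lines re-read the library from a fresh gpu.GetGPUInfo() call right before starting the server, so a library variable that had been downgraded to "cpu" would not reach newLlmServer, while the added line passes the local variable instead. Below is a small hypothetical sketch of that difference; the types and stand-in functions are illustrations, not ollama's API, and only the contrast between the two calls is taken from the diff.

package main

import "fmt"

// Hypothetical stand-ins for the identifiers in the diff; only the control
// flow being contrasted here is taken from the commit.
type gpuInfo struct{ Library string }

func getGPUInfo() gpuInfo { return gpuInfo{Library: "cuda"} } // probe always reports "cuda" here

func newLlmServer(library string) string { return "runner:" + library }

func main() {
	// Earlier in the function, the VRAM check may have downgraded the choice.
	library := "cpu"

	// Old shape: the library was re-read from the GPU probe, so the
	// downgrade above never influenced the server constructor.
	fmt.Println(newLlmServer(getGPUInfo().Library)) // runner:cuda

	// New shape: the already-adjusted local variable is used, so the
	// CPU runner really is started when the CUDA alloc will not fit.
	fmt.Println(newLlmServer(library)) // runner:cpu
}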