Use 10% VRAM overhead (minimum 400 MiB) for CUDA and ROCm

This commit is contained in:
Jeffrey Morgan 2024-01-08 23:17:44 -05:00
parent 58ce2d8273
commit cb534e6ac2
2 changed files with 6 additions and 4 deletions

View file

@ -131,10 +131,11 @@ func getCPUMem() (memInfo, error) {
func CheckVRAM() (int64, error) { func CheckVRAM() (int64, error) {
gpuInfo := GetGPUInfo() gpuInfo := GetGPUInfo()
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") { if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
// allocate 384MiB for llama.cpp overhead (outside of model) // leave 10% or 400MiB of VRAM free for overhead
overhead := uint64(384 * 1024 * 1024) overhead := gpuInfo.FreeMemory / 10
if gpuInfo.FreeMemory <= overhead { minOverhead := 400 * 1024 * 1024
return 0, nil if overhead < minOverhead {
overhead = minOverhead
} }
return int64(gpuInfo.FreeMemory - overhead), nil return int64(gpuInfo.FreeMemory - overhead), nil

View file

@ -117,6 +117,7 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers())) bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
log.Println("bytes per layer:", bytesPerLayer) log.Println("bytes per layer:", bytesPerLayer)
layers := available / bytesPerLayer layers := available / bytesPerLayer
log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
if layers < int64(opts.NumGPU) { if layers < int64(opts.NumGPU) {
opts.NumGPU = int(layers) opts.NumGPU = int(layers)
} }