Wait for GPU free memory reporting to converge

The GPU drivers take a while to update their free memory reporting, so we need
to wait until the values converge with what we're expecting before proceeding
to start another runner in order to get an accurate picture.
This commit is contained in:
Daniel Hiltgen 2024-05-09 11:44:45 -07:00
parent dc18eee39d
commit 354ad9254e
2 changed files with 61 additions and 3 deletions

View file

@ -8,14 +8,14 @@ import (
func GetCPUVariant() string { func GetCPUVariant() string {
if cpu.X86.HasAVX2 { if cpu.X86.HasAVX2 {
slog.Info("CPU has AVX2") slog.Debug("CPU has AVX2")
return "avx2" return "avx2"
} }
if cpu.X86.HasAVX { if cpu.X86.HasAVX {
slog.Info("CPU has AVX") slog.Debug("CPU has AVX")
return "avx" return "avx"
} }
slog.Info("CPU does not have vector extensions") slog.Debug("CPU does not have vector extensions")
// else LCD // else LCD
return "" return ""
} }

View file

@ -265,11 +265,14 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
s.loadedMu.Lock() s.loadedMu.Lock()
slog.Debug("got lock to unload", "model", runner.model) slog.Debug("got lock to unload", "model", runner.model)
finished := runner.waitForVRAMRecovery()
runner.unload() runner.unload()
delete(s.loaded, runner.model) delete(s.loaded, runner.model)
s.loadedMu.Unlock() s.loadedMu.Unlock()
slog.Debug("runner released", "model", runner.model) slog.Debug("runner released", "model", runner.model)
runner.refMu.Unlock() runner.refMu.Unlock()
<-finished
slog.Debug("sending an unloaded event", "model", runner.model) slog.Debug("sending an unloaded event", "model", runner.model)
s.unloadedCh <- struct{}{} s.unloadedCh <- struct{}{}
} }
@ -465,6 +468,61 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
return false return false
} }
// Free memory reporting on GPUs can lag for a while even after the runner
// exits, so we have to keep checking until we see the available memory recover,
// otherwise subsequent model loads will get far fewer layers loaded or worst
// case, may completely fall back to CPU mode.
// This routine must be called before the runner unloads so it can establish
// a before and after GPU memory allocation. The returned channel
// will be notified when we're done waiting, or have timed out and should
// proceed anyway.
func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
	finished := make(chan interface{}, 1)

	// CPU or Metal don't need checking, so no waiting required
	if len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal") {
		finished <- struct{}{}
		return finished
	}
	start := time.Now()

	// Establish a baseline before we unload
	gpusBefore := gpu.GetGPUInfo()
	var freeMemoryBefore uint64
	for _, g := range gpusBefore {
		freeMemoryBefore += g.FreeMemory
	}

	go func() {
		expiresAt := start.Add(5 * time.Second) // typical convergence is 0.5-1.5s
		ticker := time.NewTicker(250 * time.Millisecond)
		defer ticker.Stop()
		for {
			<-ticker.C
			if time.Now().After(expiresAt) {
				slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds())
				finished <- struct{}{}
				// Without this return the goroutine would keep ticking and
				// deadlock on the next send into the size-1 buffered channel.
				return
			}

			// Query GPUs, look for free to go back up
			gpusNow := gpu.GetGPUInfo()
			var freeMemoryNow uint64
			for _, g := range gpusNow {
				freeMemoryNow += g.FreeMemory
			}

			// Guard against uint64 underflow: if free memory went *down*
			// (another process grabbed VRAM), keep waiting rather than
			// wrapping around to a huge value and declaring convergence.
			if freeMemoryNow < freeMemoryBefore {
				continue
			}

			// If we're within ~80% of the estimated memory usage recovered, bail out
			if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
				slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()))
				finished <- struct{}{}
				return
			}
		}
	}()
	return finished
}
type ByDuration []*runnerRef type ByDuration []*runnerRef
func (a ByDuration) Len() int { return len(a) } func (a ByDuration) Len() int { return len(a) }