Wait for GPU free memory reporting to converge
GPU drivers can take a while to update their free-memory reporting. Before starting another runner, wait until the reported values converge with what we expect, so that we have an accurate picture of available VRAM.
This commit is contained in:
parent
dc18eee39d
commit
354ad9254e
2 changed files with 61 additions and 3 deletions
|
@ -8,14 +8,14 @@ import (
|
||||||
|
|
||||||
func GetCPUVariant() string {
|
func GetCPUVariant() string {
|
||||||
if cpu.X86.HasAVX2 {
|
if cpu.X86.HasAVX2 {
|
||||||
slog.Info("CPU has AVX2")
|
slog.Debug("CPU has AVX2")
|
||||||
return "avx2"
|
return "avx2"
|
||||||
}
|
}
|
||||||
if cpu.X86.HasAVX {
|
if cpu.X86.HasAVX {
|
||||||
slog.Info("CPU has AVX")
|
slog.Debug("CPU has AVX")
|
||||||
return "avx"
|
return "avx"
|
||||||
}
|
}
|
||||||
slog.Info("CPU does not have vector extensions")
|
slog.Debug("CPU does not have vector extensions")
|
||||||
// else LCD
|
// else LCD
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
|
@ -265,11 +265,14 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
|
||||||
|
|
||||||
s.loadedMu.Lock()
|
s.loadedMu.Lock()
|
||||||
slog.Debug("got lock to unload", "model", runner.model)
|
slog.Debug("got lock to unload", "model", runner.model)
|
||||||
|
finished := runner.waitForVRAMRecovery()
|
||||||
runner.unload()
|
runner.unload()
|
||||||
delete(s.loaded, runner.model)
|
delete(s.loaded, runner.model)
|
||||||
s.loadedMu.Unlock()
|
s.loadedMu.Unlock()
|
||||||
slog.Debug("runner released", "model", runner.model)
|
slog.Debug("runner released", "model", runner.model)
|
||||||
runner.refMu.Unlock()
|
runner.refMu.Unlock()
|
||||||
|
|
||||||
|
<-finished
|
||||||
slog.Debug("sending an unloaded event", "model", runner.model)
|
slog.Debug("sending an unloaded event", "model", runner.model)
|
||||||
s.unloadedCh <- struct{}{}
|
s.unloadedCh <- struct{}{}
|
||||||
}
|
}
|
||||||
|
@ -465,6 +468,61 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Free memory reporting on GPUs can lag for a while even after the runner
|
||||||
|
// exits, so we have to keep checking until we see the available memory recover,
|
||||||
|
// otherwise subsequent model loads will get far less layers loaded or worse
|
||||||
|
// case, may completely fall back to CPU mode.
|
||||||
|
// This routine must be called before the runner unloads so it can establish
|
||||||
|
// a before and after GPU memory allocation. The returned channel
|
||||||
|
// will be notified when we're done waiting, or have timed out and should
|
||||||
|
// proceed anyway
|
||||||
|
func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
|
||||||
|
finished := make(chan interface{}, 1)
|
||||||
|
|
||||||
|
// CPU or Metal don't need checking, so no waiting required
|
||||||
|
if len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal") {
|
||||||
|
finished <- struct{}{}
|
||||||
|
return finished
|
||||||
|
}
|
||||||
|
start := time.Now()
|
||||||
|
|
||||||
|
// Establish a baseline before we unload
|
||||||
|
gpusBefore := gpu.GetGPUInfo()
|
||||||
|
var totalMemoryBefore, freeMemoryBefore uint64
|
||||||
|
for _, gpu := range gpusBefore {
|
||||||
|
totalMemoryBefore += gpu.TotalMemory
|
||||||
|
freeMemoryBefore += gpu.FreeMemory
|
||||||
|
}
|
||||||
|
go func() {
|
||||||
|
expiresAt := start.Add(5 * time.Second) // typical convergence is 0.5-1.5s
|
||||||
|
ticker := time.NewTicker(250 * time.Millisecond)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
<-ticker.C
|
||||||
|
if time.Now().After(expiresAt) {
|
||||||
|
slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds())
|
||||||
|
finished <- struct{}{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Query GPUs, look for free to go back up
|
||||||
|
gpusNow := gpu.GetGPUInfo()
|
||||||
|
var totalMemoryNow, freeMemoryNow uint64
|
||||||
|
for _, gpu := range gpusNow {
|
||||||
|
totalMemoryNow += gpu.TotalMemory
|
||||||
|
freeMemoryNow += gpu.FreeMemory
|
||||||
|
}
|
||||||
|
// If we're within ~80% of the estimated memory usage recovered, bail out
|
||||||
|
if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
|
||||||
|
slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()))
|
||||||
|
finished <- struct{}{}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return finished
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
// ByDuration appears to implement sort.Interface over runner references.
// NOTE(review): the name suggests ordering by session duration; confirm
// against the Less method, which is not visible in this chunk.
type ByDuration []*runnerRef

// Len reports the number of runners in the collection (sort.Interface).
func (a ByDuration) Len() int { return len(a) }
|
Loading…
Reference in a new issue