diff --git a/gpu/cpu_common.go b/gpu/cpu_common.go
index 3b299e42..920d0f5b 100644
--- a/gpu/cpu_common.go
+++ b/gpu/cpu_common.go
@@ -8,14 +8,14 @@ import (
 
 func GetCPUVariant() string {
 	if cpu.X86.HasAVX2 {
-		slog.Info("CPU has AVX2")
+		slog.Debug("CPU has AVX2")
 		return "avx2"
 	}
 	if cpu.X86.HasAVX {
-		slog.Info("CPU has AVX")
+		slog.Debug("CPU has AVX")
 		return "avx"
 	}
-	slog.Info("CPU does not have vector extensions")
+	slog.Debug("CPU does not have vector extensions")
 	// else LCD
 	return ""
 }
diff --git a/server/sched.go b/server/sched.go
index c4a071c1..96235ea5 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -265,11 +265,14 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
 
 			s.loadedMu.Lock()
 			slog.Debug("got lock to unload", "model", runner.model)
+			finished := runner.waitForVRAMRecovery()
 			runner.unload()
 			delete(s.loaded, runner.model)
 			s.loadedMu.Unlock()
 			slog.Debug("runner released", "model", runner.model)
 			runner.refMu.Unlock()
+
+			<-finished
 			slog.Debug("sending an unloaded event", "model", runner.model)
 			s.unloadedCh <- struct{}{}
 		}
@@ -465,6 +468,61 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 	return false
 }
 
+// Free memory reporting on GPUs can lag for a while even after the runner
+// exits, so we have to keep checking until we see the available memory
+// recover; otherwise subsequent model loads will get far fewer layers loaded
+// or, in the worst case, may fall back to CPU mode entirely.
+// This routine must be called before the runner unloads so it can establish
+// a before-and-after view of GPU memory allocation. The returned channel
+// is notified when we're done waiting, or have timed out and should proceed
+// anyway.
+func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
+	finished := make(chan interface{}, 1)
+
+	// CPU or Metal don't need checking, so no waiting required
+	if len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal") {
+		finished <- struct{}{}
+		return finished
+	}
+	start := time.Now()
+
+	// Establish a baseline before we unload
+	gpusBefore := gpu.GetGPUInfo()
+	var totalMemoryBefore, freeMemoryBefore uint64
+	for _, gpu := range gpusBefore {
+		totalMemoryBefore += gpu.TotalMemory
+		freeMemoryBefore += gpu.FreeMemory
+	}
+	go func() {
+		expiresAt := start.Add(5 * time.Second) // typical convergence is 0.5-1.5s
+		ticker := time.NewTicker(250 * time.Millisecond)
+		defer ticker.Stop()
+		for {
+			<-ticker.C
+			if time.Now().After(expiresAt) {
+				slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds())
+				finished <- struct{}{}
+				return
+			}
+
+			// Query GPUs, look for free memory to go back up
+			gpusNow := gpu.GetGPUInfo()
+			var totalMemoryNow, freeMemoryNow uint64
+			for _, gpu := range gpusNow {
+				totalMemoryNow += gpu.TotalMemory
+				freeMemoryNow += gpu.FreeMemory
+			}
+			// If we're within ~80% of the estimated memory usage recovered, bail out
+			if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
+				slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()))
+				finished <- struct{}{}
+				return
+			}
+		}
+	}()
+	return finished
+}
+
 type ByDuration []*runnerRef
 
 func (a ByDuration) Len() int { return len(a) }
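For reference, here is a minimal, self-contained sketch of the pattern the sched.go hunks introduce: a polling goroutine signals a buffered channel once free memory has climbed back by roughly the expected amount, or once a timeout expires, so the caller can start the watch before unloading and block on it only after releasing its locks. The probeFreeMemory helper, the 2-second timeout, and the 8 GiB figure below are illustrative stand-ins, not ollama's gpu.GetGPUInfo or runner.estimatedVRAM.

package main

import (
	"log/slog"
	"time"
)

// probeFreeMemory stands in for summing FreeMemory across gpu.GetGPUInfo().
// A real implementation would query the driver; this stub never recovers,
// so running the example exercises the timeout path.
func probeFreeMemory() uint64 { return 0 }

// waitForRecovery polls free memory on a ticker until it has risen by ~80%
// of the expected amount, or the timeout passes. The channel is buffered so
// the goroutine can signal and exit even if the caller has moved on.
func waitForRecovery(expected uint64, timeout time.Duration) chan struct{} {
	finished := make(chan struct{}, 1)
	before := probeFreeMemory()
	start := time.Now()

	go func() {
		ticker := time.NewTicker(250 * time.Millisecond)
		defer ticker.Stop()
		for range ticker.C {
			if time.Since(start) > timeout {
				slog.Warn("free memory did not recover before timeout")
				finished <- struct{}{}
				return
			}
			now := probeFreeMemory()
			if now > before && float64(now-before) > float64(expected)*0.8 {
				slog.Info("free memory recovered", "seconds", time.Since(start).Seconds())
				finished <- struct{}{}
				return
			}
		}
	}()
	return finished
}

func main() {
	// Start watching before tearing the resource down, mirroring how
	// processCompleted calls waitForVRAMRecovery before runner.unload().
	done := waitForRecovery(8<<30, 2*time.Second)
	// ... unload the model / free the resource here ...
	<-done // block only after locks are released, as the patch does
}

Buffering the channel with capacity 1 is what lets the goroutine signal completion and exit even if the receiver is delayed, which is why the patch can safely wait on <-finished after runner.refMu.Unlock().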