From 345420998e90090d2d6fba38ad5c2f3f5512adf4 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 22 Jul 2024 11:57:26 -0700 Subject: [PATCH] Prevent partial loading on mixed GPU brands In multi-brand GPU setups, if we couldn't fully load the model we would fall through the scheduler and mistakenly try to load across a mix of brands. This makes sure we find the set of GPU(s) that best fits the partial load. --- server/sched.go | 31 +++++++++++++++++++++++++++---- server/sched_test.go | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/server/sched.go b/server/sched.go index 2daed3ab..92b8d508 100644 --- a/server/sched.go +++ b/server/sched.go @@ -212,9 +212,12 @@ func (s *Scheduler) processPending(ctx context.Context) { } else if loadedCount == 0 { // No models loaded. Load the model but prefer the best fit. slog.Debug("loading first model", "model", pending.model.ModelPath) - g := pickBestFitGPUs(pending, ggml, gpus, &numParallel) + g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel) if g != nil { gpus = g + } else { + // Only allow partial loads when this is the first model + gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel) } s.loadFn(pending, ggml, gpus, numParallel) break @@ -231,7 +234,7 @@ func (s *Scheduler) processPending(ctx context.Context) { // Update free memory from currently loaded models s.updateFreeSpace(availGpus) - fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel) + fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel) if fitGpus != nil { slog.Debug("new model fits with existing models, loading") s.loadFn(pending, ggml, fitGpus, numParallel) @@ -668,11 +671,12 @@ func (a ByDuration) Less(i, j int) bool { // func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] } // func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM } -// pickBestFitGPUs will try to find the optimal placement of 
the model in the available GPUs where the model fully fits +// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits +// The list of GPUs returned will always be the same brand (library) // If the model can not be fit fully within the available GPU(s) nil is returned // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust // opts.NumCtx accordingly -func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { +func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { var estimatedVRAM uint64 var numParallelToTry []int @@ -723,6 +727,25 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP return nil } +// If multiple Libraries are detected, pick the Library which loads the most layers for the model +func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { + *numParallel = 1 + byLibrary := gpus.ByLibrary() + if len(byLibrary) <= 1 { + return gpus + } + var bestEstimate uint64 + var bestFit int + for i, gl := range byLibrary { + _, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts) + if estimatedVRAM > bestEstimate { + bestEstimate = estimatedVRAM + bestFit = i + } + } + return byLibrary[bestFit] +} + // findRunnerToUnload finds a runner to unload to make room for a new model func (s *Scheduler) findRunnerToUnload() *runnerRef { s.loadedMu.Lock() diff --git a/server/sched_test.go b/server/sched_test.go index 9ddd1fab..a186ce0e 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -666,6 +666,45 @@ func TestAlreadyCanceled(t *testing.T) { require.Empty(t, scenario1a.req.successCh) } +func TestHomogeneousGPUs(t *testing.T) { + ctx, done := 
context.WithTimeout(context.Background(), 100*time.Millisecond) + defer done() + s := InitScheduler(ctx) + + s.getGpuFn = func() gpu.GpuInfoList { + // Set memory values to require the model to be spread + gpus := []gpu.GpuInfo{ + {Library: "cuda"}, + {Library: "rocm"}, + } + gpus[0].TotalMemory = 1 * format.GibiByte + gpus[0].FreeMemory = 256 * format.MebiByte + gpus[1].TotalMemory = 1 * format.GibiByte + gpus[1].FreeMemory = 256 * format.MebiByte + return gpus + } + s.getCpuFn = getCpuFn + a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}) + s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + require.Len(t, gpus, 1) + return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel) + } + slog.Info("a") + s.pendingReqCh <- a.req + require.Len(t, s.pendingReqCh, 1) + s.Run(ctx) + select { + case resp := <-a.req.successCh: + require.Equal(t, resp.llama, a.srv) + require.Empty(t, s.pendingReqCh) + require.Empty(t, a.req.errCh) + case err := <-a.req.errCh: + t.Fatal(err.Error()) + case <-ctx.Done(): + t.Fatal("timeout") + } +} + type mockLlm struct { pingResp error waitResp error