Prevent partial loading on mixed GPU brands
In multi-brand GPU setups, if we couldn't fully load the model we would fall through the scheduler and mistakenly try to load across a mix of brands. This change makes sure we find the single-brand set of GPU(s) that best fits the partial load.
parent 0be8baad2b
commit 345420998e
2 changed files with 66 additions and 4 deletions
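The fix hinges on grouping the detected GPUs by library (brand) and placing a partial load inside exactly one group. Below is a minimal, self-contained sketch of that grouping idea; gpuInfo and byLibrary here are simplified stand-ins for illustration only, not the gpu.GpuInfoList.ByLibrary() helper the scheduler actually calls.

// Simplified stand-ins -- NOT the ollama gpu package.
package main

import "fmt"

type gpuInfo struct {
	Library    string // e.g. "cuda" or "rocm"
	FreeMemory uint64
}

// byLibrary splits a mixed GPU list into per-brand sub-lists, mirroring the
// rule that a single model load must never span GPU libraries.
func byLibrary(gpus []gpuInfo) [][]gpuInfo {
	index := map[string]int{}
	var groups [][]gpuInfo
	for _, g := range gpus {
		i, ok := index[g.Library]
		if !ok {
			i = len(groups)
			index[g.Library] = i
			groups = append(groups, nil)
		}
		groups[i] = append(groups[i], g)
	}
	return groups
}

func main() {
	mixed := []gpuInfo{
		{Library: "cuda", FreeMemory: 256 << 20},
		{Library: "rocm", FreeMemory: 256 << 20},
	}
	// Each sub-list is a candidate placement; a partial load picks exactly one.
	for _, group := range byLibrary(mixed) {
		fmt.Println("candidate set:", group)
	}
}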
@@ -212,9 +212,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				} else if loadedCount == 0 {
 					// No models loaded. Load the model but prefer the best fit.
 					slog.Debug("loading first model", "model", pending.model.ModelPath)
-					g := pickBestFitGPUs(pending, ggml, gpus, &numParallel)
+					g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
 					if g != nil {
 						gpus = g
+					} else {
+						// Only allow partial loads when this is the first model
+						gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
 					}
 					s.loadFn(pending, ggml, gpus, numParallel)
 					break
@@ -231,7 +234,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				// Update free memory from currently loaded models
 				s.updateFreeSpace(availGpus)
-				fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel)
+				fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
 				if fitGpus != nil {
 					slog.Debug("new model fits with existing models, loading")
 					s.loadFn(pending, ggml, fitGpus, numParallel)
@@ -668,11 +671,12 @@ func (a ByDuration) Less(i, j int) bool {
 // func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
 // func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }

-// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
+// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
+// The list of GPUs returned will always be the same brand (library)
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
 	var estimatedVRAM uint64

 	var numParallelToTry []int
@@ -723,6 +727,25 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 	return nil
 }

+// If multiple Libraries are detected, pick the Library which loads the most layers for the model
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+	*numParallel = 1
+	byLibrary := gpus.ByLibrary()
+	if len(byLibrary) <= 1 {
+		return gpus
+	}
+	var bestEstimate uint64
+	var bestFit int
+	for i, gl := range byLibrary {
+		_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
+		if estimatedVRAM > bestEstimate {
+			bestEstimate = estimatedVRAM
+			bestFit = i
+		}
+	}
+	return byLibrary[bestFit]
+}
+
 // findRunnerToUnload finds a runner to unload to make room for a new model
 func (s *Scheduler) findRunnerToUnload() *runnerRef {
 	s.loadedMu.Lock()
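The selection rule in pickBestPartialFitByLibrary is an argmax over per-library fit estimates. Here is a minimal, self-contained restatement of just that rule; predictFit and the types are simplified stand-ins for llm.PredictServerFit and the gpu types, for illustration only.

// Simplified stand-ins -- NOT the ollama llm/gpu packages.
package main

import "fmt"

type gpuGroup struct {
	Library       string
	TotalFreeVRAM uint64
}

// predictFit stands in for llm.PredictServerFit: it returns how much of the
// model (by estimated VRAM) a given single-library group could hold.
func predictFit(g gpuGroup) uint64 {
	return g.TotalFreeVRAM // placeholder estimate for illustration
}

// bestPartialFit mirrors the shape of pickBestPartialFitByLibrary: keep the
// single library whose estimate is highest, never a mix of libraries.
func bestPartialFit(groups []gpuGroup) gpuGroup {
	best := 0
	var bestEstimate uint64
	for i, g := range groups {
		if est := predictFit(g); est > bestEstimate {
			bestEstimate = est
			best = i
		}
	}
	return groups[best]
}

func main() {
	groups := []gpuGroup{
		{Library: "cuda", TotalFreeVRAM: 256 << 20},
		{Library: "rocm", TotalFreeVRAM: 192 << 20},
	}
	fmt.Println("partial load goes to:", bestPartialFit(groups).Library)
}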
@@ -666,6 +666,45 @@ func TestAlreadyCanceled(t *testing.T) {
 	require.Empty(t, scenario1a.req.successCh)
 }

+func TestHomogeneousGPUs(t *testing.T) {
+	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	defer done()
+	s := InitScheduler(ctx)
+
+	s.getGpuFn = func() gpu.GpuInfoList {
+		// Set memory values to require the model to be spread
+		gpus := []gpu.GpuInfo{
+			{Library: "cuda"},
+			{Library: "rocm"},
+		}
+		gpus[0].TotalMemory = 1 * format.GibiByte
+		gpus[0].FreeMemory = 256 * format.MebiByte
+		gpus[1].TotalMemory = 1 * format.GibiByte
+		gpus[1].FreeMemory = 256 * format.MebiByte
+		return gpus
+	}
+	s.getCpuFn = getCpuFn
+	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
+	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+		require.Len(t, gpus, 1)
+		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
+	}
+	slog.Info("a")
+	s.pendingReqCh <- a.req
+	require.Len(t, s.pendingReqCh, 1)
+	s.Run(ctx)
+	select {
+	case resp := <-a.req.successCh:
+		require.Equal(t, resp.llama, a.srv)
+		require.Empty(t, s.pendingReqCh)
+		require.Empty(t, a.req.errCh)
+	case err := <-a.req.errCh:
+		t.Fatal(err.Error())
+	case <-ctx.Done():
+		t.Fatal("timeout")
+	}
+}
+
 type mockLlm struct {
 	pingResp  error
 	waitResp  error
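To exercise just the new scenario locally, something along these lines should work; the ./server package path is an assumption about where the scheduler tests live, so adjust it to match the repository layout.

go test -run TestHomogeneousGPUs -v ./server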