Restructure loading conditional chain

2024-04-24 17:37:03 -06:00 · 2024-04-24 17:37:03 -06:00 · 36a6daccab
commit 36a6daccab
parent ceb0e26e5e
2 changed files with 18 additions and 19 deletions
--- a/server/sched.go
+++ b/server/sched.go
@@ -123,36 +123,35 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						pending.useLoadedRunner(runner, s.finishedReqCh)
 						break
 					}
-				} else if loadedCount == 0 {
-					slog.Debug("loading first model", "model", pending.model.ModelPath)
+				} else if loadedMax > 0 && loadedCount >= loadedMax {
+					slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
+					runnerToExpire = s.findRunnerToUnload(pending)
+				} else {
+					// Either no models are loaded or below loadedMax
+					// Get a refreshed GPU list
 					gpus := s.getGpuFn()

+					// Load model for fitting
 					ggml, err := llm.LoadModel(pending.model.ModelPath)
 					if err != nil {
 						pending.errCh <- err
 						break
 					}
+
+					// No models loaded. Load the model but prefer the best fit.
+					if loadedCount == 0 {
+						slog.Debug("loading first model", "model", pending.model.ModelPath)
 						g := pickBestFitGPUs(pending, ggml, gpus)
 						if g != nil {
 							gpus = g
 						}
 						s.loadFn(pending, ggml, gpus)
 						break
-				} else if loadedMax > 0 && loadedCount >= loadedMax {
-					slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
-					runnerToExpire = s.findRunnerToUnload(pending)
-				} else {
+					}
+
 					// More than one loaded model, so we have to see if the new one fits
-					// Get a refreshed GPU list
-					gpus := s.getGpuFn()
 					// Update free memory from currently loaded models
 					s.updateFreeSpace(gpus)
-
-					ggml, err := llm.LoadModel(pending.model.ModelPath)
-					if err != nil {
-						pending.errCh <- err
-						break
-					}
 					gpus = pickBestFitGPUs(pending, ggml, gpus)
 					if gpus != nil {
 						slog.Debug("new model fits with existing models, loading")
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -47,7 +47,7 @@ func TestLoad(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
-	ggml := nil // value not used in tests
+	var ggml *llm.GGML // value not used in tests
 	req := &LlmRequest{
 		ctx:             ctx,
 		model:           &Model{ModelPath: "foo"},