sched: don't error if paging to disk on Windows and macOS (#5523)

2024-07-06 22:01:52 -04:00 · 2024-07-06 22:01:52 -04:00 · 0ee87615c7
commit 0ee87615c7
parent f8241bfba3
1 changed file with 24 additions and 13 deletions
--- a/server/sched.go
+++ b/server/sched.go
@@ -197,9 +197,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}
 					// Block attempting to load a model larger than system memory + GPU memory
 					estimate := llm.EstimateGPULayers(gpus, ggml, pending.model.ProjectorPaths, pending.opts)
 					maxSize := systemMem.FreeMemory
 					// Add available GPU memory to the total pool
 					// macOS hardware has unified memory so don't double count
 					if runtime.GOOS != "darwin" {
 						for _, gpu := range gpus {
 							if gpu.Library == "cpu" {
 								continue
@@ -212,11 +215,19 @@ func (s *Scheduler) processPending(ctx context.Context) {
 								maxSize += gpu.TotalMemory
 							}
 						}
 					}
 					// Block attempting to load a model larger than system memory + GPU memory
 					if estimate.TotalSize > maxSize {
 						slog.Warn("model request too large for system", "requested", format.HumanBytes2(estimate.TotalSize), "system", format.HumanBytes2(maxSize))
 						// Linux will crash if over-allocating memory - return an error to the user.
 						// TODO (jmorganca): add reasonable upper limits for darwin and windows as well
 						if runtime.GOOS == "linux" {
 							pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize))
 							break
 						}
 					}
 					// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 					if len(gpus) == 1 && gpus[0].Library == "cpu" {