llm: make load time stall duration configurable via OLLAMA_LOAD_TIMEOUT

With the new very large parameter models, some users are willing to wait for a very long time for models to load.
2024-09-05 14:00:08 -07:00 · 2024-09-05 14:00:08 -07:00 · 6719097649
commit 6719097649
parent b05c9e83d9
4 changed files with 60 additions and 7 deletions
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@ -1422,6 +1422,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_FLASH_ATTENTION"],
 				envVars["OLLAMA_LLM_LIBRARY"],
 				envVars["OLLAMA_GPU_OVERHEAD"],
 				envVars["OLLAMA_LOAD_TIMEOUT"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)
--- a/envconfig/config.go
+++ b/envconfig/config.go
@ -112,6 +112,26 @@ func KeepAlive() (keepAlive time.Duration) {
 	return keepAlive
 }
 // LoadTimeout returns the duration for stall detection during model loads. LoadTimeout can be configured via the OLLAMA_LOAD_TIMEOUT environment variable.
 // Zero or Negative values are treated as infinite.
 // Default is 5 minutes.
 func LoadTimeout() (loadTimeout time.Duration) {
 	loadTimeout = 5 * time.Minute
 	if s := Var("OLLAMA_LOAD_TIMEOUT"); s != "" {
 		if d, err := time.ParseDuration(s); err == nil {
 			loadTimeout = d
 		} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
 			loadTimeout = time.Duration(n) * time.Second
 		}
 	}
 	if loadTimeout <= 0 {
 		return time.Duration(math.MaxInt64)
 	}
 	return loadTimeout
 }
 func Bool(k string) func() bool {
 	return func() bool {
 		if s := Var(k); s != "" {
@ -245,10 +265,8 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
 	}
 }
-var (
+// Set aside VRAM per GPU
-	// Set aside VRAM per GPU
+var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
 	GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
 )
 type EnvVar struct {
 	Name        string
@ -264,6 +282,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
 		"OLLAMA_LOAD_TIMEOUT":      {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"},
 		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
 		"OLLAMA_MODELS":            {"OLLAMA_MODELS", Models(), "The path to the models directory"},
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@ -215,6 +215,40 @@ func TestKeepAlive(t *testing.T) {
 	}
 }
 func TestLoadTimeout(t *testing.T) {
 	defaultTimeout := 5 * time.Minute
 	cases := map[string]time.Duration{
 		"":       defaultTimeout,
 		"1s":     time.Second,
 		"1m":     time.Minute,
 		"1h":     time.Hour,
 		"5m0s":   defaultTimeout,
 		"1h2m3s": 1*time.Hour + 2*time.Minute + 3*time.Second,
 		"0":      time.Duration(math.MaxInt64),
 		"60":     60 * time.Second,
 		"120":    2 * time.Minute,
 		"3600":   time.Hour,
 		"-0":     time.Duration(math.MaxInt64),
 		"-1":     time.Duration(math.MaxInt64),
 		"-1m":    time.Duration(math.MaxInt64),
 		// invalid values
 		" ":   defaultTimeout,
 		"???": defaultTimeout,
 		"1d":  defaultTimeout,
 		"1y":  defaultTimeout,
 		"1w":  defaultTimeout,
 	}
 	for tt, expect := range cases {
 		t.Run(tt, func(t *testing.T) {
 			t.Setenv("OLLAMA_LOAD_TIMEOUT", tt)
 			if actual := LoadTimeout(); actual != expect {
 				t.Errorf("%s: expected %s, got %s", tt, expect, actual)
 			}
 		})
 	}
 }
 func TestVar(t *testing.T) {
 	cases := map[string]string{
 		"value":       "value",
--- a/llm/server.go
+++ b/llm/server.go
@ -584,8 +584,7 @@ func (s *llmServer) Ping(ctx context.Context) error {
 func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 	start := time.Now()
-	stallDuration := 5 * time.Minute            // If no progress happens
+	stallDuration := envconfig.LoadTimeout()    // If no progress happens
 	finalLoadDuration := 5 * time.Minute        // After we hit 100%, give the runner more time to come online
 	stallTimer := time.Now().Add(stallDuration) // give up if we stall
 	slog.Info("waiting for llama runner to start responding")
@ -637,7 +636,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 				stallTimer = time.Now().Add(stallDuration)
 			} else if !fullyLoaded && int(s.loadProgress*100.0) >= 100 {
 				slog.Debug("model load completed, waiting for server to become available", "status", status.ToString())
-				stallTimer = time.Now().Add(finalLoadDuration)
+				stallTimer = time.Now().Add(stallDuration)
 				fullyLoaded = true
 			}
 			time.Sleep(time.Millisecond * 250)