llm: make load time stall duration configurable via OLLAMA_LOAD_TIMEOUT

With the new very large parameter models, some users are willing to wait for
a very long time for models to load.
This commit is contained in:
Daniel Hiltgen 2024-09-05 14:00:08 -07:00 committed by GitHub
parent b05c9e83d9
commit 6719097649
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 60 additions and 7 deletions

View file

@ -1422,6 +1422,7 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_FLASH_ATTENTION"], envVars["OLLAMA_FLASH_ATTENTION"],
envVars["OLLAMA_LLM_LIBRARY"], envVars["OLLAMA_LLM_LIBRARY"],
envVars["OLLAMA_GPU_OVERHEAD"], envVars["OLLAMA_GPU_OVERHEAD"],
envVars["OLLAMA_LOAD_TIMEOUT"],
}) })
default: default:
appendEnvDocs(cmd, envs) appendEnvDocs(cmd, envs)

View file

@ -112,6 +112,26 @@ func KeepAlive() (keepAlive time.Duration) {
return keepAlive return keepAlive
} }
// LoadTimeout returns the duration for stall detection during model loads. LoadTimeout can be configured via the OLLAMA_LOAD_TIMEOUT environment variable.
// Zero or Negative values are treated as infinite.
// Default is 5 minutes.
func LoadTimeout() (loadTimeout time.Duration) {
loadTimeout = 5 * time.Minute
if s := Var("OLLAMA_LOAD_TIMEOUT"); s != "" {
if d, err := time.ParseDuration(s); err == nil {
loadTimeout = d
} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
loadTimeout = time.Duration(n) * time.Second
}
}
if loadTimeout <= 0 {
return time.Duration(math.MaxInt64)
}
return loadTimeout
}
func Bool(k string) func() bool { func Bool(k string) func() bool {
return func() bool { return func() bool {
if s := Var(k); s != "" { if s := Var(k); s != "" {
@ -245,10 +265,8 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
} }
} }
var ( // Set aside VRAM per GPU
// Set aside VRAM per GPU var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
)
type EnvVar struct { type EnvVar struct {
Name string Name string
@ -264,6 +282,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"}, "OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"}, "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"}, "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
"OLLAMA_LOAD_TIMEOUT": {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"},
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"}, "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"}, "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"}, "OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},

View file

@ -215,6 +215,40 @@ func TestKeepAlive(t *testing.T) {
} }
} }
func TestLoadTimeout(t *testing.T) {
defaultTimeout := 5 * time.Minute
cases := map[string]time.Duration{
"": defaultTimeout,
"1s": time.Second,
"1m": time.Minute,
"1h": time.Hour,
"5m0s": defaultTimeout,
"1h2m3s": 1*time.Hour + 2*time.Minute + 3*time.Second,
"0": time.Duration(math.MaxInt64),
"60": 60 * time.Second,
"120": 2 * time.Minute,
"3600": time.Hour,
"-0": time.Duration(math.MaxInt64),
"-1": time.Duration(math.MaxInt64),
"-1m": time.Duration(math.MaxInt64),
// invalid values
" ": defaultTimeout,
"???": defaultTimeout,
"1d": defaultTimeout,
"1y": defaultTimeout,
"1w": defaultTimeout,
}
for tt, expect := range cases {
t.Run(tt, func(t *testing.T) {
t.Setenv("OLLAMA_LOAD_TIMEOUT", tt)
if actual := LoadTimeout(); actual != expect {
t.Errorf("%s: expected %s, got %s", tt, expect, actual)
}
})
}
}
func TestVar(t *testing.T) { func TestVar(t *testing.T) {
cases := map[string]string{ cases := map[string]string{
"value": "value", "value": "value",

View file

@ -584,8 +584,7 @@ func (s *llmServer) Ping(ctx context.Context) error {
func (s *llmServer) WaitUntilRunning(ctx context.Context) error { func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
start := time.Now() start := time.Now()
stallDuration := 5 * time.Minute // If no progress happens stallDuration := envconfig.LoadTimeout() // If no progress happens
finalLoadDuration := 5 * time.Minute // After we hit 100%, give the runner more time to come online
stallTimer := time.Now().Add(stallDuration) // give up if we stall stallTimer := time.Now().Add(stallDuration) // give up if we stall
slog.Info("waiting for llama runner to start responding") slog.Info("waiting for llama runner to start responding")
@ -637,7 +636,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
stallTimer = time.Now().Add(stallDuration) stallTimer = time.Now().Add(stallDuration)
} else if !fullyLoaded && int(s.loadProgress*100.0) >= 100 { } else if !fullyLoaded && int(s.loadProgress*100.0) >= 100 {
slog.Debug("model load completed, waiting for server to become available", "status", status.ToString()) slog.Debug("model load completed, waiting for server to become available", "status", status.ToString())
stallTimer = time.Now().Add(finalLoadDuration) stallTimer = time.Now().Add(stallDuration)
fullyLoaded = true fullyLoaded = true
} }
time.Sleep(time.Millisecond * 250) time.Sleep(time.Millisecond * 250)