llm: make load time stall duration configurable via OLLAMA_LOAD_TIMEOUT
With the new very large parameter models, some users are willing to wait for a very long time for models to load.
parent b05c9e83d9
commit 6719097649

4 changed files with 60 additions and 7 deletions
@@ -1422,6 +1422,7 @@ func NewCLI() *cobra.Command {
 			envVars["OLLAMA_FLASH_ATTENTION"],
 			envVars["OLLAMA_LLM_LIBRARY"],
 			envVars["OLLAMA_GPU_OVERHEAD"],
+			envVars["OLLAMA_LOAD_TIMEOUT"],
 		})
 	default:
 		appendEnvDocs(cmd, envs)

@@ -112,6 +112,26 @@ func KeepAlive() (keepAlive time.Duration) {
 	return keepAlive
 }
 
+// LoadTimeout returns the duration for stall detection during model loads. LoadTimeout can be configured via the OLLAMA_LOAD_TIMEOUT environment variable.
+// Zero or Negative values are treated as infinite.
+// Default is 5 minutes.
+func LoadTimeout() (loadTimeout time.Duration) {
+	loadTimeout = 5 * time.Minute
+	if s := Var("OLLAMA_LOAD_TIMEOUT"); s != "" {
+		if d, err := time.ParseDuration(s); err == nil {
+			loadTimeout = d
+		} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
+			loadTimeout = time.Duration(n) * time.Second
+		}
+	}
+
+	if loadTimeout <= 0 {
+		return time.Duration(math.MaxInt64)
+	}
+
+	return loadTimeout
+}
+
 func Bool(k string) func() bool {
 	return func() bool {
 		if s := Var(k); s != "" {

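As a quick illustration of the parsing rules added above (not part of the commit; the helper name parseLoadTimeout and the sample values are made up for this note), here is a self-contained sketch that mirrors the same fallback order: a Go duration string is tried first, then a bare integer interpreted as seconds, and anything that ends up zero or negative is treated as "no timeout".

package main

import (
	"fmt"
	"math"
	"strconv"
	"time"
)

// parseLoadTimeout mirrors the fallback order used by the new LoadTimeout:
// try a duration string first, then a bare integer number of seconds.
func parseLoadTimeout(s string) time.Duration {
	timeout := 5 * time.Minute // default when unset or unparsable
	if s != "" {
		if d, err := time.ParseDuration(s); err == nil {
			timeout = d
		} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
			timeout = time.Duration(n) * time.Second
		}
	}
	if timeout <= 0 {
		return time.Duration(math.MaxInt64) // zero/negative means wait forever
	}
	return timeout
}

func main() {
	// "" keeps the default, "10m" parses as a duration, "3600" as seconds,
	// "0" disables the timeout, and unparsable input falls back to the default.
	for _, s := range []string{"", "10m", "3600", "0", "bogus"} {
		fmt.Printf("%-8q -> %s\n", s, parseLoadTimeout(s))
	}
}
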
@@ -245,10 +265,8 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
 	}
 }
 
-var (
-	// Set aside VRAM per GPU
-	GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
-)
+// Set aside VRAM per GPU
+var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
 
 type EnvVar struct {
 	Name string

@@ -264,6 +282,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
+		"OLLAMA_LOAD_TIMEOUT": {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"},
 		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
 		"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},

@@ -215,6 +215,40 @@ func TestKeepAlive(t *testing.T) {
 	}
 }
 
+func TestLoadTimeout(t *testing.T) {
+	defaultTimeout := 5 * time.Minute
+	cases := map[string]time.Duration{
+		"":       defaultTimeout,
+		"1s":     time.Second,
+		"1m":     time.Minute,
+		"1h":     time.Hour,
+		"5m0s":   defaultTimeout,
+		"1h2m3s": 1*time.Hour + 2*time.Minute + 3*time.Second,
+		"0":      time.Duration(math.MaxInt64),
+		"60":     60 * time.Second,
+		"120":    2 * time.Minute,
+		"3600":   time.Hour,
+		"-0":     time.Duration(math.MaxInt64),
+		"-1":     time.Duration(math.MaxInt64),
+		"-1m":    time.Duration(math.MaxInt64),
+		// invalid values
+		" ":   defaultTimeout,
+		"???": defaultTimeout,
+		"1d":  defaultTimeout,
+		"1y":  defaultTimeout,
+		"1w":  defaultTimeout,
+	}
+
+	for tt, expect := range cases {
+		t.Run(tt, func(t *testing.T) {
+			t.Setenv("OLLAMA_LOAD_TIMEOUT", tt)
+			if actual := LoadTimeout(); actual != expect {
+				t.Errorf("%s: expected %s, got %s", tt, expect, actual)
+			}
+		})
+	}
+}
+
 func TestVar(t *testing.T) {
 	cases := map[string]string{
 		"value": "value",

@@ -584,8 +584,7 @@ func (s *llmServer) Ping(ctx context.Context) error {
 
 func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 	start := time.Now()
-	stallDuration := 5 * time.Minute            // If no progress happens
-	finalLoadDuration := 5 * time.Minute        // After we hit 100%, give the runner more time to come online
+	stallDuration := envconfig.LoadTimeout()    // If no progress happens
 	stallTimer := time.Now().Add(stallDuration) // give up if we stall
 
 	slog.Info("waiting for llama runner to start responding")

@@ -637,7 +636,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 			stallTimer = time.Now().Add(stallDuration)
 		} else if !fullyLoaded && int(s.loadProgress*100.0) >= 100 {
 			slog.Debug("model load completed, waiting for server to become available", "status", status.ToString())
-			stallTimer = time.Now().Add(finalLoadDuration)
+			stallTimer = time.Now().Add(stallDuration)
 			fullyLoaded = true
 		}
 		time.Sleep(time.Millisecond * 250)

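For context on the server-side change: with finalLoadDuration removed, the same configurable stall duration is reused both while load progress is being reported and after progress reaches 100%. The following standalone sketch is not from the ollama codebase; the waitWithStallTimer helper and the progress callback are invented here to illustrate the underlying pattern of a deadline that is pushed forward whenever progress is observed.

package main

import (
	"errors"
	"fmt"
	"time"
)

// waitWithStallTimer keeps a deadline that is reset every time progress is
// observed, and gives up only when no progress happens for the whole stall
// duration. This mirrors the stallTimer handling in WaitUntilRunning.
func waitWithStallTimer(stall time.Duration, progress func() (float64, bool)) error {
	deadline := time.Now().Add(stall)
	last := -1.0
	for {
		p, ready := progress() // made-up stand-in for querying the runner status
		if ready {
			return nil
		}
		if p > last {
			last = p
			deadline = time.Now().Add(stall) // progress was made: reset the stall timer
		}
		if time.Now().After(deadline) {
			return errors.New("timed out waiting for model load")
		}
		time.Sleep(50 * time.Millisecond)
	}
}

func main() {
	steps := 0
	err := waitWithStallTimer(2*time.Second, func() (float64, bool) {
		steps++
		return float64(steps) / 10, steps >= 10 // pretend the load finishes after 10 polls
	})
	fmt.Println("result:", err)
}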