llm: make load time stall duration configurable via OLLAMA_LOAD_TIMEOUT
With the new very large parameter models, some users are willing to wait for a very long time for models to load.
This commit is contained in:
parent
b05c9e83d9
commit
6719097649
4 changed files with 60 additions and 7 deletions
|
@ -1422,6 +1422,7 @@ func NewCLI() *cobra.Command {
|
|||
envVars["OLLAMA_FLASH_ATTENTION"],
|
||||
envVars["OLLAMA_LLM_LIBRARY"],
|
||||
envVars["OLLAMA_GPU_OVERHEAD"],
|
||||
envVars["OLLAMA_LOAD_TIMEOUT"],
|
||||
})
|
||||
default:
|
||||
appendEnvDocs(cmd, envs)
|
||||
|
|
|
@ -112,6 +112,26 @@ func KeepAlive() (keepAlive time.Duration) {
|
|||
return keepAlive
|
||||
}
|
||||
|
||||
// LoadTimeout returns the duration for stall detection during model loads. LoadTimeout can be configured via the OLLAMA_LOAD_TIMEOUT environment variable.
|
||||
// Zero or Negative values are treated as infinite.
|
||||
// Default is 5 minutes.
|
||||
func LoadTimeout() (loadTimeout time.Duration) {
|
||||
loadTimeout = 5 * time.Minute
|
||||
if s := Var("OLLAMA_LOAD_TIMEOUT"); s != "" {
|
||||
if d, err := time.ParseDuration(s); err == nil {
|
||||
loadTimeout = d
|
||||
} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
|
||||
loadTimeout = time.Duration(n) * time.Second
|
||||
}
|
||||
}
|
||||
|
||||
if loadTimeout <= 0 {
|
||||
return time.Duration(math.MaxInt64)
|
||||
}
|
||||
|
||||
return loadTimeout
|
||||
}
|
||||
|
||||
func Bool(k string) func() bool {
|
||||
return func() bool {
|
||||
if s := Var(k); s != "" {
|
||||
|
@ -245,10 +265,8 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
|
|||
}
|
||||
}
|
||||
|
||||
var (
|
||||
// Set aside VRAM per GPU
|
||||
GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
|
||||
)
|
||||
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
|
||||
|
||||
type EnvVar struct {
|
||||
Name string
|
||||
|
@ -264,6 +282,7 @@ func AsMap() map[string]EnvVar {
|
|||
"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
|
||||
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
|
||||
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
|
||||
"OLLAMA_LOAD_TIMEOUT": {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"},
|
||||
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
|
||||
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
|
||||
"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},
|
||||
|
|
|
@ -215,6 +215,40 @@ func TestKeepAlive(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestLoadTimeout(t *testing.T) {
|
||||
defaultTimeout := 5 * time.Minute
|
||||
cases := map[string]time.Duration{
|
||||
"": defaultTimeout,
|
||||
"1s": time.Second,
|
||||
"1m": time.Minute,
|
||||
"1h": time.Hour,
|
||||
"5m0s": defaultTimeout,
|
||||
"1h2m3s": 1*time.Hour + 2*time.Minute + 3*time.Second,
|
||||
"0": time.Duration(math.MaxInt64),
|
||||
"60": 60 * time.Second,
|
||||
"120": 2 * time.Minute,
|
||||
"3600": time.Hour,
|
||||
"-0": time.Duration(math.MaxInt64),
|
||||
"-1": time.Duration(math.MaxInt64),
|
||||
"-1m": time.Duration(math.MaxInt64),
|
||||
// invalid values
|
||||
" ": defaultTimeout,
|
||||
"???": defaultTimeout,
|
||||
"1d": defaultTimeout,
|
||||
"1y": defaultTimeout,
|
||||
"1w": defaultTimeout,
|
||||
}
|
||||
|
||||
for tt, expect := range cases {
|
||||
t.Run(tt, func(t *testing.T) {
|
||||
t.Setenv("OLLAMA_LOAD_TIMEOUT", tt)
|
||||
if actual := LoadTimeout(); actual != expect {
|
||||
t.Errorf("%s: expected %s, got %s", tt, expect, actual)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestVar(t *testing.T) {
|
||||
cases := map[string]string{
|
||||
"value": "value",
|
||||
|
|
|
@ -584,8 +584,7 @@ func (s *llmServer) Ping(ctx context.Context) error {
|
|||
|
||||
func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
|
||||
start := time.Now()
|
||||
stallDuration := 5 * time.Minute // If no progress happens
|
||||
finalLoadDuration := 5 * time.Minute // After we hit 100%, give the runner more time to come online
|
||||
stallDuration := envconfig.LoadTimeout() // If no progress happens
|
||||
stallTimer := time.Now().Add(stallDuration) // give up if we stall
|
||||
|
||||
slog.Info("waiting for llama runner to start responding")
|
||||
|
@ -637,7 +636,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
|
|||
stallTimer = time.Now().Add(stallDuration)
|
||||
} else if !fullyLoaded && int(s.loadProgress*100.0) >= 100 {
|
||||
slog.Debug("model load completed, waiting for server to become available", "status", status.ToString())
|
||||
stallTimer = time.Now().Add(finalLoadDuration)
|
||||
stallTimer = time.Now().Add(stallDuration)
|
||||
fullyLoaded = true
|
||||
}
|
||||
time.Sleep(time.Millisecond * 250)
|
||||
|
|
Loading…
Reference in a new issue