cd5c8f6471
* Optimize container images for startup This change adjusts how to handle runner payloads to support container builds where we keep them extracted in the filesystem. This makes it easier to optimize the cpu/cuda vs cpu/rocm images for size, and should result in faster startup times for container images. * Refactor payload logic and add buildx support for faster builds * Move payloads around * Review comments * Converge to buildx based helper scripts * Use docker buildx action for release
294 lines
9.8 KiB
Go
294 lines
9.8 KiB
Go
package envconfig
|
|
|
|
import (
|
|
"fmt"
|
|
"log/slog"
|
|
"math"
|
|
"net"
|
|
"net/url"
|
|
"os"
|
|
"path/filepath"
|
|
"runtime"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// Host returns the scheme and host. Host can be configured via the OLLAMA_HOST environment variable.
|
|
// Default is scheme "http" and host "127.0.0.1:11434"
|
|
func Host() *url.URL {
|
|
defaultPort := "11434"
|
|
|
|
s := strings.TrimSpace(Var("OLLAMA_HOST"))
|
|
scheme, hostport, ok := strings.Cut(s, "://")
|
|
switch {
|
|
case !ok:
|
|
scheme, hostport = "http", s
|
|
case scheme == "http":
|
|
defaultPort = "80"
|
|
case scheme == "https":
|
|
defaultPort = "443"
|
|
}
|
|
|
|
hostport, path, _ := strings.Cut(hostport, "/")
|
|
host, port, err := net.SplitHostPort(hostport)
|
|
if err != nil {
|
|
host, port = "127.0.0.1", defaultPort
|
|
if ip := net.ParseIP(strings.Trim(hostport, "[]")); ip != nil {
|
|
host = ip.String()
|
|
} else if hostport != "" {
|
|
host = hostport
|
|
}
|
|
}
|
|
|
|
if n, err := strconv.ParseInt(port, 10, 32); err != nil || n > 65535 || n < 0 {
|
|
slog.Warn("invalid port, using default", "port", port, "default", defaultPort)
|
|
port = defaultPort
|
|
}
|
|
|
|
return &url.URL{
|
|
Scheme: scheme,
|
|
Host: net.JoinHostPort(host, port),
|
|
Path: path,
|
|
}
|
|
}
|
|
|
|
// Origins returns a list of allowed origins. Origins can be configured via the OLLAMA_ORIGINS environment variable.
|
|
func Origins() (origins []string) {
|
|
if s := Var("OLLAMA_ORIGINS"); s != "" {
|
|
origins = strings.Split(s, ",")
|
|
}
|
|
|
|
for _, origin := range []string{"localhost", "127.0.0.1", "0.0.0.0"} {
|
|
origins = append(origins,
|
|
fmt.Sprintf("http://%s", origin),
|
|
fmt.Sprintf("https://%s", origin),
|
|
fmt.Sprintf("http://%s", net.JoinHostPort(origin, "*")),
|
|
fmt.Sprintf("https://%s", net.JoinHostPort(origin, "*")),
|
|
)
|
|
}
|
|
|
|
origins = append(origins,
|
|
"app://*",
|
|
"file://*",
|
|
"tauri://*",
|
|
)
|
|
|
|
return origins
|
|
}
|
|
|
|
// Models returns the path to the models directory. Models directory can be configured via the OLLAMA_MODELS environment variable.
|
|
// Default is $HOME/.ollama/models
|
|
func Models() string {
|
|
if s := Var("OLLAMA_MODELS"); s != "" {
|
|
return s
|
|
}
|
|
|
|
home, err := os.UserHomeDir()
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
return filepath.Join(home, ".ollama", "models")
|
|
}
|
|
|
|
// KeepAlive returns the duration that models stay loaded in memory. KeepAlive can be configured via the OLLAMA_KEEP_ALIVE environment variable.
|
|
// Negative values are treated as infinite. Zero is treated as no keep alive.
|
|
// Default is 5 minutes.
|
|
func KeepAlive() (keepAlive time.Duration) {
|
|
keepAlive = 5 * time.Minute
|
|
if s := Var("OLLAMA_KEEP_ALIVE"); s != "" {
|
|
if d, err := time.ParseDuration(s); err == nil {
|
|
keepAlive = d
|
|
} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
|
|
keepAlive = time.Duration(n) * time.Second
|
|
}
|
|
}
|
|
|
|
if keepAlive < 0 {
|
|
return time.Duration(math.MaxInt64)
|
|
}
|
|
|
|
return keepAlive
|
|
}
|
|
|
|
// LoadTimeout returns the duration for stall detection during model loads. LoadTimeout can be configured via the OLLAMA_LOAD_TIMEOUT environment variable.
|
|
// Zero or Negative values are treated as infinite.
|
|
// Default is 5 minutes.
|
|
func LoadTimeout() (loadTimeout time.Duration) {
|
|
loadTimeout = 5 * time.Minute
|
|
if s := Var("OLLAMA_LOAD_TIMEOUT"); s != "" {
|
|
if d, err := time.ParseDuration(s); err == nil {
|
|
loadTimeout = d
|
|
} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
|
|
loadTimeout = time.Duration(n) * time.Second
|
|
}
|
|
}
|
|
|
|
if loadTimeout <= 0 {
|
|
return time.Duration(math.MaxInt64)
|
|
}
|
|
|
|
return loadTimeout
|
|
}
|
|
|
|
func Bool(k string) func() bool {
|
|
return func() bool {
|
|
if s := Var(k); s != "" {
|
|
b, err := strconv.ParseBool(s)
|
|
if err != nil {
|
|
return true
|
|
}
|
|
|
|
return b
|
|
}
|
|
|
|
return false
|
|
}
|
|
}
|
|
|
|
var (
|
|
// Debug enabled additional debug information.
|
|
Debug = Bool("OLLAMA_DEBUG")
|
|
// FlashAttention enables the experimental flash attention feature.
|
|
FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
|
|
// NoHistory disables readline history.
|
|
NoHistory = Bool("OLLAMA_NOHISTORY")
|
|
// NoPrune disables pruning of model blobs on startup.
|
|
NoPrune = Bool("OLLAMA_NOPRUNE")
|
|
// SchedSpread allows scheduling models across all GPUs.
|
|
SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
|
|
// IntelGPU enables experimental Intel GPU detection.
|
|
IntelGPU = Bool("OLLAMA_INTEL_GPU")
|
|
)
|
|
|
|
func String(s string) func() string {
|
|
return func() string {
|
|
return Var(s)
|
|
}
|
|
}
|
|
|
|
var (
|
|
LLMLibrary = String("OLLAMA_LLM_LIBRARY")
|
|
TmpDir = String("OLLAMA_TMPDIR")
|
|
|
|
CudaVisibleDevices = String("CUDA_VISIBLE_DEVICES")
|
|
HipVisibleDevices = String("HIP_VISIBLE_DEVICES")
|
|
RocrVisibleDevices = String("ROCR_VISIBLE_DEVICES")
|
|
GpuDeviceOrdinal = String("GPU_DEVICE_ORDINAL")
|
|
HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
|
|
)
|
|
|
|
func Uint(key string, defaultValue uint) func() uint {
|
|
return func() uint {
|
|
if s := Var(key); s != "" {
|
|
if n, err := strconv.ParseUint(s, 10, 64); err != nil {
|
|
slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
|
|
} else {
|
|
return uint(n)
|
|
}
|
|
}
|
|
|
|
return defaultValue
|
|
}
|
|
}
|
|
|
|
var (
|
|
// NumParallel sets the number of parallel model requests. NumParallel can be configured via the OLLAMA_NUM_PARALLEL environment variable.
|
|
NumParallel = Uint("OLLAMA_NUM_PARALLEL", 0)
|
|
// MaxRunners sets the maximum number of loaded models. MaxRunners can be configured via the OLLAMA_MAX_LOADED_MODELS environment variable.
|
|
MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
|
|
// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
|
|
MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512)
|
|
// MaxVRAM sets a maximum VRAM override in bytes. MaxVRAM can be configured via the OLLAMA_MAX_VRAM environment variable.
|
|
MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
|
|
)
|
|
|
|
func Uint64(key string, defaultValue uint64) func() uint64 {
|
|
return func() uint64 {
|
|
if s := Var(key); s != "" {
|
|
if n, err := strconv.ParseUint(s, 10, 64); err != nil {
|
|
slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
|
|
} else {
|
|
return n
|
|
}
|
|
}
|
|
|
|
return defaultValue
|
|
}
|
|
}
|
|
|
|
// Set aside VRAM per GPU
|
|
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
|
|
|
|
type EnvVar struct {
|
|
Name string
|
|
Value any
|
|
Description string
|
|
}
|
|
|
|
func AsMap() map[string]EnvVar {
|
|
ret := map[string]EnvVar{
|
|
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
|
|
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
|
|
"OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
|
|
"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
|
|
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
|
|
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
|
|
"OLLAMA_LOAD_TIMEOUT": {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"},
|
|
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
|
|
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
|
|
"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},
|
|
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
|
|
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
|
|
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
|
|
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
|
|
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
|
|
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
|
|
|
|
// Informational
|
|
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
|
|
"HTTPS_PROXY": {"HTTPS_PROXY", String("HTTPS_PROXY")(), "HTTPS proxy"},
|
|
"NO_PROXY": {"NO_PROXY", String("NO_PROXY")(), "No proxy"},
|
|
}
|
|
|
|
if runtime.GOOS != "windows" {
|
|
// Windows environment variables are case-insensitive so there's no need to duplicate them
|
|
ret["http_proxy"] = EnvVar{"http_proxy", String("http_proxy")(), "HTTP proxy"}
|
|
ret["https_proxy"] = EnvVar{"https_proxy", String("https_proxy")(), "HTTPS proxy"}
|
|
ret["no_proxy"] = EnvVar{"no_proxy", String("no_proxy")(), "No proxy"}
|
|
}
|
|
|
|
if runtime.GOOS != "darwin" {
|
|
ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
|
|
ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"}
|
|
ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible"}
|
|
ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible"}
|
|
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
|
|
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
|
|
}
|
|
|
|
return ret
|
|
}
|
|
|
|
func Values() map[string]string {
|
|
vals := make(map[string]string)
|
|
for k, v := range AsMap() {
|
|
vals[k] = fmt.Sprintf("%v", v.Value)
|
|
}
|
|
return vals
|
|
}
|
|
|
|
// Var returns an environment variable stripped of leading and trailing quotes or spaces
|
|
func Var(key string) string {
|
|
return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
|
|
}
|
|
|
|
// On windows, we keep the binary at the top directory, but
|
|
// other platforms use a "bin" directory, so this returns ".."
|
|
func LibRelativeToExe() string {
|
|
if runtime.GOOS == "windows" {
|
|
return "."
|
|
}
|
|
return ".."
|
|
}
|