d7c94e0ca6
* Better support for AMD multi-GPU

  This resolves a number of problems related to AMD multi-GPU setups on Linux.

  The numeric IDs used by ROCm are not the same as the numeric IDs exposed in sysfs, although the ordering is consistent. We have to count up from the first valid gfx (major/minor/patch with non-zero values) we find, starting at zero.

  There are three different env vars for selecting GPUs, and only ROCR_VISIBLE_DEVICES supports UUID-based identification, so we should favor that one and try to use UUIDs if detected, to avoid potential ordering bugs with numeric IDs.

* ROCR_VISIBLE_DEVICES only works on Linux

  Use the numeric-ID-only HIP_VISIBLE_DEVICES on Windows instead.
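A minimal sketch of the ID-counting idea described above, assuming the standard KFD sysfs topology layout on Linux; the path and the gfx_target_version property name are illustrative assumptions based on that interface, not the commit's actual detection code:

	package main

	import (
		"fmt"
		"os"
		"strconv"
		"strings"
	)

	func main() {
		id := 0 // ROCm-style numeric IDs count only valid GPUs, starting at zero
		for node := 0; ; node++ {
			props, err := os.ReadFile(fmt.Sprintf(
				"/sys/class/kfd/kfd/topology/nodes/%d/properties", node))
			if err != nil {
				break // no more topology nodes
			}

			ver := 0
			for _, line := range strings.Split(string(props), "\n") {
				// gfx_target_version packs major/minor/patch into one integer;
				// zero means this node is a CPU, not a GPU
				if rest, ok := strings.CutPrefix(line, "gfx_target_version"); ok {
					ver, _ = strconv.Atoi(strings.TrimSpace(rest))
				}
			}

			if ver == 0 {
				continue // skip non-GPU nodes so the numbering matches ROCm
			}

			fmt.Printf("GPU %d: gfx_target_version %d (sysfs node %d)\n", id, ver, node)
			id++
		}
	}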
298 lines
10 KiB
Go
package envconfig

import (
	"fmt"
	"log/slog"
	"math"
	"net"
	"net/url"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
	"time"
)

// Host returns the scheme and host. Host can be configured via the OLLAMA_HOST environment variable.
// Default is scheme "http" and host "127.0.0.1:11434"
func Host() *url.URL {
	defaultPort := "11434"

	s := strings.TrimSpace(Var("OLLAMA_HOST"))
	scheme, hostport, ok := strings.Cut(s, "://")
	switch {
	case !ok:
		scheme, hostport = "http", s
	case scheme == "http":
		defaultPort = "80"
	case scheme == "https":
		defaultPort = "443"
	}

	hostport, path, _ := strings.Cut(hostport, "/")
	host, port, err := net.SplitHostPort(hostport)
	if err != nil {
		host, port = "127.0.0.1", defaultPort
		if ip := net.ParseIP(strings.Trim(hostport, "[]")); ip != nil {
			host = ip.String()
		} else if hostport != "" {
			host = hostport
		}
	}

	if n, err := strconv.ParseInt(port, 10, 32); err != nil || n > 65535 || n < 0 {
		slog.Warn("invalid port, using default", "port", port, "default", defaultPort)
		port = defaultPort
	}

	return &url.URL{
		Scheme: scheme,
		Host:   net.JoinHostPort(host, port),
		Path:   path,
	}
}

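// A few worked examples of the parsing above (illustrative, derived from the
// code rather than part of the original file):
//
//	OLLAMA_HOST=                    => http://127.0.0.1:11434
//	OLLAMA_HOST=0.0.0.0             => http://0.0.0.0:11434
//	OLLAMA_HOST=example.com:8080    => http://example.com:8080
//	OLLAMA_HOST=https://example.com => https://example.com:443
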
// Origins returns a list of allowed origins. Origins can be configured via the OLLAMA_ORIGINS environment variable.
func Origins() (origins []string) {
	if s := Var("OLLAMA_ORIGINS"); s != "" {
		origins = strings.Split(s, ",")
	}

	for _, origin := range []string{"localhost", "127.0.0.1", "0.0.0.0"} {
		origins = append(origins,
			fmt.Sprintf("http://%s", origin),
			fmt.Sprintf("https://%s", origin),
			fmt.Sprintf("http://%s", net.JoinHostPort(origin, "*")),
			fmt.Sprintf("https://%s", net.JoinHostPort(origin, "*")),
		)
	}

	origins = append(origins,
		"app://*",
		"file://*",
		"tauri://*",
		"vscode-webview://*",
	)

	return origins
}

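// With OLLAMA_ORIGINS unset, the loop above works out to (illustrative):
//
//	http://localhost, https://localhost, http://localhost:*, https://localhost:*,
//	http://127.0.0.1, https://127.0.0.1, http://127.0.0.1:*, https://127.0.0.1:*,
//	http://0.0.0.0, https://0.0.0.0, http://0.0.0.0:*, https://0.0.0.0:*,
//	app://*, file://*, tauri://*, vscode-webview://*
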
// Models returns the path to the models directory. Models directory can be configured via the OLLAMA_MODELS environment variable.
// Default is $HOME/.ollama/models
func Models() string {
	if s := Var("OLLAMA_MODELS"); s != "" {
		return s
	}

	home, err := os.UserHomeDir()
	if err != nil {
		panic(err)
	}

	return filepath.Join(home, ".ollama", "models")
}

// KeepAlive returns the duration that models stay loaded in memory. KeepAlive can be configured via the OLLAMA_KEEP_ALIVE environment variable.
// Negative values are treated as infinite. Zero is treated as no keep alive.
// Default is 5 minutes.
func KeepAlive() (keepAlive time.Duration) {
	keepAlive = 5 * time.Minute
	if s := Var("OLLAMA_KEEP_ALIVE"); s != "" {
		if d, err := time.ParseDuration(s); err == nil {
			keepAlive = d
		} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
			keepAlive = time.Duration(n) * time.Second
		}
	}

	if keepAlive < 0 {
		return time.Duration(math.MaxInt64)
	}

	return keepAlive
}

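// Worked examples for the parsing above (illustrative):
//
//	OLLAMA_KEEP_ALIVE=10m  => 10 minutes
//	OLLAMA_KEEP_ALIVE=3600 => 1 hour (bare integers are seconds)
//	OLLAMA_KEEP_ALIVE=0    => no keep alive
//	OLLAMA_KEEP_ALIVE=-1   => keep loaded indefinitely
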
// LoadTimeout returns the duration for stall detection during model loads. LoadTimeout can be configured via the OLLAMA_LOAD_TIMEOUT environment variable.
// Zero or negative values are treated as infinite.
// Default is 5 minutes.
func LoadTimeout() (loadTimeout time.Duration) {
	loadTimeout = 5 * time.Minute
	if s := Var("OLLAMA_LOAD_TIMEOUT"); s != "" {
		if d, err := time.ParseDuration(s); err == nil {
			loadTimeout = d
		} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
			loadTimeout = time.Duration(n) * time.Second
		}
	}

	if loadTimeout <= 0 {
		return time.Duration(math.MaxInt64)
	}

	return loadTimeout
}

// Bool returns a function that reports whether the environment variable k is
// set to a truthy value.
func Bool(k string) func() bool {
	return func() bool {
		if s := Var(k); s != "" {
			b, err := strconv.ParseBool(s)
			if err != nil {
				// Any non-empty value that does not parse as a bool
				// is treated as true.
				return true
			}

			return b
		}

		return false
	}
}

var (
	// Debug enables additional debug information.
	Debug = Bool("OLLAMA_DEBUG")
	// FlashAttention enables the experimental flash attention feature.
	FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
	// NoHistory disables readline history.
	NoHistory = Bool("OLLAMA_NOHISTORY")
	// NoPrune disables pruning of model blobs on startup.
	NoPrune = Bool("OLLAMA_NOPRUNE")
	// SchedSpread allows scheduling models across all GPUs.
	SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
	// IntelGPU enables experimental Intel GPU detection.
	IntelGPU = Bool("OLLAMA_INTEL_GPU")
	// MultiUserCache optimizes prompt caching for multi-user scenarios.
	MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
)

// String returns a function that reads the environment variable s as a string.
func String(s string) func() string {
	return func() string {
		return Var(s)
	}
}

var (
	LLMLibrary = String("OLLAMA_LLM_LIBRARY")
	TmpDir     = String("OLLAMA_TMPDIR")

	CudaVisibleDevices    = String("CUDA_VISIBLE_DEVICES")
	HipVisibleDevices     = String("HIP_VISIBLE_DEVICES")
	RocrVisibleDevices    = String("ROCR_VISIBLE_DEVICES")
	GpuDeviceOrdinal      = String("GPU_DEVICE_ORDINAL")
	HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
)

// Uint returns a function that reads the environment variable key as an
// unsigned integer, falling back to defaultValue if it is unset or invalid.
func Uint(key string, defaultValue uint) func() uint {
	return func() uint {
		if s := Var(key); s != "" {
			if n, err := strconv.ParseUint(s, 10, 64); err != nil {
				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
			} else {
				return uint(n)
			}
		}

		return defaultValue
	}
}

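// For example (illustrative): with OLLAMA_MAX_QUEUE=1024, MaxQueue() below
// returns 1024, while a non-numeric value such as OLLAMA_MAX_QUEUE=lots logs
// a warning and falls back to the default of 512.
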
var (
	// NumParallel sets the number of parallel model requests. NumParallel can be configured via the OLLAMA_NUM_PARALLEL environment variable.
	NumParallel = Uint("OLLAMA_NUM_PARALLEL", 0)
	// MaxRunners sets the maximum number of loaded models. MaxRunners can be configured via the OLLAMA_MAX_LOADED_MODELS environment variable.
	MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
	// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
	MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512)
	// MaxVRAM sets a maximum VRAM override in bytes. MaxVRAM can be configured via the OLLAMA_MAX_VRAM environment variable.
	MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
)

// Uint64 is like Uint but parses and returns a uint64.
func Uint64(key string, defaultValue uint64) func() uint64 {
	return func() uint64 {
		if s := Var(key); s != "" {
			if n, err := strconv.ParseUint(s, 10, 64); err != nil {
				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
			} else {
				return n
			}
		}

		return defaultValue
	}
}

// GpuOverhead sets aside VRAM per GPU, in bytes.
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)

type EnvVar struct {
	Name        string
	Value       any
	Description string
}

// AsMap returns the known environment variables with their current values and descriptions.
func AsMap() map[string]EnvVar {
	ret := map[string]EnvVar{
		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enable flash attention"},
		"OLLAMA_GPU_OVERHEAD":      {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
		"OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
		"OLLAMA_LOAD_TIMEOUT":      {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"},
		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
		"OLLAMA_MODELS":            {"OLLAMA_MODELS", Models(), "The path to the models directory"},
		"OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
		"OLLAMA_MULTIUSER_CACHE":   {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},

		// Informational
		"HTTP_PROXY":  {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
		"HTTPS_PROXY": {"HTTPS_PROXY", String("HTTPS_PROXY")(), "HTTPS proxy"},
		"NO_PROXY":    {"NO_PROXY", String("NO_PROXY")(), "No proxy"},
	}

	if runtime.GOOS != "windows" {
		// Windows environment variables are case-insensitive so there's no need to duplicate them
		ret["http_proxy"] = EnvVar{"http_proxy", String("http_proxy")(), "HTTP proxy"}
		ret["https_proxy"] = EnvVar{"https_proxy", String("https_proxy")(), "HTTPS proxy"}
		ret["no_proxy"] = EnvVar{"no_proxy", String("no_proxy")(), "No proxy"}
	}

	if runtime.GOOS != "darwin" {
		ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
		ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible by numeric ID"}
		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible by UUID or numeric ID"}
		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible by numeric ID"}
		ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
	}

	return ret
}

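// Example use (illustrative): dump the effective configuration.
//
//	for k, v := range envconfig.AsMap() {
//		fmt.Printf("%s=%v  # %s\n", k, v.Value, v.Description)
//	}
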
// Values returns the known environment variables with their values formatted as strings.
func Values() map[string]string {
	vals := make(map[string]string)
	for k, v := range AsMap() {
		vals[k] = fmt.Sprintf("%v", v.Value)
	}
	return vals
}

// Var returns an environment variable stripped of leading and trailing quotes or spaces
func Var(key string) string {
	return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
}

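// Trimming examples (illustrative):
//
//	OLLAMA_MODELS="/data/models"  => /data/models (quotes stripped)
//	OLLAMA_HOST=  0.0.0.0         => 0.0.0.0 (surrounding spaces stripped)
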
// LibRelativeToExe returns the path of the lib directory relative to the
// executable: on Windows the binary is kept at the top directory, so this
// returns "."; other platforms use a "bin" directory, so this returns "..".
func LibRelativeToExe() string {
	if runtime.GOOS == "windows" {
		return "."
	}
	return ".."
}