Michael Yang 2024-07-03 17:22:13 -07:00
parent 66fe77f084
commit 55cd3ddcca
8 changed files with 82 additions and 83 deletions


@@ -157,7 +157,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		return err
 	}

-	if envconfig.NoHistory {
+	if envconfig.NoHistory() {
 		scanner.HistoryDisable()
 	}
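
The change above is the pattern repeated throughout this commit: boolean settings move from package-level variables populated by LoadConfig to accessor functions, so call sites gain parentheses and always observe the current environment. A minimal stand-in sketch of that call-site change (not the ollama code itself; this simplified NoHistory ignores the ParseBool semantics of the real envconfig.Bool shown later in the diff):

```go
package main

import (
	"fmt"
	"os"
)

// NoHistory is a hypothetical stand-in for the new accessor style: a
// func() bool consulted at the call site instead of a cached package bool.
var NoHistory = func() bool { return os.Getenv("OLLAMA_NOHISTORY") != "" }

func main() {
	// Before this commit, call sites read a package-level bool that
	// LoadConfig populated once at startup: `if envconfig.NoHistory {`.
	// Now the accessor is invoked, so the current environment always wins.
	os.Setenv("OLLAMA_NOHISTORY", "1")
	if NoHistory() {
		fmt.Println("readline history disabled")
	}
}
```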


@@ -17,21 +17,6 @@ import (

 var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")

-// Debug returns true if the OLLAMA_DEBUG environment variable is set to a truthy value.
-func Debug() bool {
-	if s := clean("OLLAMA_DEBUG"); s != "" {
-		b, err := strconv.ParseBool(s)
-		if err != nil {
-			// non-empty value is truthy
-			return true
-		}
-		return b
-	}
-	return false
-}
-
 // Host returns the scheme and host. Host can be configured via the OLLAMA_HOST environment variable.
 // Default is scheme "http" and host "127.0.0.1:11434"
 func Host() *url.URL {
@@ -77,7 +62,7 @@ func Host() *url.URL {

 // Origins returns a list of allowed origins. Origins can be configured via the OLLAMA_ORIGINS environment variable.
 func Origins() (origins []string) {
-	if s := clean("OLLAMA_ORIGINS"); s != "" {
+	if s := getenv("OLLAMA_ORIGINS"); s != "" {
 		origins = strings.Split(s, ",")
 	}
@@ -114,9 +99,37 @@ func Models() string {
 	return filepath.Join(home, ".ollama", "models")
 }

+func Bool(k string) func() bool {
+	return func() bool {
+		if s := getenv(k); s != "" {
+			b, err := strconv.ParseBool(s)
+			if err != nil {
+				return true
+			}
+			return b
+		}
+		return false
+	}
+}
+
+var (
+	// Debug enables additional debug information.
+	Debug = Bool("OLLAMA_DEBUG")
+	// FlashAttention enables the experimental flash attention feature.
+	FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
+	// NoHistory disables readline history.
+	NoHistory = Bool("OLLAMA_NOHISTORY")
+	// NoPrune disables pruning of model blobs on startup.
+	NoPrune = Bool("OLLAMA_NOPRUNE")
+	// SchedSpread allows scheduling models across all GPUs.
+	SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
+	// IntelGPU enables experimental Intel GPU detection.
+	IntelGPU = Bool("OLLAMA_INTEL_GPU")
+)
+
 var (
-	// Experimental flash attention
-	FlashAttention bool
-
 	// Set via OLLAMA_KEEP_ALIVE in the environment
 	KeepAlive time.Duration
 	// Set via OLLAMA_LLM_LIBRARY in the environment
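
The new Bool helper returns a closure that re-reads its variable on every call: unset or empty is false, values strconv.ParseBool understands are parsed, and any other non-empty value counts as true. A small demonstration of those semantics, assuming this commit's envconfig package at the import path github.com/ollama/ollama/envconfig:

```go
package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/envconfig"
)

func main() {
	// Unset: accessors default to false.
	fmt.Println(envconfig.SchedSpread()) // false

	// Parseable values follow strconv.ParseBool.
	os.Setenv("OLLAMA_SCHED_SPREAD", "0")
	fmt.Println(envconfig.SchedSpread()) // false

	// Non-empty values that ParseBool rejects are treated as true.
	os.Setenv("OLLAMA_SCHED_SPREAD", "yes")
	fmt.Println(envconfig.SchedSpread()) // true

	// The closure re-reads the environment on each call, so no
	// LoadConfig round-trip is needed to pick up a change.
	os.Setenv("OLLAMA_SCHED_SPREAD", "false")
	fmt.Println(envconfig.SchedSpread()) // false
}
```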
@@ -125,22 +138,12 @@ var (
 	MaxRunners int
 	// Set via OLLAMA_MAX_QUEUE in the environment
 	MaxQueuedRequests int
-	// Set via OLLAMA_MODELS in the environment
-	ModelsDir string
-	// Set via OLLAMA_NOHISTORY in the environment
-	NoHistory bool
-	// Set via OLLAMA_NOPRUNE in the environment
-	NoPrune bool
 	// Set via OLLAMA_NUM_PARALLEL in the environment
 	NumParallel int
 	// Set via OLLAMA_RUNNERS_DIR in the environment
 	RunnersDir string
-	// Set via OLLAMA_SCHED_SPREAD in the environment
-	SchedSpread bool
 	// Set via OLLAMA_TMPDIR in the environment
 	TmpDir string
-	// Set via OLLAMA_INTEL_GPU in the environment
-	IntelGpu bool
 	// Set via CUDA_VISIBLE_DEVICES in the environment
 	CudaVisibleDevices string
@@ -163,19 +166,19 @@ type EnvVar struct {
 func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
 		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
-		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"},
+		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
 		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
 		"OLLAMA_MODELS":            {"OLLAMA_MODELS", Models(), "The path to the models directory"},
-		"OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
-		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
+		"OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
+		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
 		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
-		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
+		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
 	}
 	if runtime.GOOS != "darwin" {
@@ -184,7 +187,7 @@ func AsMap() map[string]EnvVar {
 		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"}
 		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"}
 		ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"}
-		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"}
+		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
 	}
 	return ret
 }
@@ -197,8 +200,8 @@ func Values() map[string]string {
 	return vals
 }

-// Clean quotes and spaces from the value
-func clean(key string) string {
+// getenv returns an environment variable stripped of leading and trailing quotes or spaces
+func getenv(key string) string {
 	return strings.Trim(os.Getenv(key), "\"' ")
 }
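
The renamed getenv helper keeps the old clean behavior: it trims surrounding quotes and spaces, which matters for values set in shell exports or systemd unit files. A quick sketch of the equivalent trimming using only the standard library:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

func main() {
	// Values are often quoted when exported; strings.Trim removes any
	// mix of ", ' and space characters from both ends of the value.
	os.Setenv("OLLAMA_TMPDIR", `" /tmp/ollama "`)
	fmt.Println(strings.Trim(os.Getenv("OLLAMA_TMPDIR"), "\"' ")) // /tmp/ollama
}
```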
@@ -213,14 +216,7 @@ func init() {
 }

 func LoadConfig() {
-	if fa := clean("OLLAMA_FLASH_ATTENTION"); fa != "" {
-		d, err := strconv.ParseBool(fa)
-		if err == nil {
-			FlashAttention = d
-		}
-	}
-
-	RunnersDir = clean("OLLAMA_RUNNERS_DIR")
+	RunnersDir = getenv("OLLAMA_RUNNERS_DIR")
 	if runtime.GOOS == "windows" && RunnersDir == "" {
 		// On Windows we do not carry the payloads inside the main executable
 		appExe, err := os.Executable()
@@ -256,11 +252,11 @@ func LoadConfig() {
 		}
 	}

-	TmpDir = clean("OLLAMA_TMPDIR")
+	TmpDir = getenv("OLLAMA_TMPDIR")

-	LLMLibrary = clean("OLLAMA_LLM_LIBRARY")
+	LLMLibrary = getenv("OLLAMA_LLM_LIBRARY")

-	if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
+	if onp := getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
 		val, err := strconv.Atoi(onp)
 		if err != nil {
 			slog.Error("invalid setting, ignoring", "OLLAMA_NUM_PARALLEL", onp, "error", err)
@@ -269,24 +265,7 @@ func LoadConfig() {
 		}
 	}

-	if nohistory := clean("OLLAMA_NOHISTORY"); nohistory != "" {
-		NoHistory = true
-	}
-
-	if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
-		s, err := strconv.ParseBool(spread)
-		if err == nil {
-			SchedSpread = s
-		} else {
-			SchedSpread = true
-		}
-	}
-
-	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
-		NoPrune = true
-	}
-
-	maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
+	maxRunners := getenv("OLLAMA_MAX_LOADED_MODELS")
 	if maxRunners != "" {
 		m, err := strconv.Atoi(maxRunners)
 		if err != nil {
@@ -305,20 +284,16 @@ func LoadConfig() {
 		}
 	}

-	ka := clean("OLLAMA_KEEP_ALIVE")
+	ka := getenv("OLLAMA_KEEP_ALIVE")
 	if ka != "" {
 		loadKeepAlive(ka)
 	}

-	if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil {
-		IntelGpu = set
-	}
-
-	CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES")
-	HipVisibleDevices = clean("HIP_VISIBLE_DEVICES")
-	RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES")
-	GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL")
-	HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION")
+	CudaVisibleDevices = getenv("CUDA_VISIBLE_DEVICES")
+	HipVisibleDevices = getenv("HIP_VISIBLE_DEVICES")
+	RocrVisibleDevices = getenv("ROCR_VISIBLE_DEVICES")
+	GpuDeviceOrdinal = getenv("GPU_DEVICE_ORDINAL")
+	HsaOverrideGfxVersion = getenv("HSA_OVERRIDE_GFX_VERSION")
 }

 func loadKeepAlive(ka string) {


@@ -20,8 +20,8 @@ func TestSmoke(t *testing.T) {
 	require.True(t, Debug())

 	t.Setenv("OLLAMA_FLASH_ATTENTION", "1")
-	LoadConfig()
-	require.True(t, FlashAttention)
+	require.True(t, FlashAttention())

 	t.Setenv("OLLAMA_KEEP_ALIVE", "")
 	LoadConfig()
 	require.Equal(t, 5*time.Minute, KeepAlive)
@@ -162,3 +162,27 @@ func TestOrigins(t *testing.T) {
 		})
 	}
 }
+
+func TestBool(t *testing.T) {
+	cases := map[string]struct {
+		value  string
+		expect bool
+	}{
+		"empty":     {"", false},
+		"true":      {"true", true},
+		"false":     {"false", false},
+		"1":         {"1", true},
+		"0":         {"0", false},
+		"random":    {"random", true},
+		"something": {"something", true},
+	}
+
+	for name, tt := range cases {
+		t.Run(name, func(t *testing.T) {
+			t.Setenv("OLLAMA_BOOL", tt.value)
+			if b := Bool("OLLAMA_BOOL"); b() != tt.expect {
+				t.Errorf("%s: expected %t, got %t", name, tt.expect, b())
+			}
+		})
+	}
+}
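
Because Bool returns a closure rather than a snapshot, a single accessor also tracks t.Setenv across checks with no LoadConfig call in between. A small illustrative test in the same style (not part of the commit; TestBoolReload is a hypothetical name):

```go
package envconfig

import "testing"

// TestBoolReload is illustrative only: the accessor built by Bool re-reads
// the environment on every call, so t.Setenv changes are visible without
// reloading any configuration.
func TestBoolReload(t *testing.T) {
	b := Bool("OLLAMA_BOOL") // build the accessor once

	t.Setenv("OLLAMA_BOOL", "1")
	if !b() {
		t.Error("expected true after OLLAMA_BOOL=1")
	}

	t.Setenv("OLLAMA_BOOL", "0")
	if b() { // same closure, new environment
		t.Error("expected false after OLLAMA_BOOL=0")
	}
}
```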


@@ -302,7 +302,7 @@ func GetGPUInfo() GpuInfoList {
 		}

 		// Intel
-		if envconfig.IntelGpu {
+		if envconfig.IntelGPU() {
 			oHandles = initOneAPIHandles()
 			// On windows we bundle the oneapi library one level above the runner dir
 			depPath = ""


@@ -221,7 +221,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--memory-f32")
 	}

-	flashAttnEnabled := envconfig.FlashAttention
+	flashAttnEnabled := envconfig.FlashAttention()
 	for _, g := range gpus {
 		// only cuda (compute capability 7+) and metal support flash attention


@@ -644,7 +644,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 		return err
 	}

-	if !envconfig.NoPrune && old != nil {
+	if !envconfig.NoPrune() && old != nil {
 		if err := old.RemoveLayers(); err != nil {
 			return err
 		}
@@ -883,7 +883,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})

-	if !envconfig.NoPrune {
+	if !envconfig.NoPrune() {
 		manifest, _, err = GetManifest(mp)
 		if err != nil && !errors.Is(err, os.ErrNotExist) {
 			return err


@@ -1121,7 +1121,7 @@ func Serve(ln net.Listener) error {
 		return err
 	}

-	if !envconfig.NoPrune {
+	if !envconfig.NoPrune() {
 		// clean up unused layers and manifests
 		if err := PruneLayers(); err != nil {
 			return err


@@ -695,7 +695,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 		// First attempt to fit the model into a single GPU
 		for _, p := range numParallelToTry {
 			req.opts.NumCtx = req.origNumCtx * p
-			if !envconfig.SchedSpread {
+			if !envconfig.SchedSpread() {
 				for _, g := range sgl {
 					if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))