From 55cd3ddccac14d48f5f129ec35b3a109be215d01 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Wed, 3 Jul 2024 17:22:13 -0700
Subject: [PATCH] bool

---
 cmd/interactive.go       |   2 +-
 envconfig/config.go      | 123 ++++++++++++++++-----------------------
 envconfig/config_test.go |  28 ++++++++-
 gpu/gpu.go               |   2 +-
 llm/server.go            |   2 +-
 server/images.go         |   4 +-
 server/routes.go         |   2 +-
 server/sched.go          |   2 +-
 8 files changed, 82 insertions(+), 83 deletions(-)

diff --git a/cmd/interactive.go b/cmd/interactive.go
index adbc3e9f..9fb66851 100644
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -157,7 +157,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		return err
 	}
 
-	if envconfig.NoHistory {
+	if envconfig.NoHistory() {
 		scanner.HistoryDisable()
 	}
 
diff --git a/envconfig/config.go b/envconfig/config.go
index 286f51d4..ea78585b 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -17,21 +17,6 @@ import (
 
 var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")
 
-// Debug returns true if the OLLAMA_DEBUG environment variable is set to a truthy value.
-func Debug() bool {
-	if s := clean("OLLAMA_DEBUG"); s != "" {
-		b, err := strconv.ParseBool(s)
-		if err != nil {
-			// non-empty value is truthy
-			return true
-		}
-
-		return b
-	}
-
-	return false
-}
-
 // Host returns the scheme and host. Host can be configured via the OLLAMA_HOST environment variable.
 // Default is scheme "http" and host "127.0.0.1:11434"
 func Host() *url.URL {
@@ -77,7 +62,7 @@ func Host() *url.URL {
 
 // Origins returns a list of allowed origins. Origins can be configured via the OLLAMA_ORIGINS environment variable.
 func Origins() (origins []string) {
-	if s := clean("OLLAMA_ORIGINS"); s != "" {
+	if s := getenv("OLLAMA_ORIGINS"); s != "" {
 		origins = strings.Split(s, ",")
 	}
 
@@ -114,9 +99,37 @@ func Models() string {
 	return filepath.Join(home, ".ollama", "models")
 }
 
+func Bool(k string) func() bool {
+	return func() bool {
+		if s := getenv(k); s != "" {
+			b, err := strconv.ParseBool(s)
+			if err != nil {
+				return true
+			}
+
+			return b
+		}
+
+		return false
+	}
+}
+
+var (
+	// Debug enabled additional debug information.
+	Debug = Bool("OLLAMA_DEBUG")
+	// FlashAttention enables the experimental flash attention feature.
+	FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
+	// NoHistory disables readline history.
+	NoHistory = Bool("OLLAMA_NOHISTORY")
+	// NoPrune disables pruning of model blobs on startup.
+	NoPrune = Bool("OLLAMA_NOPRUNE")
+	// SchedSpread allows scheduling models across all GPUs.
+	SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
+	// IntelGPU enables experimental Intel GPU detection.
+	IntelGPU = Bool("OLLAMA_INTEL_GPU")
+)
+
 var (
-	// Experimental flash attention
-	FlashAttention bool
 	// Set via OLLAMA_KEEP_ALIVE in the environment
 	KeepAlive time.Duration
 	// Set via OLLAMA_LLM_LIBRARY in the environment
@@ -125,22 +138,12 @@ var (
 	MaxRunners int
 	// Set via OLLAMA_MAX_QUEUE in the environment
 	MaxQueuedRequests int
-	// Set via OLLAMA_MODELS in the environment
-	ModelsDir string
-	// Set via OLLAMA_NOHISTORY in the environment
-	NoHistory bool
-	// Set via OLLAMA_NOPRUNE in the environment
-	NoPrune bool
 	// Set via OLLAMA_NUM_PARALLEL in the environment
 	NumParallel int
 	// Set via OLLAMA_RUNNERS_DIR in the environment
 	RunnersDir string
-	// Set via OLLAMA_SCHED_SPREAD in the environment
-	SchedSpread bool
 	// Set via OLLAMA_TMPDIR in the environment
 	TmpDir string
-	// Set via OLLAMA_INTEL_GPU in the environment
-	IntelGpu bool
 
 	// Set via CUDA_VISIBLE_DEVICES in the environment
 	CudaVisibleDevices string
@@ -163,19 +166,19 @@ type EnvVar struct {
 func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
 		"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
-		"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"},
+		"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
 		"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
 		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
 		"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},
-		"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
-		"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
+		"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
+		"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
 		"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
 		"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
-		"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
+		"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
 	}
 	if runtime.GOOS != "darwin" {
@@ -184,7 +187,7 @@ func AsMap() map[string]EnvVar {
 		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"}
 		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"}
 		ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"}
-		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"}
+		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
 	}
 	return ret
 }
@@ -197,8 +200,8 @@ func Values() map[string]string {
 	return vals
 }
 
-// Clean quotes and spaces from the value
-func clean(key string) string {
+// getenv returns an environment variable stripped of leading and trailing quotes or spaces
+func getenv(key string) string {
 	return strings.Trim(os.Getenv(key), "\"' ")
 }
 
@@ -213,14 +216,7 @@ func init() {
 }
 
 func LoadConfig() {
-	if fa := clean("OLLAMA_FLASH_ATTENTION"); fa != "" {
-		d, err := strconv.ParseBool(fa)
-		if err == nil {
-			FlashAttention = d
-		}
-	}
-
-	RunnersDir = clean("OLLAMA_RUNNERS_DIR")
+	RunnersDir = getenv("OLLAMA_RUNNERS_DIR")
 	if runtime.GOOS == "windows" && RunnersDir == "" {
 		// On Windows we do not carry the payloads inside the main executable
 		appExe, err := os.Executable()
@@ -256,11 +252,11 @@ func LoadConfig() {
 		}
 	}
 
-	TmpDir = clean("OLLAMA_TMPDIR")
+	TmpDir = getenv("OLLAMA_TMPDIR")
 
-	LLMLibrary = clean("OLLAMA_LLM_LIBRARY")
+	LLMLibrary = getenv("OLLAMA_LLM_LIBRARY")
 
-	if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
+	if onp := getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
 		val, err := strconv.Atoi(onp)
 		if err != nil {
 			slog.Error("invalid setting, ignoring", "OLLAMA_NUM_PARALLEL", onp, "error", err)
@@ -269,24 +265,7 @@ func LoadConfig() {
 		}
 	}
 
-	if nohistory := clean("OLLAMA_NOHISTORY"); nohistory != "" {
-		NoHistory = true
-	}
-
-	if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
-		s, err := strconv.ParseBool(spread)
-		if err == nil {
-			SchedSpread = s
-		} else {
-			SchedSpread = true
-		}
-	}
-
-	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
-		NoPrune = true
-	}
-
-	maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
+	maxRunners := getenv("OLLAMA_MAX_LOADED_MODELS")
 	if maxRunners != "" {
 		m, err := strconv.Atoi(maxRunners)
 		if err != nil {
@@ -305,20 +284,16 @@ func LoadConfig() {
 		}
 	}
 
-	ka := clean("OLLAMA_KEEP_ALIVE")
+	ka := getenv("OLLAMA_KEEP_ALIVE")
 	if ka != "" {
 		loadKeepAlive(ka)
 	}
 
-	if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil {
-		IntelGpu = set
-	}
-
-	CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES")
-	HipVisibleDevices = clean("HIP_VISIBLE_DEVICES")
-	RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES")
-	GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL")
-	HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION")
+	CudaVisibleDevices = getenv("CUDA_VISIBLE_DEVICES")
+	HipVisibleDevices = getenv("HIP_VISIBLE_DEVICES")
+	RocrVisibleDevices = getenv("ROCR_VISIBLE_DEVICES")
+	GpuDeviceOrdinal = getenv("GPU_DEVICE_ORDINAL")
+	HsaOverrideGfxVersion = getenv("HSA_OVERRIDE_GFX_VERSION")
 }
 
 func loadKeepAlive(ka string) {
diff --git a/envconfig/config_test.go b/envconfig/config_test.go
index dc65ef70..b364b009 100644
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@@ -20,8 +20,8 @@ func TestSmoke(t *testing.T) {
 	require.True(t, Debug())
 
 	t.Setenv("OLLAMA_FLASH_ATTENTION", "1")
-	LoadConfig()
-	require.True(t, FlashAttention)
+	require.True(t, FlashAttention())
+
 	t.Setenv("OLLAMA_KEEP_ALIVE", "")
 	LoadConfig()
 	require.Equal(t, 5*time.Minute, KeepAlive)
@@ -162,3 +162,27 @@ func TestOrigins(t *testing.T) {
 		})
 	}
 }
+
+func TestBool(t *testing.T) {
+	cases := map[string]struct {
+		value string
+		expect bool
+	}{
+		"empty": {"", false},
+		"true": {"true", true},
+		"false": {"false", false},
+		"1": {"1", true},
+		"0": {"0", false},
+		"random": {"random", true},
+		"something": {"something", true},
+	}
+
+	for name, tt := range cases {
+		t.Run(name, func(t *testing.T) {
+			t.Setenv("OLLAMA_BOOL", tt.value)
+			if b := Bool("OLLAMA_BOOL"); b() != tt.expect {
+				t.Errorf("%s: expected %t, got %t", name, tt.expect, b())
+			}
+		})
+	}
+}
diff --git a/gpu/gpu.go b/gpu/gpu.go
index 1815668f..c3059542 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -302,7 +302,7 @@ func GetGPUInfo() GpuInfoList {
 		}
 
 		// Intel
-		if envconfig.IntelGpu {
+		if envconfig.IntelGPU() {
 			oHandles = initOneAPIHandles()
 			// On windows we bundle the oneapi library one level above the runner dir
 			depPath = ""
diff --git a/llm/server.go b/llm/server.go
index eb966650..84d9e93a 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -221,7 +221,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--memory-f32")
 	}
 
-	flashAttnEnabled := envconfig.FlashAttention
+	flashAttnEnabled := envconfig.FlashAttention()
 
 	for _, g := range gpus {
 		// only cuda (compute capability 7+) and metal support flash attention
diff --git a/server/images.go b/server/images.go
index 574dec19..3eb3b3fa 100644
--- a/server/images.go
+++ b/server/images.go
@@ -644,7 +644,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 		return err
 	}
 
-	if !envconfig.NoPrune && old != nil {
+	if !envconfig.NoPrune() && old != nil {
 		if err := old.RemoveLayers(); err != nil {
 			return err
 		}
@@ -883,7 +883,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})
 
-	if !envconfig.NoPrune {
+	if !envconfig.NoPrune() {
 		manifest, _, err = GetManifest(mp)
 		if err != nil && !errors.Is(err, os.ErrNotExist) {
 			return err
diff --git a/server/routes.go b/server/routes.go
index 07898d9b..41a73cb4 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -1121,7 +1121,7 @@ func Serve(ln net.Listener) error {
 		return err
 	}
 
-	if !envconfig.NoPrune {
+	if !envconfig.NoPrune() {
 		// clean up unused layers and manifests
 		if err := PruneLayers(); err != nil {
 			return err
diff --git a/server/sched.go b/server/sched.go
index 2daed3ab..e1e986a5 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -695,7 +695,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 	// First attempt to fit the model into a single GPU
 	for _, p := range numParallelToTry {
 		req.opts.NumCtx = req.origNumCtx * p
-		if !envconfig.SchedSpread {
+		if !envconfig.SchedSpread() {
 			for _, g := range sgl {
 				if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 					slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))