From 55cd3ddccac14d48f5f129ec35b3a109be215d01 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Wed, 3 Jul 2024 17:22:13 -0700
Subject: [PATCH] bool

---
 cmd/interactive.go       |   2 +-
 envconfig/config.go      | 123 ++++++++++++++++-----------------------
 envconfig/config_test.go |  28 ++++++++-
 gpu/gpu.go               |   2 +-
 llm/server.go            |   2 +-
 server/images.go         |   4 +-
 server/routes.go         |   2 +-
 server/sched.go          |   2 +-
 8 files changed, 82 insertions(+), 83 deletions(-)

diff --git a/cmd/interactive.go b/cmd/interactive.go
index adbc3e9f..9fb66851 100644
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -157,7 +157,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		return err
 	}
 
-	if envconfig.NoHistory {
+	if envconfig.NoHistory() {
 		scanner.HistoryDisable()
 	}
 
diff --git a/envconfig/config.go b/envconfig/config.go
index 286f51d4..ea78585b 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -17,21 +17,6 @@ import (
 
 var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")
 
-// Debug returns true if the OLLAMA_DEBUG environment variable is set to a truthy value.
-func Debug() bool {
-	if s := clean("OLLAMA_DEBUG"); s != "" {
-		b, err := strconv.ParseBool(s)
-		if err != nil {
-			// non-empty value is truthy
-			return true
-		}
-
-		return b
-	}
-
-	return false
-}
-
 // Host returns the scheme and host. Host can be configured via the OLLAMA_HOST environment variable.
 // Default is scheme "http" and host "127.0.0.1:11434"
 func Host() *url.URL {
@@ -77,7 +62,7 @@ func Host() *url.URL {
 
 // Origins returns a list of allowed origins. Origins can be configured via the OLLAMA_ORIGINS environment variable.
 func Origins() (origins []string) {
-	if s := clean("OLLAMA_ORIGINS"); s != "" {
+	if s := getenv("OLLAMA_ORIGINS"); s != "" {
 		origins = strings.Split(s, ",")
 	}
 
@@ -114,9 +99,37 @@ func Models() string {
 	return filepath.Join(home, ".ollama", "models")
 }
 
+func Bool(k string) func() bool {
+	return func() bool {
+		if s := getenv(k); s != "" {
+			b, err := strconv.ParseBool(s)
+			if err != nil {
+				return true
+			}
+
+			return b
+		}
+
+		return false
+	}
+}
+
+var (
+	// Debug enabled additional debug information.
+	Debug = Bool("OLLAMA_DEBUG")
+	// FlashAttention enables the experimental flash attention feature.
+	FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
+	// NoHistory disables readline history.
+	NoHistory = Bool("OLLAMA_NOHISTORY")
+	// NoPrune disables pruning of model blobs on startup.
+	NoPrune = Bool("OLLAMA_NOPRUNE")
+	// SchedSpread allows scheduling models across all GPUs.
+	SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
+	// IntelGPU enables experimental Intel GPU detection.
+	IntelGPU = Bool("OLLAMA_INTEL_GPU")
+)
+
 var (
-	// Experimental flash attention
-	FlashAttention bool
 	// Set via OLLAMA_KEEP_ALIVE in the environment
 	KeepAlive time.Duration
 	// Set via OLLAMA_LLM_LIBRARY in the environment
@@ -125,22 +138,12 @@ var (
 	MaxRunners int
 	// Set via OLLAMA_MAX_QUEUE in the environment
 	MaxQueuedRequests int
-	// Set via OLLAMA_MODELS in the environment
-	ModelsDir string
-	// Set via OLLAMA_NOHISTORY in the environment
-	NoHistory bool
-	// Set via OLLAMA_NOPRUNE in the environment
-	NoPrune bool
 	// Set via OLLAMA_NUM_PARALLEL in the environment
 	NumParallel int
 	// Set via OLLAMA_RUNNERS_DIR in the environment
 	RunnersDir string
-	// Set via OLLAMA_SCHED_SPREAD in the environment
-	SchedSpread bool
 	// Set via OLLAMA_TMPDIR in the environment
 	TmpDir string
-	// Set via OLLAMA_INTEL_GPU in the environment
-	IntelGpu bool
 
 	// Set via CUDA_VISIBLE_DEVICES in the environment
 	CudaVisibleDevices string
@@ -163,19 +166,19 @@ type EnvVar struct {
 func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
 		"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
-		"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"},
+		"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
 		"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
 		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
 		"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},
-		"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
-		"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
+		"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
+		"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
 		"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
 		"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
-		"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
+		"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
 	}
 	if runtime.GOOS != "darwin" {
@@ -184,7 +187,7 @@ func AsMap() map[string]EnvVar {
 		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"}
 		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"}
 		ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"}
-		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"}
+		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
 	}
 	return ret
 }
@@ -197,8 +200,8 @@ func Values() map[string]string {
 	return vals
 }
 
-// Clean quotes and spaces from the value
-func clean(key string) string {
+// getenv returns an environment variable stripped of leading and trailing quotes or spaces
+func getenv(key string) string {
 	return strings.Trim(os.Getenv(key), "\"' ")
 }
 
@@ -213,14 +216,7 @@ func init() {
 }
 
 func LoadConfig() {
-	if fa := clean("OLLAMA_FLASH_ATTENTION"); fa != "" {
-		d, err := strconv.ParseBool(fa)
-		if err == nil {
-			FlashAttention = d
-		}
-	}
-
-	RunnersDir = clean("OLLAMA_RUNNERS_DIR")
+	RunnersDir = getenv("OLLAMA_RUNNERS_DIR")
 	if runtime.GOOS == "windows" && RunnersDir == "" {
 		// On Windows we do not carry the payloads inside the main executable
 		appExe, err := os.Executable()
@@ -256,11 +252,11 @@ func LoadConfig() {
 		}
 	}
 
-	TmpDir = clean("OLLAMA_TMPDIR")
+	TmpDir = getenv("OLLAMA_TMPDIR")
 
-	LLMLibrary = clean("OLLAMA_LLM_LIBRARY")
+	LLMLibrary = getenv("OLLAMA_LLM_LIBRARY")
 
-	if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
+	if onp := getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
 		val, err := strconv.Atoi(onp)
 		if err != nil {
 			slog.Error("invalid setting, ignoring", "OLLAMA_NUM_PARALLEL", onp, "error", err)
@@ -269,24 +265,7 @@ func LoadConfig() {
 		}
 	}
 
-	if nohistory := clean("OLLAMA_NOHISTORY"); nohistory != "" {
-		NoHistory = true
-	}
-
-	if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
-		s, err := strconv.ParseBool(spread)
-		if err == nil {
-			SchedSpread = s
-		} else {
-			SchedSpread = true
-		}
-	}
-
-	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
-		NoPrune = true
-	}
-
-	maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
+	maxRunners := getenv("OLLAMA_MAX_LOADED_MODELS")
 	if maxRunners != "" {
 		m, err := strconv.Atoi(maxRunners)
 		if err != nil {
@@ -305,20 +284,16 @@ func LoadConfig() {
 		}
 	}
 
-	ka := clean("OLLAMA_KEEP_ALIVE")
+	ka := getenv("OLLAMA_KEEP_ALIVE")
 	if ka != "" {
 		loadKeepAlive(ka)
 	}
 
-	if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil {
-		IntelGpu = set
-	}
-
-	CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES")
-	HipVisibleDevices = clean("HIP_VISIBLE_DEVICES")
-	RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES")
-	GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL")
-	HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION")
+	CudaVisibleDevices = getenv("CUDA_VISIBLE_DEVICES")
+	HipVisibleDevices = getenv("HIP_VISIBLE_DEVICES")
+	RocrVisibleDevices = getenv("ROCR_VISIBLE_DEVICES")
+	GpuDeviceOrdinal = getenv("GPU_DEVICE_ORDINAL")
+	HsaOverrideGfxVersion = getenv("HSA_OVERRIDE_GFX_VERSION")
 }
 
 func loadKeepAlive(ka string) {
diff --git a/envconfig/config_test.go b/envconfig/config_test.go
index dc65ef70..b364b009 100644
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@@ -20,8 +20,8 @@ func TestSmoke(t *testing.T) {
 	require.True(t, Debug())
 
 	t.Setenv("OLLAMA_FLASH_ATTENTION", "1")
-	LoadConfig()
-	require.True(t, FlashAttention)
+	require.True(t, FlashAttention())
+
 	t.Setenv("OLLAMA_KEEP_ALIVE", "")
 	LoadConfig()
 	require.Equal(t, 5*time.Minute, KeepAlive)
@@ -162,3 +162,27 @@ func TestOrigins(t *testing.T) {
 		})
 	}
 }
+
+func TestBool(t *testing.T) {
+	cases := map[string]struct {
+		value string
+		expect bool
+	}{
+		"empty": {"", false},
+		"true": {"true", true},
+		"false": {"false", false},
+		"1": {"1", true},
+		"0": {"0", false},
+		"random": {"random", true},
+		"something": {"something", true},
+	}
+
+	for name, tt := range cases {
+		t.Run(name, func(t *testing.T) {
+			t.Setenv("OLLAMA_BOOL", tt.value)
+			if b := Bool("OLLAMA_BOOL"); b() != tt.expect {
+				t.Errorf("%s: expected %t, got %t", name, tt.expect, b())
+			}
+		})
+	}
+}
diff --git a/gpu/gpu.go b/gpu/gpu.go
index 1815668f..c3059542 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -302,7 +302,7 @@ func GetGPUInfo() GpuInfoList {
 		}
 
 		// Intel
-		if envconfig.IntelGpu {
+		if envconfig.IntelGPU() {
 			oHandles = initOneAPIHandles()
 			// On windows we bundle the oneapi library one level above the runner dir
 			depPath = ""
diff --git a/llm/server.go b/llm/server.go
index eb966650..84d9e93a 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -221,7 +221,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--memory-f32")
 	}
 
-	flashAttnEnabled := envconfig.FlashAttention
+	flashAttnEnabled := envconfig.FlashAttention()
 
 	for _, g := range gpus {
 		// only cuda (compute capability 7+) and metal support flash attention
diff --git a/server/images.go b/server/images.go
index 574dec19..3eb3b3fa 100644
--- a/server/images.go
+++ b/server/images.go
@@ -644,7 +644,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 		return err
 	}
 
-	if !envconfig.NoPrune && old != nil {
+	if !envconfig.NoPrune() && old != nil {
 		if err := old.RemoveLayers(); err != nil {
 			return err
 		}
@@ -883,7 +883,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})
 
-	if !envconfig.NoPrune {
+	if !envconfig.NoPrune() {
 		manifest, _, err = GetManifest(mp)
 		if err != nil && !errors.Is(err, os.ErrNotExist) {
 			return err
diff --git a/server/routes.go b/server/routes.go
index 07898d9b..41a73cb4 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -1121,7 +1121,7 @@ func Serve(ln net.Listener) error {
 		return err
 	}
 
-	if !envconfig.NoPrune {
+	if !envconfig.NoPrune() {
 		// clean up unused layers and manifests
 		if err := PruneLayers(); err != nil {
 			return err
diff --git a/server/sched.go b/server/sched.go
index 2daed3ab..e1e986a5 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -695,7 +695,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 	// First attempt to fit the model into a single GPU
 	for _, p := range numParallelToTry {
 		req.opts.NumCtx = req.origNumCtx * p
-		if !envconfig.SchedSpread {
+		if !envconfig.SchedSpread() {
 			for _, g := range sgl {
 				if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 					slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))