Introduce GPU Overhead env var (#5922)
Provide a mechanism for users to set aside an amount of VRAM on each GPU to make room for other applications they want to start after Ollama, or to work around memory prediction bugs.
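For example, starting the server with OLLAMA_GPU_OVERHEAD=1073741824 in its environment should set aside roughly 1 GiB on each GPU, since the value is read as a raw byte count (see the OLLAMA_GPU_OVERHEAD description added to AsMap below). That invocation is illustrative; it is not part of this commit.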
parent a60d9b89ce
commit b05c9e83d9
3 changed files with 28 additions and 3 deletions
cmd/cmd.go
@@ -1421,6 +1421,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_TMPDIR"],
 				envVars["OLLAMA_FLASH_ATTENTION"],
 				envVars["OLLAMA_LLM_LIBRARY"],
+				envVars["OLLAMA_GPU_OVERHEAD"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)
envconfig/config.go
@@ -231,6 +231,25 @@ var (
 	MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
 )
+
+func Uint64(key string, defaultValue uint64) func() uint64 {
+	return func() uint64 {
+		if s := Var(key); s != "" {
+			if n, err := strconv.ParseUint(s, 10, 64); err != nil {
+				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
+			} else {
+				return n
+			}
+		}
+
+		return defaultValue
+	}
+}
+
+var (
+	// Set aside VRAM per GPU
+	GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
+)
 
 type EnvVar struct {
 	Name  string
 	Value any
@@ -241,6 +260,7 @@ func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
 		"OLLAMA_DEBUG":           {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
 		"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
+		"OLLAMA_GPU_OVERHEAD":    {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
 		"OLLAMA_HOST":            {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE":      {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY":     {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
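Aside: the Uint64 helper added above follows the pattern of the existing envconfig accessors. Rather than returning a value, it returns a getter closure that re-reads and re-parses the variable on every call, warning and falling back to the default on a bad value. A minimal, self-contained sketch of the same pattern; uint64FromEnv and the use of os.Getenv are stand-ins for this illustration, while the real helper goes through envconfig's Var and slog:

	package main

	import (
		"fmt"
		"os"
		"strconv"
	)

	// uint64FromEnv sketches the envconfig.Uint64 helper: it returns a
	// closure that parses the variable at call time and falls back to
	// defaultValue when the variable is unset or malformed.
	func uint64FromEnv(key string, defaultValue uint64) func() uint64 {
		return func() uint64 {
			if s := os.Getenv(key); s != "" {
				if n, err := strconv.ParseUint(s, 10, 64); err == nil {
					return n
				}
				// The real helper logs a warning here before falling through.
			}
			return defaultValue
		}
	}

	func main() {
		os.Setenv("OLLAMA_GPU_OVERHEAD", "1073741824") // 1 GiB, for demonstration
		gpuOverhead := uint64FromEnv("OLLAMA_GPU_OVERHEAD", 0)
		fmt.Println(gpuOverhead()) // prints 1073741824
	}

Returning a closure instead of a value keeps the parse logic in one place and lets callers such as EstimateGPULayers pick up the current setting each time they run.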
llm/memory.go
@@ -7,6 +7,7 @@ import (
 	"strings"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
@@ -94,6 +95,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// Overflow that didn't fit into the GPU
 	var overflow uint64
 
+	overhead := envconfig.GpuOverhead()
 	availableList := make([]string, len(gpus))
 	for i, gpu := range gpus {
 		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
@@ -164,7 +166,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			gzo = gpuZeroOverhead
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
-		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
+		if (gpus[i].FreeMemory - overhead) < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
 			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
 			continue
 		}
@@ -196,7 +198,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[i%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+layerSize {
+			if (g.g.FreeMemory - overhead) > used+layerSize {
 				gpuAllocations[g.i] += layerSize
 				layerCounts[g.i]++
 				layerCount++
@@ -219,7 +221,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[layerCount%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+memoryLayerOutput {
+			if (g.g.FreeMemory - overhead) > used+memoryLayerOutput {
 				gpuAllocations[g.i] += memoryLayerOutput
 				layerCounts[g.i]++
 				layerCount++
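Aside: the three checks modified above all apply the same idea. The configured overhead is deducted from a GPU's free VRAM before testing whether the graph, a repeating layer, or the output layer still fits. A condensed sketch of that effective-capacity test follows; the names and the explicit underflow guard are this sketch's additions, while the diff subtracts the uint64 values directly:

	package main

	import "fmt"

	// effectiveFree deducts the per-GPU reservation from free VRAM.
	// Guarding against overhead exceeding freeMemory avoids uint64
	// wraparound; the hunks above perform the bare subtraction.
	func effectiveFree(freeMemory, overhead uint64) uint64 {
		if overhead >= freeMemory {
			return 0
		}
		return freeMemory - overhead
	}

	// fitsAnotherLayer mirrors the shape of the checks in EstimateGPULayers:
	// a layer is placed only if it fits inside the adjusted free memory.
	func fitsAnotherLayer(freeMemory, overhead, used, layerSize uint64) bool {
		return effectiveFree(freeMemory, overhead) > used+layerSize
	}

	func main() {
		// 8 GiB free, 1 GiB reserved, 5 GiB already allocated, 512 MiB layer.
		fmt.Println(fitsAnotherLayer(8<<30, 1<<30, 5<<30, 512<<20)) // true
		// Same GPU with 7 GiB already allocated no longer fits the layer.
		fmt.Println(fitsAnotherLayer(8<<30, 1<<30, 7<<30, 512<<20)) // false
	}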
@@ -306,6 +308,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 }
 
 func (m MemoryEstimate) log() {
+	overhead := envconfig.GpuOverhead()
 	slog.Info(
 		"offload to "+m.inferenceLibrary,
 		slog.Group(
@@ -323,6 +326,7 @@ func (m MemoryEstimate) log() {
 			"memory",
 			// memory available by GPU for offloading
 			"available", m.availableList,
+			"gpu_overhead", format.HumanBytes2(overhead),
 			slog.Group(
 				"required",
 				// memory required for full offloading