Disable concurrency for AMD + Windows
Until ROCm v6.2 ships, we won't be able to get accurate free memory reporting on Windows, which makes automatic concurrency too risky. Users can still opt in, but will need to pay attention to model sizes; otherwise they may thrash/page VRAM or cause OOM crashes. All other platforms and GPUs have accurate VRAM reporting wired up now, so we can turn on concurrency by default.
This commit is contained in:
parent
17b7186cd7
commit
9929751cc8
4 changed files with 44 additions and 10 deletions
|
@ -85,13 +85,13 @@ func AsMap() map[string]EnvVar {
|
||||||
"OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"},
|
"OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"},
|
||||||
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
|
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
|
||||||
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
|
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
|
||||||
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU (default 4)"},
|
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU (default auto)"},
|
||||||
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
|
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
|
||||||
"OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"},
|
"OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"},
|
||||||
"OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
|
"OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
|
||||||
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
|
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
|
||||||
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
|
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
|
||||||
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
|
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default auto)"},
|
||||||
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
|
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
|
||||||
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
|
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
|
||||||
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
|
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
|
||||||
|
@ -129,8 +129,8 @@ func clean(key string) string {
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
// default values
|
// default values
|
||||||
NumParallel = 0
|
NumParallel = 0 // Autoselect
|
||||||
MaxRunners = 4
|
MaxRunners = 0 // Autoselect
|
||||||
MaxQueuedRequests = 512
|
MaxQueuedRequests = 512
|
||||||
|
|
||||||
LoadConfig()
|
LoadConfig()
|
||||||
|
|
|
@ -115,8 +115,6 @@ func AMDGetGPUInfo() []RocmGPUInfo {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO revisit this once ROCm v6 is available on windows.
|
|
||||||
// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
|
|
||||||
slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
|
slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
|
||||||
slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
|
slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
|
||||||
gpuInfo := RocmGPUInfo{
|
gpuInfo := RocmGPUInfo{
|
||||||
|
@ -126,6 +124,9 @@ func AMDGetGPUInfo() []RocmGPUInfo {
|
||||||
TotalMemory: totalMemory,
|
TotalMemory: totalMemory,
|
||||||
FreeMemory: freeMemory,
|
FreeMemory: freeMemory,
|
||||||
},
|
},
|
||||||
|
// Free memory reporting on Windows is not reliable until we bump to ROCm v6.2
|
||||||
|
UnreliableFreeMemory: true,
|
||||||
|
|
||||||
ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
|
ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
|
||||||
DependencyPath: libDir,
|
DependencyPath: libDir,
|
||||||
MinimumMemory: rocmMinimumMemory,
|
MinimumMemory: rocmMinimumMemory,
|
||||||
|
|
|
@ -29,6 +29,11 @@ type GpuInfo struct {
|
||||||
// Extra environment variables specific to the GPU as list of [key,value]
|
// Extra environment variables specific to the GPU as list of [key,value]
|
||||||
EnvWorkarounds [][2]string `json:"envs,omitempty"`
|
EnvWorkarounds [][2]string `json:"envs,omitempty"`
|
||||||
|
|
||||||
|
// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
|
||||||
|
// the FreeMemory is best effort, and may over- or under-report actual memory usage.
|
||||||
|
// False indicates FreeMemory can generally be trusted on this GPU
|
||||||
|
UnreliableFreeMemory bool
|
||||||
|
|
||||||
// GPU information
|
// GPU information
|
||||||
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
|
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
|
||||||
Name string `json:"name"` // user friendly name if available
|
Name string `json:"name"` // user friendly name if available
|
||||||
|
|
|
@ -46,6 +46,16 @@ type Scheduler struct {
|
||||||
reschedDelay time.Duration
|
reschedDelay time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Default automatic value for number of models we allow per GPU
|
||||||
|
// Model will still need to fit in VRAM, but loading many small models
|
||||||
|
// on a large GPU can cause stalling
|
||||||
|
var defaultModelsPerGPU = 3
|
||||||
|
|
||||||
|
// Default automatic value for parallel setting
|
||||||
|
// Model will still need to fit in VRAM. If this setting won't fit
|
||||||
|
// we'll back off down to 1 to try to get it to fit
|
||||||
|
var defaultParallel = 4
|
||||||
|
|
||||||
var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
|
var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
|
||||||
|
|
||||||
func InitScheduler(ctx context.Context) *Scheduler {
|
func InitScheduler(ctx context.Context) *Scheduler {
|
||||||
|
@ -100,7 +110,6 @@ func (s *Scheduler) Run(ctx context.Context) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Scheduler) processPending(ctx context.Context) {
|
func (s *Scheduler) processPending(ctx context.Context) {
|
||||||
maxRunnerFactor := 1 // number of GPUs or 1
|
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
|
@ -143,7 +152,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
||||||
pending.useLoadedRunner(runner, s.finishedReqCh)
|
pending.useLoadedRunner(runner, s.finishedReqCh)
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
} else if envconfig.MaxRunners > 0 && loadedCount >= (maxRunnerFactor*envconfig.MaxRunners) {
|
} else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
|
||||||
slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
|
slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
|
||||||
runnerToExpire = s.findRunnerToUnload()
|
runnerToExpire = s.findRunnerToUnload()
|
||||||
} else {
|
} else {
|
||||||
|
@ -155,7 +164,26 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
||||||
} else {
|
} else {
|
||||||
gpus = s.getGpuFn()
|
gpus = s.getGpuFn()
|
||||||
}
|
}
|
||||||
maxRunnerFactor = max(len(gpus), 1)
|
|
||||||
|
if envconfig.MaxRunners <= 0 {
|
||||||
|
// No user specified MaxRunners, so figure out what automatic setting to use
|
||||||
|
// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
|
||||||
|
// if any GPU has unreliable free memory reporting, 1x the number of GPUs
|
||||||
|
allReliable := true
|
||||||
|
for _, gpu := range gpus {
|
||||||
|
if gpu.UnreliableFreeMemory {
|
||||||
|
allReliable = false
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if allReliable {
|
||||||
|
envconfig.MaxRunners = defaultModelsPerGPU * len(gpus)
|
||||||
|
slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners, "gpu_count", len(gpus))
|
||||||
|
} else {
|
||||||
|
slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
|
||||||
|
envconfig.MaxRunners = len(gpus)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Load model for fitting
|
// Load model for fitting
|
||||||
ggml, err := llm.LoadModel(pending.model.ModelPath)
|
ggml, err := llm.LoadModel(pending.model.ModelPath)
|
||||||
|
@ -647,7 +675,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
|
||||||
var numParallelToTry []int
|
var numParallelToTry []int
|
||||||
if *numParallel <= 0 {
|
if *numParallel <= 0 {
|
||||||
// If no specific parallel setting was provided, try larger then smaller, always end with 1
|
// If no specific parallel setting was provided, try larger then smaller, always end with 1
|
||||||
numParallelToTry = append(numParallelToTry, 4, 1)
|
numParallelToTry = append(numParallelToTry, defaultParallel, 1)
|
||||||
} else {
|
} else {
|
||||||
numParallelToTry = []int{*numParallel}
|
numParallelToTry = []int{*numParallel}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue