34b9db5afc
This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS.
22 lines
490 B
Go
22 lines
490 B
Go
//go:build linux || windows
|
|
|
|
package gpu
|
|
|
|
import (
|
|
"log/slog"
|
|
"strings"
|
|
)
|
|
|
|
func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
|
ids := []string{}
|
|
for _, info := range gpuInfo {
|
|
if info.Library != "cuda" {
|
|
// TODO shouldn't happen if things are wired correctly...
|
|
slog.Debug("cudaGetVisibleDevicesEnv skipping over non-cuda device", "library", info.Library)
|
|
continue
|
|
}
|
|
ids = append(ids, info.ID)
|
|
}
|
|
return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
|
|
|
|
}
|