From d7c94e0ca6c39f6c64f74799c0dc8f3f91079edc Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sat, 26 Oct 2024 14:04:14 -0700 Subject: [PATCH] Better support for AMD multi-GPU on linux (#7212) * Better support for AMD multi-GPU This resolves a number of problems related to AMD multi-GPU setups on linux. The numeric IDs used by rocm are not the same as the numeric IDs exposed in sysfs although the ordering is consistent. We have to count up from the first valid gfx (major/minor/patch with non-zero values) we find starting at zero. There are 3 different env vars for selecting GPUs, and only ROCR_VISIBLE_DEVICES supports UUID based identification, so we should favor that one, and try to use UUIDs if detected to avoid potential ordering bugs with numeric IDs * ROCR_VISIBLE_DEVICES only works on linux Use the numeric ID only HIP_VISIBLE_DEVICES on windows --- discover/amd_common.go | 13 -------- discover/amd_hip_windows.go | 2 +- discover/amd_linux.go | 63 +++++++++++++++++++++++-------------- discover/amd_windows.go | 19 ++++++++++- docs/gpu.md | 9 ++++-- envconfig/config.go | 6 ++-- 6 files changed, 69 insertions(+), 43 deletions(-) diff --git a/discover/amd_common.go b/discover/amd_common.go index bf969240..3c630861 100644 --- a/discover/amd_common.go +++ b/discover/amd_common.go @@ -37,19 +37,6 @@ func GetSupportedGFX(libDir string) ([]string, error) { return ret, nil } -func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { - ids := []string{} - for _, info := range gpuInfo { - if info.Library != "rocm" { - // TODO shouldn't happen if things are wired correctly... 
- slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library) - continue - } - ids = append(ids, info.ID) - } - return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",") -} - func commonAMDValidateLibDir() (string, error) { // Favor our bundled version diff --git a/discover/amd_hip_windows.go b/discover/amd_hip_windows.go index 12a7af0b..bf19ef06 100644 --- a/discover/amd_hip_windows.go +++ b/discover/amd_hip_windows.go @@ -64,7 +64,7 @@ func NewHipLib() (*HipLib, error) { return hl, nil } -// The hip library only evaluates the HIP_VISIBLE_DEVICES variable at startup +// The hip library only evaluates the ROCR_VISIBLE_DEVICES variable at startup // so we have to unload/reset the library after we do our initial discovery // to make sure our updates to that variable are processed by llama.cpp func (hl *HipLib) Release() { diff --git a/discover/amd_linux.go b/discover/amd_linux.go index dd8e605c..fad7b7a6 100644 --- a/discover/amd_linux.go +++ b/discover/amd_linux.go @@ -64,16 +64,13 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { // Determine if the user has already pre-selected which GPUs to look at, then ignore the others var visibleDevices []string hipVD := envconfig.HipVisibleDevices() // zero based index only - rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID, but consumer cards seem to not support UUID + rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID gpuDO := envconfig.GpuDeviceOrdinal() // zero based index switch { - // TODO is this priorty order right? 
- case hipVD != "": - visibleDevices = strings.Split(hipVD, ",") case rocrVD != "": visibleDevices = strings.Split(rocrVD, ",") - // TODO - since we don't yet support UUIDs, consider detecting and reporting here - // all our test systems show GPU-XX indicating UUID is not supported + case hipVD != "": + visibleDevices = strings.Split(hipVD, ",") case gpuDO != "": visibleDevices = strings.Split(gpuDO, ",") } @@ -99,7 +96,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { } return a < b }) - cpuCount := 0 + gpuCount := 0 for _, match := range matches { slog.Debug("evaluating amdgpu node " + match) fp, err := os.Open(match) @@ -108,11 +105,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { continue } defer fp.Close() - nodeID, err := strconv.Atoi(filepath.Base(filepath.Dir(match))) - if err != nil { - slog.Debug("failed to parse node ID", "error", err) - continue - } scanner := bufio.NewScanner(fp) isCPU := false @@ -186,20 +178,19 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { // do reliably report VRAM usage. if isCPU { - cpuCount++ continue } - // CPUs are always first in the list - gpuID := nodeID - cpuCount - - // Shouldn't happen, but just in case... 
- if gpuID < 0 { - err := fmt.Errorf("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue") - slog.Error(err.Error()) - return nil, err + // Skip over any GPUs that are masked + if major == 0 && minor == 0 && patch == 0 { + slog.Debug("skipping gpu with gfx000") + continue } + // Keep track of numeric IDs based on valid GPUs + gpuID := gpuCount + gpuCount += 1 + // Look up the memory for the current node totalMemory := uint64(0) usedMemory := uint64(0) @@ -273,6 +264,14 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { name = fmt.Sprintf("%04x:%04x", vendor, device) } + // Favor UUIDs if available to reduce possibility of getting the numeric IDs wrong + var ID string + if uniqueID != 0 { + ID = fmt.Sprintf("GPU-%016x", uniqueID) + } else { + ID = strconv.Itoa(gpuID) + } + gpuInfo := RocmGPUInfo{ GpuInfo: GpuInfo{ Library: "rocm", @@ -280,7 +279,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { TotalMemory: totalMemory, FreeMemory: (totalMemory - usedMemory), }, - ID: strconv.Itoa(gpuID), + ID: ID, Name: name, Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch), MinimumMemory: rocmMinimumMemory, @@ -288,6 +287,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { DriverMinor: driverMinor, }, usedFilepath: usedFile, + index: gpuID, } // iGPU detection, remove this check once we can support an iGPU variant of the rocm library @@ -319,7 +319,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { if len(visibleDevices) > 0 { include := false for _, visible := range visibleDevices { - if visible == gpuInfo.ID { + if visible == gpuInfo.ID || visible == strconv.Itoa(gpuInfo.index) { include = true break } @@ -516,3 +516,20 @@ func verifyKFDDriverAccess() error { fd.Close() return nil } + +func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { + ids := []string{} + for _, info := range gpuInfo { + if info.Library != "rocm" { + // TODO shouldn't happen if things are wired correctly... 
+ slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library) + continue + } + ids = append(ids, info.ID) + } + // There are 3 potential env vars to use to select GPUs. + // ROCR_VISIBLE_DEVICES supports UUID or numeric so is our preferred on linux + // GPU_DEVICE_ORDINAL supports numeric IDs only + // HIP_VISIBLE_DEVICES supports numeric IDs only + return "ROCR_VISIBLE_DEVICES", strings.Join(ids, ",") +} diff --git a/discover/amd_windows.go b/discover/amd_windows.go index a3a6e0c3..b0c76f1e 100644 --- a/discover/amd_windows.go +++ b/discover/amd_windows.go @@ -43,7 +43,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { slog.Debug("error looking up amd driver version", "error", err) } - // Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified + // Note: the HIP library automatically handles subsetting to any *_VISIBLE_DEVICES the user specified count := hl.HipGetDeviceCount() if count == 0 { err := fmt.Errorf("no compatible amdgpu devices detected") @@ -201,3 +201,20 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error { } return nil } + +func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { + ids := []string{} + for _, info := range gpuInfo { + if info.Library != "rocm" { + // TODO shouldn't happen if things are wired correctly... + slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library) + continue + } + ids = append(ids, info.ID) + } + // There are 3 potential env vars to use to select GPUs. 
+ // ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows + // HIP_VISIBLE_DEVICES supports numeric IDs only + // GPU_DEVICE_ORDINAL supports numeric IDs only + return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",") +} diff --git a/docs/gpu.md b/docs/gpu.md index 2913a2e2..2de5bd24 100644 --- a/docs/gpu.md +++ b/docs/gpu.md @@ -74,6 +74,10 @@ would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the server. If you have an unsupported AMD GPU you can experiment using the list of supported types below. +If you have multiple GPUs with different GFX versions, append the numeric device +number to the environment variable to set them individually. For example, +`HSA_OVERRIDE_GFX_VERSION_0=10.3.0` and `HSA_OVERRIDE_GFX_VERSION_1=11.0.0`. + At this time, the known supported GPU types on linux are the following LLVM Targets. This table shows some example GPUs that map to these LLVM targets: | **LLVM Target** | **An Example GPU** | @@ -99,9 +103,10 @@ Reach out on [Discord](https://discord.gg/ollama) or file an ### GPU Selection If you have multiple AMD GPUs in your system and want to limit Ollama to use a -subset, you can set `HIP_VISIBLE_DEVICES` to a comma separated list of GPUs. +subset, you can set `ROCR_VISIBLE_DEVICES` to a comma separated list of GPUs. You can see the list of devices with `rocminfo`. If you want to ignore the GPUs -and force CPU usage, use an invalid GPU ID (e.g., "-1") +and force CPU usage, use an invalid GPU ID (e.g., "-1"). When available, use the +`Uuid` to uniquely identify the device instead of the numeric value.
### Container Permission diff --git a/envconfig/config.go b/envconfig/config.go index db2a5625..e80c67ba 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -265,9 +265,9 @@ func AsMap() map[string]EnvVar { if runtime.GOOS != "darwin" { ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"} - ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"} - ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible"} - ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible"} + ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible by numeric ID"} + ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible by UUID or numeric ID"} + ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible by numeric ID"} ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"} ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"} }