diff --git a/gpu/amd_common.go b/gpu/amd_common.go index deb931ff..cf3348a8 100644 --- a/gpu/amd_common.go +++ b/gpu/amd_common.go @@ -40,19 +40,17 @@ func amdSetVisibleDevices(ids []int, skip map[int]interface{}) { // TODO - does sort order matter? devices := []string{} for i := range ids { - slog.Debug(fmt.Sprintf("i=%d", i)) if _, skipped := skip[i]; skipped { - slog.Debug("skipped") continue } devices = append(devices, strconv.Itoa(i)) } - slog.Debug(fmt.Sprintf("devices=%v", devices)) val := strings.Join(devices, ",") err := os.Setenv("HIP_VISIBLE_DEVICES", val) if err != nil { slog.Warn(fmt.Sprintf("failed to set env: %s", err)) + } else { + slog.Info("Setting HIP_VISIBLE_DEVICES=" + val) } - slog.Debug("HIP_VISIBLE_DEVICES=" + val) } diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go index 2a2d22b6..b83f8a68 100644 --- a/gpu/amd_linux.go +++ b/gpu/amd_linux.go @@ -24,6 +24,9 @@ const ( GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line GPUUsedMemoryFileGlob = "mem_banks/*/used_memory" RocmStandardLocation = "/opt/rocm/lib" + + // TODO find a better way to detect iGPU instead of minimum memory + IGPUMemLimit = 1024 * 1024 * 1024 // 512M is what they typically report, so anything less than 1G must be iGPU ) var ( @@ -146,8 +149,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) { resp.memInfo.DeviceCount = 0 resp.memInfo.TotalMemory = 0 resp.memInfo.FreeMemory = 0 + slog.Debug("discovering VRAM for amdgpu devices") if len(ids) == 0 { - slog.Debug("discovering all amdgpu devices") entries, err := os.ReadDir(AMDNodesSysfsDir) if err != nil { slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err)) @@ -165,7 +168,7 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) { ids = append(ids, id) } } - slog.Debug(fmt.Sprintf("discovering amdgpu devices %v", ids)) + slog.Debug(fmt.Sprintf("amdgpu devices %v", ids)) for _, id := range ids { if _, skipped := skip[id]; 
skipped { @@ -173,7 +176,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) { } totalMemory := uint64(0) usedMemory := uint64(0) - propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUTotalMemoryFileGlob) + // Adjust for sysfs vs HIP ids + propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id+1), GPUTotalMemoryFileGlob) propFiles, err := filepath.Glob(propGlob) if err != nil { slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err)) @@ -205,6 +209,13 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) { } } if totalMemory == 0 { + slog.Warn(fmt.Sprintf("amdgpu [%d] reports zero total memory, skipping", id)) + skip[id] = struct{}{} + continue + } + if totalMemory < IGPUMemLimit { + slog.Info(fmt.Sprintf("amdgpu [%d] appears to be an iGPU with %dM reported total memory, skipping", id, totalMemory/1024/1024)) + skip[id] = struct{}{} continue } usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob) @@ -232,8 +243,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) { } usedMemory += used } - slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %d", id, totalMemory)) - slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %d", id, (totalMemory - usedMemory))) + slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %dM", id, totalMemory/1024/1024)) + slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %dM", id, (totalMemory-usedMemory)/1024/1024)) resp.memInfo.DeviceCount++ resp.memInfo.TotalMemory += totalMemory resp.memInfo.FreeMemory += (totalMemory - usedMemory) @@ -358,6 +369,8 @@ func AMDDriverVersion() (string, error) { } func AMDGFXVersions() map[int]Version { + // The amdgpu driver always exposes the host CPU as node 0, but we have to skip that and subtract one + // from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU) res := map[int]Version{} matches, _ := 
filepath.Glob(GPUPropertiesFileGlob) for _, match := range matches { @@ -373,17 +386,20 @@ func AMDGFXVersions() map[int]Version { continue } + if i == 0 { + // Skipping the CPU + continue + } + // Align with HIP IDs (zero is first GPU, not CPU) + i -= 1 + scanner := bufio.NewScanner(fp) for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) if strings.HasPrefix(line, "gfx_target_version") { ver := strings.Fields(line) if len(ver) != 2 || len(ver[1]) < 5 { - - if ver[1] == "0" { - // Silently skip the CPU - continue - } else { + if ver[1] != "0" { slog.Debug("malformed " + line) } res[i] = Version{