Fix iGPU detection for Linux

This fixes a few bugs in the new sysfs discovery logic. iGPUs are now
correctly identified by the <1G of VRAM they report. The sysfs IDs are off
by one compared to what HIP wants because the CPU is also reported
in amdgpu, whereas HIP only cares about GPUs.
This commit is contained in:
Daniel Hiltgen 2024-03-12 16:57:19 -07:00
parent 34d00f90b1
commit 82b0c7c27e
2 changed files with 28 additions and 14 deletions

View file

@ -40,19 +40,17 @@ func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
// TODO - does sort order matter? // TODO - does sort order matter?
devices := []string{} devices := []string{}
for i := range ids { for i := range ids {
slog.Debug(fmt.Sprintf("i=%d", i))
if _, skipped := skip[i]; skipped { if _, skipped := skip[i]; skipped {
slog.Debug("skipped")
continue continue
} }
devices = append(devices, strconv.Itoa(i)) devices = append(devices, strconv.Itoa(i))
} }
slog.Debug(fmt.Sprintf("devices=%v", devices))
val := strings.Join(devices, ",") val := strings.Join(devices, ",")
err := os.Setenv("HIP_VISIBLE_DEVICES", val) err := os.Setenv("HIP_VISIBLE_DEVICES", val)
if err != nil { if err != nil {
slog.Warn(fmt.Sprintf("failed to set env: %s", err)) slog.Warn(fmt.Sprintf("failed to set env: %s", err))
} else {
slog.Info("Setting HIP_VISIBLE_DEVICES=" + val)
} }
slog.Debug("HIP_VISIBLE_DEVICES=" + val)
} }

View file

@ -24,6 +24,9 @@ const (
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory" GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
RocmStandardLocation = "/opt/rocm/lib" RocmStandardLocation = "/opt/rocm/lib"
// TODO find a better way to detect iGPU instead of minimum memory
IGPUMemLimit = 1024 * 1024 * 1024 // 512M is what they typically report, so anything less than 1G must be iGPU
) )
var ( var (
@ -146,8 +149,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
resp.memInfo.DeviceCount = 0 resp.memInfo.DeviceCount = 0
resp.memInfo.TotalMemory = 0 resp.memInfo.TotalMemory = 0
resp.memInfo.FreeMemory = 0 resp.memInfo.FreeMemory = 0
slog.Debug("discovering VRAM for amdgpu devices")
if len(ids) == 0 { if len(ids) == 0 {
slog.Debug("discovering all amdgpu devices")
entries, err := os.ReadDir(AMDNodesSysfsDir) entries, err := os.ReadDir(AMDNodesSysfsDir)
if err != nil { if err != nil {
slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err)) slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
@ -165,7 +168,7 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
ids = append(ids, id) ids = append(ids, id)
} }
} }
slog.Debug(fmt.Sprintf("discovering amdgpu devices %v", ids)) slog.Debug(fmt.Sprintf("amdgpu devices %v", ids))
for _, id := range ids { for _, id := range ids {
if _, skipped := skip[id]; skipped { if _, skipped := skip[id]; skipped {
@ -173,7 +176,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
} }
totalMemory := uint64(0) totalMemory := uint64(0)
usedMemory := uint64(0) usedMemory := uint64(0)
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUTotalMemoryFileGlob) // Adjust for sysfs vs HIP ids
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id+1), GPUTotalMemoryFileGlob)
propFiles, err := filepath.Glob(propGlob) propFiles, err := filepath.Glob(propGlob)
if err != nil { if err != nil {
slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err)) slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))
@ -205,6 +209,13 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
} }
} }
if totalMemory == 0 { if totalMemory == 0 {
slog.Warn(fmt.Sprintf("amdgpu [%d] reports zero total memory, skipping", id))
skip[id] = struct{}{}
continue
}
if totalMemory < IGPUMemLimit {
slog.Info(fmt.Sprintf("amdgpu [%d] appears to be an iGPU with %dM reported total memory, skipping", id, totalMemory/1024/1024))
skip[id] = struct{}{}
continue continue
} }
usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob) usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
@ -232,8 +243,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
} }
usedMemory += used usedMemory += used
} }
slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %d", id, totalMemory)) slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %dM", id, totalMemory/1024/1024))
slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %d", id, (totalMemory - usedMemory))) slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %dM", id, (totalMemory-usedMemory)/1024/1024))
resp.memInfo.DeviceCount++ resp.memInfo.DeviceCount++
resp.memInfo.TotalMemory += totalMemory resp.memInfo.TotalMemory += totalMemory
resp.memInfo.FreeMemory += (totalMemory - usedMemory) resp.memInfo.FreeMemory += (totalMemory - usedMemory)
@ -358,6 +369,8 @@ func AMDDriverVersion() (string, error) {
} }
func AMDGFXVersions() map[int]Version { func AMDGFXVersions() map[int]Version {
// The amdgpu driver always exposes the host CPU as node 0, so we have to skip that node and subtract one
// from the other IDs to align with the HIP libraries' expectations (zero is the first GPU, not the CPU)
res := map[int]Version{} res := map[int]Version{}
matches, _ := filepath.Glob(GPUPropertiesFileGlob) matches, _ := filepath.Glob(GPUPropertiesFileGlob)
for _, match := range matches { for _, match := range matches {
@ -373,17 +386,20 @@ func AMDGFXVersions() map[int]Version {
continue continue
} }
if i == 0 {
// Skipping the CPU
continue
}
// Align with HIP IDs (zero is first GPU, not CPU)
i -= 1
scanner := bufio.NewScanner(fp) scanner := bufio.NewScanner(fp)
for scanner.Scan() { for scanner.Scan() {
line := strings.TrimSpace(scanner.Text()) line := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(line, "gfx_target_version") { if strings.HasPrefix(line, "gfx_target_version") {
ver := strings.Fields(line) ver := strings.Fields(line)
if len(ver) != 2 || len(ver[1]) < 5 { if len(ver) != 2 || len(ver[1]) < 5 {
if ver[1] != "0" {
if ver[1] == "0" {
// Silently skip the CPU
continue
} else {
slog.Debug("malformed " + line) slog.Debug("malformed " + line)
} }
res[i] = Version{ res[i] = Version{