Fix iGPU detection for linux
This fixes a few bugs in the new sysfs discovery logic. iGPUs are now correctly identified by their reported VRAM of less than 1G. The sysfs IDs are off by one compared to what HIP wants, because the amdgpu driver also reports the CPU as a node, but HIP only cares about GPUs.
This commit is contained in:
parent
34d00f90b1
commit
82b0c7c27e
2 changed files with 28 additions and 14 deletions
|
@ -40,19 +40,17 @@ func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
|
||||||
// TODO - does sort order matter?
|
// TODO - does sort order matter?
|
||||||
devices := []string{}
|
devices := []string{}
|
||||||
for i := range ids {
|
for i := range ids {
|
||||||
slog.Debug(fmt.Sprintf("i=%d", i))
|
|
||||||
if _, skipped := skip[i]; skipped {
|
if _, skipped := skip[i]; skipped {
|
||||||
slog.Debug("skipped")
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
devices = append(devices, strconv.Itoa(i))
|
devices = append(devices, strconv.Itoa(i))
|
||||||
}
|
}
|
||||||
slog.Debug(fmt.Sprintf("devices=%v", devices))
|
|
||||||
|
|
||||||
val := strings.Join(devices, ",")
|
val := strings.Join(devices, ",")
|
||||||
err := os.Setenv("HIP_VISIBLE_DEVICES", val)
|
err := os.Setenv("HIP_VISIBLE_DEVICES", val)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn(fmt.Sprintf("failed to set env: %s", err))
|
slog.Warn(fmt.Sprintf("failed to set env: %s", err))
|
||||||
|
} else {
|
||||||
|
slog.Info("Setting HIP_VISIBLE_DEVICES=" + val)
|
||||||
}
|
}
|
||||||
slog.Debug("HIP_VISIBLE_DEVICES=" + val)
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,6 +24,9 @@ const (
|
||||||
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
|
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
|
||||||
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
|
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
|
||||||
RocmStandardLocation = "/opt/rocm/lib"
|
RocmStandardLocation = "/opt/rocm/lib"
|
||||||
|
|
||||||
|
// TODO find a better way to detect iGPU instead of minimum memory
|
||||||
|
IGPUMemLimit = 1024 * 1024 * 1024 // 512M is what they typically report, so anything less than 1G must be iGPU
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
@ -146,8 +149,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
|
||||||
resp.memInfo.DeviceCount = 0
|
resp.memInfo.DeviceCount = 0
|
||||||
resp.memInfo.TotalMemory = 0
|
resp.memInfo.TotalMemory = 0
|
||||||
resp.memInfo.FreeMemory = 0
|
resp.memInfo.FreeMemory = 0
|
||||||
|
slog.Debug("discovering VRAM for amdgpu devices")
|
||||||
if len(ids) == 0 {
|
if len(ids) == 0 {
|
||||||
slog.Debug("discovering all amdgpu devices")
|
|
||||||
entries, err := os.ReadDir(AMDNodesSysfsDir)
|
entries, err := os.ReadDir(AMDNodesSysfsDir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
|
slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
|
||||||
|
@ -165,7 +168,7 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
|
||||||
ids = append(ids, id)
|
ids = append(ids, id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
slog.Debug(fmt.Sprintf("discovering amdgpu devices %v", ids))
|
slog.Debug(fmt.Sprintf("amdgpu devices %v", ids))
|
||||||
|
|
||||||
for _, id := range ids {
|
for _, id := range ids {
|
||||||
if _, skipped := skip[id]; skipped {
|
if _, skipped := skip[id]; skipped {
|
||||||
|
@ -173,7 +176,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
|
||||||
}
|
}
|
||||||
totalMemory := uint64(0)
|
totalMemory := uint64(0)
|
||||||
usedMemory := uint64(0)
|
usedMemory := uint64(0)
|
||||||
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUTotalMemoryFileGlob)
|
// Adjust for sysfs vs HIP ids
|
||||||
|
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id+1), GPUTotalMemoryFileGlob)
|
||||||
propFiles, err := filepath.Glob(propGlob)
|
propFiles, err := filepath.Glob(propGlob)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))
|
slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))
|
||||||
|
@ -205,6 +209,13 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if totalMemory == 0 {
|
if totalMemory == 0 {
|
||||||
|
slog.Warn(fmt.Sprintf("amdgpu [%d] reports zero total memory, skipping", id))
|
||||||
|
skip[id] = struct{}{}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if totalMemory < IGPUMemLimit {
|
||||||
|
slog.Info(fmt.Sprintf("amdgpu [%d] appears to be an iGPU with %dM reported total memory, skipping", id, totalMemory/1024/1024))
|
||||||
|
skip[id] = struct{}{}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
|
usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
|
||||||
|
@ -232,8 +243,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
|
||||||
}
|
}
|
||||||
usedMemory += used
|
usedMemory += used
|
||||||
}
|
}
|
||||||
slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %d", id, totalMemory))
|
slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %dM", id, totalMemory/1024/1024))
|
||||||
slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %d", id, (totalMemory - usedMemory)))
|
slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %dM", id, (totalMemory-usedMemory)/1024/1024))
|
||||||
resp.memInfo.DeviceCount++
|
resp.memInfo.DeviceCount++
|
||||||
resp.memInfo.TotalMemory += totalMemory
|
resp.memInfo.TotalMemory += totalMemory
|
||||||
resp.memInfo.FreeMemory += (totalMemory - usedMemory)
|
resp.memInfo.FreeMemory += (totalMemory - usedMemory)
|
||||||
|
@ -358,6 +369,8 @@ func AMDDriverVersion() (string, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func AMDGFXVersions() map[int]Version {
|
func AMDGFXVersions() map[int]Version {
|
||||||
|
// The amdgpu driver always exposes the host CPU as node 0, but we have to skip that and subtract one
|
||||||
|
// from the other IDs to get alignment with the HIP libraries' expectations (zero is the first GPU, not the CPU)
|
||||||
res := map[int]Version{}
|
res := map[int]Version{}
|
||||||
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
|
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
|
||||||
for _, match := range matches {
|
for _, match := range matches {
|
||||||
|
@ -373,17 +386,20 @@ func AMDGFXVersions() map[int]Version {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if i == 0 {
|
||||||
|
// Skipping the CPU
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Align with HIP IDs (zero is first GPU, not CPU)
|
||||||
|
i -= 1
|
||||||
|
|
||||||
scanner := bufio.NewScanner(fp)
|
scanner := bufio.NewScanner(fp)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
line := strings.TrimSpace(scanner.Text())
|
line := strings.TrimSpace(scanner.Text())
|
||||||
if strings.HasPrefix(line, "gfx_target_version") {
|
if strings.HasPrefix(line, "gfx_target_version") {
|
||||||
ver := strings.Fields(line)
|
ver := strings.Fields(line)
|
||||||
if len(ver) != 2 || len(ver[1]) < 5 {
|
if len(ver) != 2 || len(ver[1]) < 5 {
|
||||||
|
if ver[1] != "0" {
|
||||||
if ver[1] == "0" {
|
|
||||||
// Silently skip the CPU
|
|
||||||
continue
|
|
||||||
} else {
|
|
||||||
slog.Debug("malformed " + line)
|
slog.Debug("malformed " + line)
|
||||||
}
|
}
|
||||||
res[i] = Version{
|
res[i] = Version{
|
||||||
|
|
Loading…
Reference in a new issue