Merge pull request #3088 from dhiltgen/rocm_igpu_linux
Fix iGPU detection for linux
This commit is contained in:
commit
a54d4a28dc
2 changed files with 28 additions and 14 deletions
|
@ -40,19 +40,17 @@ func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
|
||||||
// TODO - does sort order matter?
|
// TODO - does sort order matter?
|
||||||
devices := []string{}
|
devices := []string{}
|
||||||
for i := range ids {
|
for i := range ids {
|
||||||
slog.Debug(fmt.Sprintf("i=%d", i))
|
|
||||||
if _, skipped := skip[i]; skipped {
|
if _, skipped := skip[i]; skipped {
|
||||||
slog.Debug("skipped")
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
devices = append(devices, strconv.Itoa(i))
|
devices = append(devices, strconv.Itoa(i))
|
||||||
}
|
}
|
||||||
slog.Debug(fmt.Sprintf("devices=%v", devices))
|
|
||||||
|
|
||||||
val := strings.Join(devices, ",")
|
val := strings.Join(devices, ",")
|
||||||
err := os.Setenv("HIP_VISIBLE_DEVICES", val)
|
err := os.Setenv("HIP_VISIBLE_DEVICES", val)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn(fmt.Sprintf("failed to set env: %s", err))
|
slog.Warn(fmt.Sprintf("failed to set env: %s", err))
|
||||||
|
} else {
|
||||||
|
slog.Info("Setting HIP_VISIBLE_DEVICES=" + val)
|
||||||
}
|
}
|
||||||
slog.Debug("HIP_VISIBLE_DEVICES=" + val)
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,6 +24,9 @@ const (
|
||||||
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
|
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
|
||||||
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
|
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
|
||||||
RocmStandardLocation = "/opt/rocm/lib"
|
RocmStandardLocation = "/opt/rocm/lib"
|
||||||
|
|
||||||
|
// TODO find a better way to detect iGPU instead of minimum memory
|
||||||
|
IGPUMemLimit = 1024 * 1024 * 1024 // 512M is what they typically report, so anything less than 1G must be iGPU
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
@ -146,8 +149,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
|
||||||
resp.memInfo.DeviceCount = 0
|
resp.memInfo.DeviceCount = 0
|
||||||
resp.memInfo.TotalMemory = 0
|
resp.memInfo.TotalMemory = 0
|
||||||
resp.memInfo.FreeMemory = 0
|
resp.memInfo.FreeMemory = 0
|
||||||
|
slog.Debug("discovering VRAM for amdgpu devices")
|
||||||
if len(ids) == 0 {
|
if len(ids) == 0 {
|
||||||
slog.Debug("discovering all amdgpu devices")
|
|
||||||
entries, err := os.ReadDir(AMDNodesSysfsDir)
|
entries, err := os.ReadDir(AMDNodesSysfsDir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
|
slog.Warn(fmt.Sprintf("failed to read amdgpu sysfs %s - %s", AMDNodesSysfsDir, err))
|
||||||
|
@ -165,7 +168,7 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
|
||||||
ids = append(ids, id)
|
ids = append(ids, id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
slog.Debug(fmt.Sprintf("discovering amdgpu devices %v", ids))
|
slog.Debug(fmt.Sprintf("amdgpu devices %v", ids))
|
||||||
|
|
||||||
for _, id := range ids {
|
for _, id := range ids {
|
||||||
if _, skipped := skip[id]; skipped {
|
if _, skipped := skip[id]; skipped {
|
||||||
|
@ -173,7 +176,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
|
||||||
}
|
}
|
||||||
totalMemory := uint64(0)
|
totalMemory := uint64(0)
|
||||||
usedMemory := uint64(0)
|
usedMemory := uint64(0)
|
||||||
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUTotalMemoryFileGlob)
|
// Adjust for sysfs vs HIP ids
|
||||||
|
propGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id+1), GPUTotalMemoryFileGlob)
|
||||||
propFiles, err := filepath.Glob(propGlob)
|
propFiles, err := filepath.Glob(propGlob)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))
|
slog.Warn(fmt.Sprintf("error looking up total GPU memory: %s %s", propGlob, err))
|
||||||
|
@ -205,6 +209,13 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if totalMemory == 0 {
|
if totalMemory == 0 {
|
||||||
|
slog.Warn(fmt.Sprintf("amdgpu [%d] reports zero total memory, skipping", id))
|
||||||
|
skip[id] = struct{}{}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if totalMemory < IGPUMemLimit {
|
||||||
|
slog.Info(fmt.Sprintf("amdgpu [%d] appears to be an iGPU with %dM reported total memory, skipping", id, totalMemory/1024/1024))
|
||||||
|
skip[id] = struct{}{}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
|
usedGlob := filepath.Join(AMDNodesSysfsDir, strconv.Itoa(id), GPUUsedMemoryFileGlob)
|
||||||
|
@ -232,8 +243,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
|
||||||
}
|
}
|
||||||
usedMemory += used
|
usedMemory += used
|
||||||
}
|
}
|
||||||
slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %d", id, totalMemory))
|
slog.Info(fmt.Sprintf("[%d] amdgpu totalMemory %dM", id, totalMemory/1024/1024))
|
||||||
slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %d", id, (totalMemory - usedMemory)))
|
slog.Info(fmt.Sprintf("[%d] amdgpu freeMemory %dM", id, (totalMemory-usedMemory)/1024/1024))
|
||||||
resp.memInfo.DeviceCount++
|
resp.memInfo.DeviceCount++
|
||||||
resp.memInfo.TotalMemory += totalMemory
|
resp.memInfo.TotalMemory += totalMemory
|
||||||
resp.memInfo.FreeMemory += (totalMemory - usedMemory)
|
resp.memInfo.FreeMemory += (totalMemory - usedMemory)
|
||||||
|
@ -358,6 +369,8 @@ func AMDDriverVersion() (string, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func AMDGFXVersions() map[int]Version {
|
func AMDGFXVersions() map[int]Version {
|
||||||
|
// The amdgpu driver always exposes the host CPU as node 0, but we have to skip that and subtract one
|
||||||
|
// from the other IDs to get alignment with the HIP library's expectations (zero is the first GPU, not the CPU)
|
||||||
res := map[int]Version{}
|
res := map[int]Version{}
|
||||||
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
|
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
|
||||||
for _, match := range matches {
|
for _, match := range matches {
|
||||||
|
@ -373,17 +386,20 @@ func AMDGFXVersions() map[int]Version {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if i == 0 {
|
||||||
|
// Skipping the CPU
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Align with HIP IDs (zero is first GPU, not CPU)
|
||||||
|
i -= 1
|
||||||
|
|
||||||
scanner := bufio.NewScanner(fp)
|
scanner := bufio.NewScanner(fp)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
line := strings.TrimSpace(scanner.Text())
|
line := strings.TrimSpace(scanner.Text())
|
||||||
if strings.HasPrefix(line, "gfx_target_version") {
|
if strings.HasPrefix(line, "gfx_target_version") {
|
||||||
ver := strings.Fields(line)
|
ver := strings.Fields(line)
|
||||||
if len(ver) != 2 || len(ver[1]) < 5 {
|
if len(ver) != 2 || len(ver[1]) < 5 {
|
||||||
|
if ver[1] != "0" {
|
||||||
if ver[1] == "0" {
|
|
||||||
// Silently skip the CPU
|
|
||||||
continue
|
|
||||||
} else {
|
|
||||||
slog.Debug("malformed " + line)
|
slog.Debug("malformed " + line)
|
||||||
}
|
}
|
||||||
res[i] = Version{
|
res[i] = Version{
|
||||||
|
|
Loading…
Add table
Reference in a new issue