Merge pull request #2465 from dhiltgen/block_rocm_pre_9

Detect AMD GPU info via sysfs and block old cards
This commit is contained in:
Daniel Hiltgen 2024-02-12 12:41:43 -08:00 committed by GitHub
commit 76b8728f0c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 151 additions and 34 deletions

91
gpu/amd.go Normal file
View file

@ -0,0 +1,91 @@
package gpu
import (
"bufio"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
)
// TODO - windows vs. non-windows vs darwin
// Discovery logic for AMD/ROCm GPUs
// Sysfs locations consulted when discovering AMD/ROCm GPUs.
const (
	// DriverVersionFile is the file AMDDriverVersion reads to obtain the
	// amdgpu kernel module version string.
	DriverVersionFile = "/sys/module/amdgpu/version"

	// GPUPropertiesFileGlob matches the properties file of every KFD
	// topology node; AMDGFXVersions scans these for gfx_target_version.
	GPUPropertiesFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/properties"

	// TODO probably break these down per GPU to make the logic simpler

	// GPUTotalMemoryFileGlob matches the per-memory-bank properties files
	// (the size_in_bytes line is the one of interest).
	GPUTotalMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties" // size_in_bytes line

	// GPUUsedMemoryFileGlob matches the per-memory-bank used_memory files.
	GPUUsedMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/used_memory"
)
// AMDDetected reports whether the AMD driver version could be read, i.e.
// whether AMDDriverVersion succeeded.
func AMDDetected() bool {
	if _, err := AMDDriverVersion(); err != nil {
		return false
	}
	return true
}
// AMDDriverVersion returns the contents of DriverVersionFile with leading and
// trailing whitespace removed.
//
// A non-nil error is returned, together with an empty string, when the file
// cannot be opened or read.
func AMDDriverVersion() (string, error) {
	// Open the file directly instead of calling os.Stat first: a missing or
	// inaccessible file already surfaces as an error from os.Open, so the
	// separate existence check only added a syscall and a window in which the
	// file could change between the check and the open.
	fp, err := os.Open(DriverVersionFile)
	if err != nil {
		return "", err
	}
	defer fp.Close()

	verString, err := io.ReadAll(fp)
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(string(verString)), nil
}
// AMDGFXVersions collects the gfx target version reported by each file that
// matches GPUPropertiesFileGlob.
//
// Discovery is best effort: files that cannot be opened and lines that cannot
// be parsed are logged at debug level and skipped. The returned slice is
// never nil, but it is empty when nothing could be parsed.
func AMDGFXVersions() []Version {
	res := []Version{}
	matches, err := filepath.Glob(GPUPropertiesFileGlob)
	if err != nil {
		// Glob only fails on a malformed pattern; report it rather than
		// silently returning nothing.
		slog.Debug(fmt.Sprintf("failed to glob %s: %s", GPUPropertiesFileGlob, err))
		return res
	}
	for _, match := range matches {
		res = append(res, amdGFXVersionsFromFile(match)...)
	}
	return res
}

// amdGFXVersionsFromFile parses every gfx_target_version line found in the
// properties file at path. Handling one file per call lets the deferred Close
// run as soon as that file is done instead of at the end of the whole scan.
func amdGFXVersionsFromFile(path string) []Version {
	fp, err := os.Open(path)
	if err != nil {
		slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", path, err))
		return nil
	}
	defer fp.Close()

	var res []Version
	scanner := bufio.NewScanner(fp)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if !strings.HasPrefix(line, "gfx_target_version") {
			continue
		}
		if v, ok := parseGFXTargetVersion(line); ok {
			res = append(res, v)
		}
	}
	// Scan returns false on both EOF and read errors; only Err tells them apart.
	if err := scanner.Err(); err != nil {
		slog.Debug(fmt.Sprintf("failed to read sysfs node file %s: %s", path, err))
	}
	return res
}

// parseGFXTargetVersion decodes a "gfx_target_version <digits>" line. The
// value is read as decimal digits: the last two are the patch, the two before
// them the minor, and whatever precedes those the major. It returns false,
// after logging at debug level, when the line does not have that shape.
func parseGFXTargetVersion(line string) (Version, bool) {
	fields := strings.Fields(line)
	// At least five digits are needed: one or more for major, two each for
	// minor and patch.
	if len(fields) != 2 || len(fields[1]) < 5 {
		slog.Debug("malformed " + line)
		return Version{}, false
	}
	digits := fields[1]
	l := len(digits)
	patch, err1 := strconv.ParseUint(digits[l-2:l], 10, 32)
	minor, err2 := strconv.ParseUint(digits[l-4:l-2], 10, 32)
	major, err3 := strconv.ParseUint(digits[:l-4], 10, 32)
	if err1 != nil || err2 != nil || err3 != nil {
		slog.Debug("malformed int " + line)
		return Version{}, false
	}
	return Version{
		Major: uint(major),
		Minor: uint(minor),
		Patch: uint(patch),
	}, true
}
// ToGFXString renders the version as an AMD gfx target name: the literal
// "gfx", the major in decimal, then the minor and patch in lowercase
// hexadecimal. For example {9, 0, 0} yields "gfx900", {10, 3, 0} yields
// "gfx1030", and {9, 0, 10} yields "gfx90a".
func (v Version) ToGFXString() string {
	// Minor and patch are hex digits in gfx target names; formatting them as
	// decimal would turn a patch of 10 into "10" instead of "a".
	return fmt.Sprintf("gfx%d%x%x", v.Major, v.Minor, v.Patch)
}

View file

@ -149,43 +149,63 @@ func GetGPUInfo() GpuInfo {
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor)) slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
} }
} }
} else if gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") { } else if AMDDetected() && gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
C.rocm_check_vram(*gpuHandles.rocm, &memInfo) ver, err := AMDDriverVersion()
if memInfo.err != nil { if err == nil {
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))) slog.Info("AMD Driver: " + ver)
C.free(unsafe.Pointer(memInfo.err)) }
} else if memInfo.igpu_index >= 0 && memInfo.count == 1 { gfx := AMDGFXVersions()
// Only one GPU detected and it appears to be an integrated GPU - skip it tooOld := false
slog.Info("ROCm unsupported integrated GPU detected") for _, v := range gfx {
} else if memInfo.count > 0 { if v.Major < 9 {
if memInfo.igpu_index >= 0 { slog.Info("AMD GPU too old, falling back to CPU " + v.ToGFXString())
// We have multiple GPUs reported, and one of them is an integrated GPU tooOld = true
// so we have to set the env var to bypass it break
// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it }
val := os.Getenv("ROCR_VISIBLE_DEVICES")
if val == "" { // TODO - remap gfx strings for unsupported minor/patch versions to supported for the same major
devices := []string{} // e.g. gfx1034 works if we map it to gfx1030 at runtime
for i := 0; i < int(memInfo.count); i++ {
if i == int(memInfo.igpu_index) { }
continue if !tooOld {
// TODO - this algo can be shifted over to use sysfs instead of the rocm info library...
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
// Only one GPU detected and it appears to be an integrated GPU - skip it
slog.Info("ROCm unsupported integrated GPU detected")
} else if memInfo.count > 0 {
if memInfo.igpu_index >= 0 {
// We have multiple GPUs reported, and one of them is an integrated GPU
// so we have to set the env var to bypass it
// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
val := os.Getenv("ROCR_VISIBLE_DEVICES")
if val == "" {
devices := []string{}
for i := 0; i < int(memInfo.count); i++ {
if i == int(memInfo.igpu_index) {
continue
}
devices = append(devices, strconv.Itoa(i))
} }
devices = append(devices, strconv.Itoa(i)) val = strings.Join(devices, ",")
os.Setenv("ROCR_VISIBLE_DEVICES", val)
} }
val = strings.Join(devices, ",") slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
os.Setenv("ROCR_VISIBLE_DEVICES", val)
} }
slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val)) resp.Library = "rocm"
var version C.rocm_version_resp_t
C.rocm_get_version(*gpuHandles.rocm, &version)
verString := C.GoString(version.str)
if version.status == 0 {
resp.Variant = "v" + verString
} else {
slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
}
C.free(unsafe.Pointer(version.str))
} }
resp.Library = "rocm"
var version C.rocm_version_resp_t
C.rocm_get_version(*gpuHandles.rocm, &version)
verString := C.GoString(version.str)
if version.status == 0 {
resp.Variant = "v" + verString
} else {
slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
}
C.free(unsafe.Pointer(version.str))
} }
} }
if resp.Library == "" { if resp.Library == "" {

View file

@ -16,3 +16,9 @@ type GpuInfo struct {
// TODO add other useful attributes about the card here for discovery information // TODO add other useful attributes about the card here for discovery information
} }
// Version is a three-part version number made of unsigned major, minor and
// patch components.
type Version struct {
	Major, Minor, Patch uint
}

View file

@ -21,7 +21,6 @@ amdGPUs() {
return return
fi fi
GPU_LIST=( GPU_LIST=(
"gfx803"
"gfx900" "gfx900"
"gfx906:xnack-" "gfx906:xnack-"
"gfx908:xnack-" "gfx908:xnack-"

View file

@ -90,6 +90,7 @@ func getDynLibs(gpuInfo gpu.GpuInfo) []string {
if len(dynLibs) == 0 { if len(dynLibs) == 0 {
dynLibs = []string{availableDynLibs["cpu"]} dynLibs = []string{availableDynLibs["cpu"]}
} }
slog.Debug(fmt.Sprintf("ordered list of LLM libraries to try %v", dynLibs))
return dynLibs return dynLibs
} }