Merge pull request #2465 from dhiltgen/block_rocm_pre_9
Detect AMD GPU info via sysfs and block old cards
This commit is contained in:
commit
76b8728f0c
5 changed files with 151 additions and 34 deletions
91
gpu/amd.go
Normal file
91
gpu/amd.go
Normal file
|
@ -0,0 +1,91 @@
|
|||
package gpu
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// TODO - windows vs. non-windows vs darwin
|
||||
|
||||
// Discovery logic for AMD/ROCm GPUs
|
||||
|
||||
const (
|
||||
DriverVersionFile = "/sys/module/amdgpu/version"
|
||||
GPUPropertiesFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/properties"
|
||||
// TODO probably break these down per GPU to make the logic simpler
|
||||
GPUTotalMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties" // size_in_bytes line
|
||||
GPUUsedMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/used_memory"
|
||||
)
|
||||
|
||||
func AMDDetected() bool {
|
||||
_, err := AMDDriverVersion()
|
||||
return err == nil
|
||||
}
|
||||
|
||||
func AMDDriverVersion() (string, error) {
|
||||
_, err := os.Stat(DriverVersionFile)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
fp, err := os.Open(DriverVersionFile)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer fp.Close()
|
||||
verString, err := io.ReadAll(fp)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return strings.TrimSpace(string(verString)), nil
|
||||
}
|
||||
|
||||
func AMDGFXVersions() []Version {
|
||||
res := []Version{}
|
||||
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
|
||||
for _, match := range matches {
|
||||
fp, err := os.Open(match)
|
||||
if err != nil {
|
||||
slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err))
|
||||
continue
|
||||
}
|
||||
defer fp.Close()
|
||||
|
||||
scanner := bufio.NewScanner(fp)
|
||||
// optionally, resize scanner's capacity for lines over 64K, see next example
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if strings.HasPrefix(line, "gfx_target_version") {
|
||||
ver := strings.Fields(line)
|
||||
if len(ver) != 2 || len(ver[1]) < 5 {
|
||||
slog.Debug("malformed " + line)
|
||||
continue
|
||||
}
|
||||
l := len(ver[1])
|
||||
patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
|
||||
minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
|
||||
major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
|
||||
if err1 != nil || err2 != nil || err3 != nil {
|
||||
slog.Debug("malformed int " + line)
|
||||
continue
|
||||
}
|
||||
|
||||
res = append(res, Version{
|
||||
Major: uint(major),
|
||||
Minor: uint(minor),
|
||||
Patch: uint(patch),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
func (v Version) ToGFXString() string {
|
||||
return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch)
|
||||
}
|
22
gpu/gpu.go
22
gpu/gpu.go
|
@ -149,7 +149,26 @@ func GetGPUInfo() GpuInfo {
|
|||
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
|
||||
}
|
||||
}
|
||||
} else if gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
|
||||
} else if AMDDetected() && gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
|
||||
ver, err := AMDDriverVersion()
|
||||
if err == nil {
|
||||
slog.Info("AMD Driver: " + ver)
|
||||
}
|
||||
gfx := AMDGFXVersions()
|
||||
tooOld := false
|
||||
for _, v := range gfx {
|
||||
if v.Major < 9 {
|
||||
slog.Info("AMD GPU too old, falling back to CPU " + v.ToGFXString())
|
||||
tooOld = true
|
||||
break
|
||||
}
|
||||
|
||||
// TODO - remap gfx strings for unsupporetd minor/patch versions to supported for the same major
|
||||
// e.g. gfx1034 works if we map it to gfx1030 at runtime
|
||||
|
||||
}
|
||||
if !tooOld {
|
||||
// TODO - this algo can be shifted over to use sysfs instead of the rocm info library...
|
||||
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
|
||||
if memInfo.err != nil {
|
||||
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
|
||||
|
@ -188,6 +207,7 @@ func GetGPUInfo() GpuInfo {
|
|||
C.free(unsafe.Pointer(version.str))
|
||||
}
|
||||
}
|
||||
}
|
||||
if resp.Library == "" {
|
||||
C.cpu_check_ram(&memInfo)
|
||||
resp.Library = "cpu"
|
||||
|
|
|
@ -16,3 +16,9 @@ type GpuInfo struct {
|
|||
|
||||
// TODO add other useful attributes about the card here for discovery information
|
||||
}
|
||||
|
||||
type Version struct {
|
||||
Major uint
|
||||
Minor uint
|
||||
Patch uint
|
||||
}
|
||||
|
|
|
@ -21,7 +21,6 @@ amdGPUs() {
|
|||
return
|
||||
fi
|
||||
GPU_LIST=(
|
||||
"gfx803"
|
||||
"gfx900"
|
||||
"gfx906:xnack-"
|
||||
"gfx908:xnack-"
|
||||
|
|
|
@ -90,6 +90,7 @@ func getDynLibs(gpuInfo gpu.GpuInfo) []string {
|
|||
if len(dynLibs) == 0 {
|
||||
dynLibs = []string{availableDynLibs["cpu"]}
|
||||
}
|
||||
slog.Debug(fmt.Sprintf("ordered list of LLM libraries to try %v", dynLibs))
|
||||
return dynLibs
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue