Detect AMD GPU info via sysfs and block old cards
This wires up new logic that uses sysfs to discover AMD GPU information and detects old cards we can't yet support, so that we can fall back to CPU mode.
This commit is contained in:
parent
1c8435ffa9
commit
6d84f07505
5 changed files with 151 additions and 34 deletions
91
gpu/amd.go
Normal file
91
gpu/amd.go
Normal file
|
@ -0,0 +1,91 @@
|
||||||
|
package gpu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TODO - windows vs. non-windows vs darwin
|
||||||
|
|
||||||
|
// Discovery logic for AMD/ROCm GPUs
|
||||||
|
|
||||||
|
// Sysfs locations consulted during AMD GPU discovery.
const (
	// kfdNodesDir is the directory holding one sub-directory per KFD
	// topology node; the globs below are all rooted here.
	kfdNodesDir = "/sys/class/kfd/kfd/topology/nodes"

	// DriverVersionFile is the version file exposed under the amdgpu
	// kernel module's sysfs directory.
	DriverVersionFile = "/sys/module/amdgpu/version"

	// GPUPropertiesFileGlob matches the properties file of every node.
	GPUPropertiesFileGlob = kfdNodesDir + "/*/properties"

	// TODO probably break these down per GPU to make the logic simpler

	// GPUTotalMemoryFileGlob matches the properties file of every memory
	// bank of every node (see its size_in_bytes line).
	GPUTotalMemoryFileGlob = kfdNodesDir + "/*/mem_banks/*/properties" // size_in_bytes line

	// GPUUsedMemoryFileGlob matches the used_memory file of every memory
	// bank of every node.
	GPUUsedMemoryFileGlob = kfdNodesDir + "/*/mem_banks/*/used_memory"
)
|
||||||
|
|
||||||
|
func AMDDetected() bool {
|
||||||
|
_, err := AMDDriverVersion()
|
||||||
|
return err == nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func AMDDriverVersion() (string, error) {
|
||||||
|
_, err := os.Stat(DriverVersionFile)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
fp, err := os.Open(DriverVersionFile)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
defer fp.Close()
|
||||||
|
verString, err := io.ReadAll(fp)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(string(verString)), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func AMDGFXVersions() []Version {
|
||||||
|
res := []Version{}
|
||||||
|
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
|
||||||
|
for _, match := range matches {
|
||||||
|
fp, err := os.Open(match)
|
||||||
|
if err != nil {
|
||||||
|
slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
defer fp.Close()
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(fp)
|
||||||
|
// optionally, resize scanner's capacity for lines over 64K, see next example
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := strings.TrimSpace(scanner.Text())
|
||||||
|
if strings.HasPrefix(line, "gfx_target_version") {
|
||||||
|
ver := strings.Fields(line)
|
||||||
|
if len(ver) != 2 || len(ver[1]) < 5 {
|
||||||
|
slog.Debug("malformed " + line)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
l := len(ver[1])
|
||||||
|
patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
|
||||||
|
minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
|
||||||
|
major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
|
||||||
|
if err1 != nil || err2 != nil || err3 != nil {
|
||||||
|
slog.Debug("malformed int " + line)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
res = append(res, Version{
|
||||||
|
Major: uint(major),
|
||||||
|
Minor: uint(minor),
|
||||||
|
Patch: uint(patch),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return res
|
||||||
|
}
|
||||||
|
|
||||||
|
func (v Version) ToGFXString() string {
|
||||||
|
return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch)
|
||||||
|
}
|
22
gpu/gpu.go
22
gpu/gpu.go
|
@ -149,7 +149,26 @@ func GetGPUInfo() GpuInfo {
|
||||||
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
|
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
|
} else if AMDDetected() && gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
|
||||||
|
ver, err := AMDDriverVersion()
|
||||||
|
if err == nil {
|
||||||
|
slog.Info("AMD Driver: " + ver)
|
||||||
|
}
|
||||||
|
gfx := AMDGFXVersions()
|
||||||
|
tooOld := false
|
||||||
|
for _, v := range gfx {
|
||||||
|
if v.Major < 9 {
|
||||||
|
slog.Info("AMD GPU too old, falling back to CPU " + v.ToGFXString())
|
||||||
|
tooOld = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO - remap gfx strings for unsupported minor/patch versions to a supported version with the same major
|
||||||
|
// e.g. gfx1034 works if we map it to gfx1030 at runtime
|
||||||
|
|
||||||
|
}
|
||||||
|
if !tooOld {
|
||||||
|
// TODO - this algo can be shifted over to use sysfs instead of the rocm info library...
|
||||||
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
|
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
|
||||||
if memInfo.err != nil {
|
if memInfo.err != nil {
|
||||||
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
|
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
|
||||||
|
@ -188,6 +207,7 @@ func GetGPUInfo() GpuInfo {
|
||||||
C.free(unsafe.Pointer(version.str))
|
C.free(unsafe.Pointer(version.str))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
if resp.Library == "" {
|
if resp.Library == "" {
|
||||||
C.cpu_check_ram(&memInfo)
|
C.cpu_check_ram(&memInfo)
|
||||||
resp.Library = "cpu"
|
resp.Library = "cpu"
|
||||||
|
|
|
@ -16,3 +16,9 @@ type GpuInfo struct {
|
||||||
|
|
||||||
// TODO add other useful attributes about the card here for discovery information
|
// TODO add other useful attributes about the card here for discovery information
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Version is a three-component major/minor/patch version number.
type Version struct {
	Major, Minor, Patch uint
}
|
||||||
|
|
|
@ -21,7 +21,6 @@ amdGPUs() {
|
||||||
return
|
return
|
||||||
fi
|
fi
|
||||||
GPU_LIST=(
|
GPU_LIST=(
|
||||||
"gfx803"
|
|
||||||
"gfx900"
|
"gfx900"
|
||||||
"gfx906:xnack-"
|
"gfx906:xnack-"
|
||||||
"gfx908:xnack-"
|
"gfx908:xnack-"
|
||||||
|
|
|
@ -90,6 +90,7 @@ func getDynLibs(gpuInfo gpu.GpuInfo) []string {
|
||||||
if len(dynLibs) == 0 {
|
if len(dynLibs) == 0 {
|
||||||
dynLibs = []string{availableDynLibs["cpu"]}
|
dynLibs = []string{availableDynLibs["cpu"]}
|
||||||
}
|
}
|
||||||
|
slog.Debug(fmt.Sprintf("ordered list of LLM libraries to try %v", dynLibs))
|
||||||
return dynLibs
|
return dynLibs
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue