Merge pull request #2465 from dhiltgen/block_rocm_pre_9
Detect AMD GPU info via sysfs and block old cards
This commit is contained in:
commit
76b8728f0c
5 changed files with 151 additions and 34 deletions
91
gpu/amd.go
Normal file
91
gpu/amd.go
Normal file
|
@ -0,0 +1,91 @@
|
||||||
|
package gpu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TODO - windows vs. non-windows vs darwin
|
||||||
|
|
||||||
|
// Discovery logic for AMD/ROCm GPUs
|
||||||
|
|
||||||
|
// Sysfs locations consulted during AMD GPU discovery.
const (
	// DriverVersionFile is the sysfs file AMDDriverVersion reads to obtain
	// the amdgpu driver version string.
	DriverVersionFile = "/sys/module/amdgpu/version"

	// GPUPropertiesFileGlob matches the per-node properties files that
	// AMDGFXVersions scans for gfx_target_version entries.
	GPUPropertiesFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/properties"

	// TODO probably break these down per GPU to make the logic simpler

	// GPUTotalMemoryFileGlob matches the per-memory-bank properties files;
	// the line of interest in them is size_in_bytes.
	GPUTotalMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties" // size_in_bytes line

	// GPUUsedMemoryFileGlob matches the per-memory-bank used_memory files.
	GPUUsedMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/used_memory"
)
|
||||||
|
|
||||||
|
func AMDDetected() bool {
|
||||||
|
_, err := AMDDriverVersion()
|
||||||
|
return err == nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func AMDDriverVersion() (string, error) {
|
||||||
|
_, err := os.Stat(DriverVersionFile)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
fp, err := os.Open(DriverVersionFile)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
defer fp.Close()
|
||||||
|
verString, err := io.ReadAll(fp)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(string(verString)), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func AMDGFXVersions() []Version {
|
||||||
|
res := []Version{}
|
||||||
|
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
|
||||||
|
for _, match := range matches {
|
||||||
|
fp, err := os.Open(match)
|
||||||
|
if err != nil {
|
||||||
|
slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", match, err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
defer fp.Close()
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(fp)
|
||||||
|
// optionally, resize scanner's capacity for lines over 64K, see next example
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := strings.TrimSpace(scanner.Text())
|
||||||
|
if strings.HasPrefix(line, "gfx_target_version") {
|
||||||
|
ver := strings.Fields(line)
|
||||||
|
if len(ver) != 2 || len(ver[1]) < 5 {
|
||||||
|
slog.Debug("malformed " + line)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
l := len(ver[1])
|
||||||
|
patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
|
||||||
|
minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
|
||||||
|
major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
|
||||||
|
if err1 != nil || err2 != nil || err3 != nil {
|
||||||
|
slog.Debug("malformed int " + line)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
res = append(res, Version{
|
||||||
|
Major: uint(major),
|
||||||
|
Minor: uint(minor),
|
||||||
|
Patch: uint(patch),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return res
|
||||||
|
}
|
||||||
|
|
||||||
|
func (v Version) ToGFXString() string {
|
||||||
|
return fmt.Sprintf("gfx%d%d%d", v.Major, v.Minor, v.Patch)
|
||||||
|
}
|
22
gpu/gpu.go
22
gpu/gpu.go
|
@ -149,7 +149,26 @@ func GetGPUInfo() GpuInfo {
|
||||||
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
|
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
|
} else if AMDDetected() && gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
|
||||||
|
ver, err := AMDDriverVersion()
|
||||||
|
if err == nil {
|
||||||
|
slog.Info("AMD Driver: " + ver)
|
||||||
|
}
|
||||||
|
gfx := AMDGFXVersions()
|
||||||
|
tooOld := false
|
||||||
|
for _, v := range gfx {
|
||||||
|
if v.Major < 9 {
|
||||||
|
slog.Info("AMD GPU too old, falling back to CPU " + v.ToGFXString())
|
||||||
|
tooOld = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO - remap gfx strings for unsupported minor/patch versions to supported for the same major
|
||||||
|
// e.g. gfx1034 works if we map it to gfx1030 at runtime
|
||||||
|
|
||||||
|
}
|
||||||
|
if !tooOld {
|
||||||
|
// TODO - this algo can be shifted over to use sysfs instead of the rocm info library...
|
||||||
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
|
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
|
||||||
if memInfo.err != nil {
|
if memInfo.err != nil {
|
||||||
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
|
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
|
||||||
|
@ -188,6 +207,7 @@ func GetGPUInfo() GpuInfo {
|
||||||
C.free(unsafe.Pointer(version.str))
|
C.free(unsafe.Pointer(version.str))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
if resp.Library == "" {
|
if resp.Library == "" {
|
||||||
C.cpu_check_ram(&memInfo)
|
C.cpu_check_ram(&memInfo)
|
||||||
resp.Library = "cpu"
|
resp.Library = "cpu"
|
||||||
|
|
|
@ -16,3 +16,9 @@ type GpuInfo struct {
|
||||||
|
|
||||||
// TODO add other useful attributes about the card here for discovery information
|
// TODO add other useful attributes about the card here for discovery information
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Version is a three-component version number made of unsigned major, minor
// and patch parts.
type Version struct {
	Major, Minor, Patch uint
}
|
||||||
|
|
|
@ -21,7 +21,6 @@ amdGPUs() {
|
||||||
return
|
return
|
||||||
fi
|
fi
|
||||||
GPU_LIST=(
|
GPU_LIST=(
|
||||||
"gfx803"
|
|
||||||
"gfx900"
|
"gfx900"
|
||||||
"gfx906:xnack-"
|
"gfx906:xnack-"
|
||||||
"gfx908:xnack-"
|
"gfx908:xnack-"
|
||||||
|
|
|
@ -90,6 +90,7 @@ func getDynLibs(gpuInfo gpu.GpuInfo) []string {
|
||||||
if len(dynLibs) == 0 {
|
if len(dynLibs) == 0 {
|
||||||
dynLibs = []string{availableDynLibs["cpu"]}
|
dynLibs = []string{availableDynLibs["cpu"]}
|
||||||
}
|
}
|
||||||
|
slog.Debug(fmt.Sprintf("ordered list of LLM libraries to try %v", dynLibs))
|
||||||
return dynLibs
|
return dynLibs
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue