d7c94e0ca6
* Better support for AMD multi-GPU This resolves a number of problems related to AMD multi-GPU setups on linux. The numeric IDs used by rocm are not the same as the numeric IDs exposed in sysfs although the ordering is consistent. We have to count up from the first valid gfx (major/minor/patch with non-zero values) we find starting at zero. There are 3 different env vars for selecting GPUs, and only ROCR_VISIBLE_DEVICES supports UUID based identification, so we should favor that one, and try to use UUIDs if detected to avoid potential ordering bugs with numeric IDs * ROCR_VISIBLE_DEVICES only works on linux Use the numeric ID only HIP_VISIBLE_DEVICES on windows
220 lines
6.7 KiB
Go
220 lines
6.7 KiB
Go
package discover
|
|
|
|
import (
|
|
"bytes"
|
|
"errors"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"path/filepath"
|
|
"slices"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/ollama/ollama/envconfig"
|
|
"github.com/ollama/ollama/format"
|
|
)
|
|
|
|
// iGPUName is the exact device name matched (case-insensitively) to detect
// integrated GPUs.
//
// TODO We're looking for this exact name to detect iGPUs since
// hipGetDeviceProperties never reports integrated==true
const iGPUName = "AMD Radeon(TM) Graphics"
|
|
|
|
// ROCmLibGlobs is used to validate if the given ROCm lib is usable.
// This is not sufficient to discern v5 vs v6.
var ROCmLibGlobs = []string{"hipblas.dll", "rocblas"}

// RocmStandardLocations lists the standard ROCm install directories.
// TODO glob?
var RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"}
|
|
|
|
// Only called once during bootstrap
|
|
func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
|
resp := []RocmGPUInfo{}
|
|
hl, err := NewHipLib()
|
|
if err != nil {
|
|
slog.Debug(err.Error())
|
|
return nil, err
|
|
}
|
|
defer hl.Release()
|
|
|
|
driverMajor, driverMinor, err := hl.AMDDriverVersion()
|
|
if err != nil {
|
|
// For now this is benign, but we may eventually need to fail compatibility checks
|
|
slog.Debug("error looking up amd driver version", "error", err)
|
|
}
|
|
|
|
// Note: the HIP library automatically handles subsetting to any *_VISIBLE_DEVICES the user specified
|
|
count := hl.HipGetDeviceCount()
|
|
if count == 0 {
|
|
err := fmt.Errorf("no compatible amdgpu devices detected")
|
|
slog.Info(err.Error())
|
|
return nil, err
|
|
}
|
|
libDir, err := AMDValidateLibDir()
|
|
if err != nil {
|
|
err = fmt.Errorf("unable to verify rocm library: %w", err)
|
|
slog.Warn(err.Error())
|
|
return nil, err
|
|
}
|
|
|
|
var supported []string
|
|
gfxOverride := envconfig.HsaOverrideGfxVersion()
|
|
if gfxOverride == "" {
|
|
supported, err = GetSupportedGFX(libDir)
|
|
if err != nil {
|
|
err = fmt.Errorf("failed to lookup supported GFX types: %w", err)
|
|
slog.Warn(err.Error())
|
|
return nil, err
|
|
}
|
|
} else {
|
|
slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
|
|
}
|
|
|
|
slog.Debug("detected hip devices", "count", count)
|
|
// TODO how to determine the underlying device ID when visible devices is causing this to subset?
|
|
for i := range count {
|
|
err = hl.HipSetDevice(i)
|
|
if err != nil {
|
|
slog.Warn("set device", "id", i, "error", err)
|
|
continue
|
|
}
|
|
|
|
props, err := hl.HipGetDeviceProperties(i)
|
|
if err != nil {
|
|
slog.Warn("get properties", "id", i, "error", err)
|
|
continue
|
|
}
|
|
n := bytes.IndexByte(props.Name[:], 0)
|
|
name := string(props.Name[:n])
|
|
// TODO is UUID actually populated on windows?
|
|
// Can luid be used on windows for setting visible devices (and is it actually set?)
|
|
n = bytes.IndexByte(props.GcnArchName[:], 0)
|
|
gfx := string(props.GcnArchName[:n])
|
|
slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
|
|
// slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
|
|
// TODO Why isn't props.iGPU accurate!?
|
|
|
|
freeMemory, totalMemory, err := hl.HipMemGetInfo()
|
|
if err != nil {
|
|
slog.Warn("get mem info", "id", i, "error", err)
|
|
continue
|
|
}
|
|
|
|
gpuInfo := RocmGPUInfo{
|
|
GpuInfo: GpuInfo{
|
|
Library: "rocm",
|
|
memInfo: memInfo{
|
|
TotalMemory: totalMemory,
|
|
FreeMemory: freeMemory,
|
|
},
|
|
// Free memory reporting on Windows is not reliable until we bump to ROCm v6.2
|
|
UnreliableFreeMemory: true,
|
|
|
|
ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
|
|
DependencyPath: libDir,
|
|
MinimumMemory: rocmMinimumMemory,
|
|
Name: name,
|
|
Compute: gfx,
|
|
DriverMajor: driverMajor,
|
|
DriverMinor: driverMinor,
|
|
},
|
|
index: i,
|
|
}
|
|
|
|
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
|
|
if strings.EqualFold(name, iGPUName) || totalMemory < IGPUMemLimit {
|
|
reason := "unsupported Radeon iGPU detected skipping"
|
|
slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
|
|
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
|
|
GpuInfo: gpuInfo.GpuInfo,
|
|
Reason: reason,
|
|
})
|
|
continue
|
|
}
|
|
|
|
// Strip off Target Features when comparing
|
|
if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
|
|
reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported)
|
|
slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir)
|
|
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
|
|
GpuInfo: gpuInfo.GpuInfo,
|
|
Reason: reason,
|
|
})
|
|
// HSA_OVERRIDE_GFX_VERSION not supported on windows
|
|
continue
|
|
} else {
|
|
slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
|
|
}
|
|
|
|
slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
|
|
slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
|
|
|
|
resp = append(resp, gpuInfo)
|
|
}
|
|
|
|
return resp, nil
|
|
}
|
|
|
|
func AMDValidateLibDir() (string, error) {
|
|
libDir, err := commonAMDValidateLibDir()
|
|
if err == nil {
|
|
return libDir, nil
|
|
}
|
|
|
|
// Installer payload (if we're running from some other location)
|
|
localAppData := os.Getenv("LOCALAPPDATA")
|
|
appDir := filepath.Join(localAppData, "Programs", "Ollama")
|
|
rocmTargetDir := filepath.Join(appDir, envconfig.LibRelativeToExe(), "lib", "ollama")
|
|
if rocmLibUsable(rocmTargetDir) {
|
|
slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
|
|
return rocmTargetDir, nil
|
|
}
|
|
|
|
// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
|
|
slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
|
|
return "", errors.New("no suitable rocm found, falling back to CPU")
|
|
}
|
|
|
|
func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
|
|
if len(gpus) == 0 {
|
|
return nil
|
|
}
|
|
hl, err := NewHipLib()
|
|
if err != nil {
|
|
slog.Debug(err.Error())
|
|
return nil
|
|
}
|
|
defer hl.Release()
|
|
|
|
for i := range gpus {
|
|
err := hl.HipSetDevice(gpus[i].index)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
freeMemory, _, err := hl.HipMemGetInfo()
|
|
if err != nil {
|
|
slog.Warn("get mem info", "id", i, "error", err)
|
|
continue
|
|
}
|
|
slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(freeMemory))
|
|
gpus[i].FreeMemory = freeMemory
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
|
ids := []string{}
|
|
for _, info := range gpuInfo {
|
|
if info.Library != "rocm" {
|
|
// TODO shouldn't happen if things are wired correctly...
|
|
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
|
|
continue
|
|
}
|
|
ids = append(ids, info.ID)
|
|
}
|
|
// There are 3 potential env vars to use to select GPUs.
|
|
// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
|
|
// HIP_VISIBLE_DEVICES supports numeric IDs only
|
|
// GPU_DEVICE_ORDINAL supports numeric IDs only
|
|
return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
|
|
}
|