2024-10-16 17:45:00 -07:00
|
|
|
package discover
|
2024-02-15 17:15:09 -08:00
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
2024-08-01 14:52:15 -07:00
|
|
|
"errors"
|
2024-10-14 16:26:45 -07:00
|
|
|
"fmt"
|
2024-02-15 17:15:09 -08:00
|
|
|
"log/slog"
|
|
|
|
"os"
|
|
|
|
"path/filepath"
|
|
|
|
"slices"
|
2024-06-05 12:07:20 -07:00
|
|
|
"strconv"
|
2024-02-15 17:15:09 -08:00
|
|
|
"strings"
|
2024-03-30 09:50:05 -07:00
|
|
|
|
2024-05-08 11:11:50 -07:00
|
|
|
"github.com/ollama/ollama/envconfig"
|
2024-03-30 09:50:05 -07:00
|
|
|
"github.com/ollama/ollama/format"
|
2024-02-15 17:15:09 -08:00
|
|
|
)
|
|
|
|
|
|
|
|
// iGPUName is the exact device name used to recognise integrated Radeon GPUs.
//
// TODO We're looking for this exact name to detect iGPUs since
// hipGetDeviceProperties never reports integrated==true
const iGPUName = "AMD Radeon(TM) Graphics"
|
|
|
|
|
|
|
|
// ROCmLibGlobs is used to validate if the given ROCm lib is usable.
// Note: this is not sufficient to discern v5 vs v6.
var ROCmLibGlobs = []string{
	"hipblas.dll",
	"rocblas",
}

// RocmStandardLocations lists the standard ROCm install directories.
// TODO glob?
var RocmStandardLocations = []string{
	"C:\\Program Files\\AMD\\ROCm\\6.1\\bin",
}
|
|
|
|
|
2024-10-14 16:26:45 -07:00
|
|
|
// Only called once during bootstrap
|
|
|
|
func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
2024-05-15 15:13:16 -07:00
|
|
|
resp := []RocmGPUInfo{}
|
2024-02-15 17:15:09 -08:00
|
|
|
hl, err := NewHipLib()
|
|
|
|
if err != nil {
|
|
|
|
slog.Debug(err.Error())
|
2024-10-14 16:26:45 -07:00
|
|
|
return nil, err
|
2024-02-15 17:15:09 -08:00
|
|
|
}
|
|
|
|
defer hl.Release()
|
|
|
|
|
2024-06-18 16:22:47 -07:00
|
|
|
driverMajor, driverMinor, err := hl.AMDDriverVersion()
|
|
|
|
if err != nil {
|
|
|
|
// For now this is benign, but we may eventually need to fail compatibility checks
|
|
|
|
slog.Debug("error looking up amd driver version", "error", err)
|
|
|
|
}
|
2024-02-15 17:15:09 -08:00
|
|
|
|
2024-10-26 14:04:14 -07:00
|
|
|
// Note: the HIP library automatically handles subsetting to any *_VISIBLE_DEVICES the user specified
|
2024-02-15 17:15:09 -08:00
|
|
|
count := hl.HipGetDeviceCount()
|
|
|
|
if count == 0 {
|
2024-10-14 16:26:45 -07:00
|
|
|
err := fmt.Errorf("no compatible amdgpu devices detected")
|
|
|
|
slog.Info(err.Error())
|
|
|
|
return nil, err
|
2024-02-15 17:15:09 -08:00
|
|
|
}
|
|
|
|
libDir, err := AMDValidateLibDir()
|
|
|
|
if err != nil {
|
2024-10-14 16:26:45 -07:00
|
|
|
err = fmt.Errorf("unable to verify rocm library: %w", err)
|
|
|
|
slog.Warn(err.Error())
|
|
|
|
return nil, err
|
2024-02-15 17:15:09 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
var supported []string
|
2024-07-03 19:30:19 -07:00
|
|
|
gfxOverride := envconfig.HsaOverrideGfxVersion()
|
2024-02-15 17:15:09 -08:00
|
|
|
if gfxOverride == "" {
|
|
|
|
supported, err = GetSupportedGFX(libDir)
|
|
|
|
if err != nil {
|
2024-10-14 16:26:45 -07:00
|
|
|
err = fmt.Errorf("failed to lookup supported GFX types: %w", err)
|
|
|
|
slog.Warn(err.Error())
|
|
|
|
return nil, err
|
2024-02-15 17:15:09 -08:00
|
|
|
}
|
|
|
|
} else {
|
2024-05-07 14:54:26 -07:00
|
|
|
slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
|
2024-02-15 17:15:09 -08:00
|
|
|
}
|
|
|
|
|
2024-05-07 14:54:26 -07:00
|
|
|
slog.Debug("detected hip devices", "count", count)
|
2024-03-30 09:50:05 -07:00
|
|
|
// TODO how to determine the underlying device ID when visible devices is causing this to subset?
|
2024-05-22 09:26:45 -07:00
|
|
|
for i := range count {
|
2024-02-15 17:15:09 -08:00
|
|
|
err = hl.HipSetDevice(i)
|
|
|
|
if err != nil {
|
2024-03-30 09:50:05 -07:00
|
|
|
slog.Warn("set device", "id", i, "error", err)
|
2024-02-15 17:15:09 -08:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
props, err := hl.HipGetDeviceProperties(i)
|
|
|
|
if err != nil {
|
2024-03-30 09:50:05 -07:00
|
|
|
slog.Warn("get properties", "id", i, "error", err)
|
2024-02-15 17:15:09 -08:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
n := bytes.IndexByte(props.Name[:], 0)
|
|
|
|
name := string(props.Name[:n])
|
2024-03-30 09:50:05 -07:00
|
|
|
// TODO is UUID actually populated on windows?
|
|
|
|
// Can luid be used on windows for setting visible devices (and is it actually set?)
|
2024-02-15 17:15:09 -08:00
|
|
|
n = bytes.IndexByte(props.GcnArchName[:], 0)
|
|
|
|
gfx := string(props.GcnArchName[:n])
|
2024-05-07 14:54:26 -07:00
|
|
|
slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
|
2024-08-01 14:52:15 -07:00
|
|
|
// slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
|
2024-02-15 17:15:09 -08:00
|
|
|
// TODO Why isn't props.iGPU accurate!?
|
|
|
|
|
2024-03-30 09:50:05 -07:00
|
|
|
freeMemory, totalMemory, err := hl.HipMemGetInfo()
|
2024-02-15 17:15:09 -08:00
|
|
|
if err != nil {
|
2024-03-30 09:50:05 -07:00
|
|
|
slog.Warn("get mem info", "id", i, "error", err)
|
2024-02-15 17:15:09 -08:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2024-05-15 15:13:16 -07:00
|
|
|
gpuInfo := RocmGPUInfo{
|
|
|
|
GpuInfo: GpuInfo{
|
|
|
|
Library: "rocm",
|
|
|
|
memInfo: memInfo{
|
|
|
|
TotalMemory: totalMemory,
|
|
|
|
FreeMemory: freeMemory,
|
|
|
|
},
|
2024-06-19 13:35:38 -07:00
|
|
|
// Free memory reporting on Windows is not reliable until we bump to ROCm v6.2
|
|
|
|
UnreliableFreeMemory: true,
|
|
|
|
|
2024-06-05 12:07:20 -07:00
|
|
|
ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
|
2024-05-15 15:13:16 -07:00
|
|
|
DependencyPath: libDir,
|
|
|
|
MinimumMemory: rocmMinimumMemory,
|
|
|
|
Name: name,
|
|
|
|
Compute: gfx,
|
2024-06-18 16:22:47 -07:00
|
|
|
DriverMajor: driverMajor,
|
|
|
|
DriverMinor: driverMinor,
|
2024-03-30 09:50:05 -07:00
|
|
|
},
|
2024-05-15 15:13:16 -07:00
|
|
|
index: i,
|
2024-03-30 09:50:05 -07:00
|
|
|
}
|
|
|
|
|
2024-10-14 16:26:45 -07:00
|
|
|
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
|
|
|
|
if strings.EqualFold(name, iGPUName) || totalMemory < IGPUMemLimit {
|
|
|
|
reason := "unsupported Radeon iGPU detected skipping"
|
|
|
|
slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
|
|
|
|
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
|
|
|
|
GpuInfo: gpuInfo.GpuInfo,
|
|
|
|
Reason: reason,
|
|
|
|
})
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Strip off Target Features when comparing
|
|
|
|
if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
|
|
|
|
reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported)
|
|
|
|
slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir)
|
|
|
|
unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
|
|
|
|
GpuInfo: gpuInfo.GpuInfo,
|
|
|
|
Reason: reason,
|
|
|
|
})
|
|
|
|
// HSA_OVERRIDE_GFX_VERSION not supported on windows
|
|
|
|
continue
|
|
|
|
} else {
|
|
|
|
slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
|
|
|
|
}
|
|
|
|
|
|
|
|
slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
|
|
|
|
slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
|
|
|
|
|
2024-03-30 09:50:05 -07:00
|
|
|
resp = append(resp, gpuInfo)
|
2024-02-15 17:15:09 -08:00
|
|
|
}
|
2024-03-30 09:50:05 -07:00
|
|
|
|
2024-10-14 16:26:45 -07:00
|
|
|
return resp, nil
|
2024-02-15 17:15:09 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
func AMDValidateLibDir() (string, error) {
|
2024-03-30 09:50:05 -07:00
|
|
|
libDir, err := commonAMDValidateLibDir()
|
2024-02-15 17:15:09 -08:00
|
|
|
if err == nil {
|
2024-03-30 09:50:05 -07:00
|
|
|
return libDir, nil
|
2024-02-15 17:15:09 -08:00
|
|
|
}
|
|
|
|
|
2024-03-08 09:45:55 -08:00
|
|
|
// Installer payload (if we're running from some other location)
|
|
|
|
localAppData := os.Getenv("LOCALAPPDATA")
|
|
|
|
appDir := filepath.Join(localAppData, "Programs", "Ollama")
|
2024-08-27 16:19:00 -07:00
|
|
|
rocmTargetDir := filepath.Join(appDir, envconfig.LibRelativeToExe(), "lib", "ollama")
|
2024-02-15 17:15:09 -08:00
|
|
|
if rocmLibUsable(rocmTargetDir) {
|
2024-03-08 09:45:55 -08:00
|
|
|
slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
|
2024-02-15 17:15:09 -08:00
|
|
|
return rocmTargetDir, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
|
2024-03-08 09:45:55 -08:00
|
|
|
slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
|
2024-08-01 14:52:15 -07:00
|
|
|
return "", errors.New("no suitable rocm found, falling back to CPU")
|
2024-02-15 17:15:09 -08:00
|
|
|
}
|
2024-05-15 15:13:16 -07:00
|
|
|
|
|
|
|
func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
|
|
|
|
if len(gpus) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
hl, err := NewHipLib()
|
|
|
|
if err != nil {
|
|
|
|
slog.Debug(err.Error())
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
defer hl.Release()
|
|
|
|
|
|
|
|
for i := range gpus {
|
|
|
|
err := hl.HipSetDevice(gpus[i].index)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
freeMemory, _, err := hl.HipMemGetInfo()
|
|
|
|
if err != nil {
|
|
|
|
slog.Warn("get mem info", "id", i, "error", err)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(freeMemory))
|
|
|
|
gpus[i].FreeMemory = freeMemory
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
2024-10-26 14:04:14 -07:00
|
|
|
|
|
|
|
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
|
|
|
ids := []string{}
|
|
|
|
for _, info := range gpuInfo {
|
|
|
|
if info.Library != "rocm" {
|
|
|
|
// TODO shouldn't happen if things are wired correctly...
|
|
|
|
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
ids = append(ids, info.ID)
|
|
|
|
}
|
|
|
|
// There are 3 potential env vars to use to select GPUs.
|
|
|
|
// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
|
|
|
|
// HIP_VISIBLE_DEVICES supports numeric IDs only
|
|
|
|
// GPU_DEVICE_ORDINAL supports numeric IDs only
|
|
|
|
return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
|
|
|
|
}
|