Better support for AMD multi-GPU on linux (#7212)

* Better support for AMD multi-GPU

This resolves a number of problems related to AMD multi-GPU setups on linux.

The numeric IDs used by rocm are not the same as the numeric IDs exposed in
sysfs although the ordering is consistent.  We have to count up from the first
valid gfx (major/minor/patch with non-zero values) we find starting at zero.

There are 3 different env vars for selecting GPUs, and only ROCR_VISIBLE_DEVICES
supports UUID based identification, so we should favor that one, and try
to use UUIDs if detected to avoid potential ordering bugs with numeric IDs

* ROCR_VISIBLE_DEVICES only works on linux

Use the numeric ID only HIP_VISIBLE_DEVICES on windows
This commit is contained in:
Daniel Hiltgen 2024-10-26 14:04:14 -07:00 committed by GitHub
parent 35ec7f079f
commit d7c94e0ca6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 69 additions and 43 deletions

View file

@ -37,19 +37,6 @@ func GetSupportedGFX(libDir string) ([]string, error) {
return ret, nil return ret, nil
} }
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "rocm" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
continue
}
ids = append(ids, info.ID)
}
return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
}
func commonAMDValidateLibDir() (string, error) { func commonAMDValidateLibDir() (string, error) {
// Favor our bundled version // Favor our bundled version

View file

@ -64,7 +64,7 @@ func NewHipLib() (*HipLib, error) {
return hl, nil return hl, nil
} }
// The hip library only evaluates the HIP_VISIBLE_DEVICES variable at startup // The hip library only evaluates the ROCR_VISIBLE_DEVICES variable at startup
// so we have to unload/reset the library after we do our initial discovery // so we have to unload/reset the library after we do our initial discovery
// to make sure our updates to that variable are processed by llama.cpp // to make sure our updates to that variable are processed by llama.cpp
func (hl *HipLib) Release() { func (hl *HipLib) Release() {

View file

@ -64,16 +64,13 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
// Determine if the user has already pre-selected which GPUs to look at, then ignore the others // Determine if the user has already pre-selected which GPUs to look at, then ignore the others
var visibleDevices []string var visibleDevices []string
hipVD := envconfig.HipVisibleDevices() // zero based index only hipVD := envconfig.HipVisibleDevices() // zero based index only
rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID, but consumer cards seem to not support UUID rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID
gpuDO := envconfig.GpuDeviceOrdinal() // zero based index gpuDO := envconfig.GpuDeviceOrdinal() // zero based index
switch { switch {
// TODO is this priorty order right?
case hipVD != "":
visibleDevices = strings.Split(hipVD, ",")
case rocrVD != "": case rocrVD != "":
visibleDevices = strings.Split(rocrVD, ",") visibleDevices = strings.Split(rocrVD, ",")
// TODO - since we don't yet support UUIDs, consider detecting and reporting here case hipVD != "":
// all our test systems show GPU-XX indicating UUID is not supported visibleDevices = strings.Split(hipVD, ",")
case gpuDO != "": case gpuDO != "":
visibleDevices = strings.Split(gpuDO, ",") visibleDevices = strings.Split(gpuDO, ",")
} }
@ -99,7 +96,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
} }
return a < b return a < b
}) })
cpuCount := 0 gpuCount := 0
for _, match := range matches { for _, match := range matches {
slog.Debug("evaluating amdgpu node " + match) slog.Debug("evaluating amdgpu node " + match)
fp, err := os.Open(match) fp, err := os.Open(match)
@ -108,11 +105,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
continue continue
} }
defer fp.Close() defer fp.Close()
nodeID, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
if err != nil {
slog.Debug("failed to parse node ID", "error", err)
continue
}
scanner := bufio.NewScanner(fp) scanner := bufio.NewScanner(fp)
isCPU := false isCPU := false
@ -186,20 +178,19 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
// do reliably report VRAM usage. // do reliably report VRAM usage.
if isCPU { if isCPU {
cpuCount++
continue continue
} }
// CPUs are always first in the list // Skip over any GPUs that are masked
gpuID := nodeID - cpuCount if major == 0 && minor == 0 && patch == 0 {
slog.Debug("skipping gpu with gfx000")
// Shouldn't happen, but just in case... continue
if gpuID < 0 {
err := fmt.Errorf("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
slog.Error(err.Error())
return nil, err
} }
// Keep track of numeric IDs based on valid GPUs
gpuID := gpuCount
gpuCount += 1
// Look up the memory for the current node // Look up the memory for the current node
totalMemory := uint64(0) totalMemory := uint64(0)
usedMemory := uint64(0) usedMemory := uint64(0)
@ -273,6 +264,14 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
name = fmt.Sprintf("%04x:%04x", vendor, device) name = fmt.Sprintf("%04x:%04x", vendor, device)
} }
// Favor UUIDs if available to reduce possibility of getting the numeric IDs wrong
var ID string
if uniqueID != 0 {
ID = fmt.Sprintf("GPU-%016x", uniqueID)
} else {
ID = strconv.Itoa(gpuID)
}
gpuInfo := RocmGPUInfo{ gpuInfo := RocmGPUInfo{
GpuInfo: GpuInfo{ GpuInfo: GpuInfo{
Library: "rocm", Library: "rocm",
@ -280,7 +279,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
TotalMemory: totalMemory, TotalMemory: totalMemory,
FreeMemory: (totalMemory - usedMemory), FreeMemory: (totalMemory - usedMemory),
}, },
ID: strconv.Itoa(gpuID), ID: ID,
Name: name, Name: name,
Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch), Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
MinimumMemory: rocmMinimumMemory, MinimumMemory: rocmMinimumMemory,
@ -288,6 +287,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
DriverMinor: driverMinor, DriverMinor: driverMinor,
}, },
usedFilepath: usedFile, usedFilepath: usedFile,
index: gpuID,
} }
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library // iGPU detection, remove this check once we can support an iGPU variant of the rocm library
@ -319,7 +319,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
if len(visibleDevices) > 0 { if len(visibleDevices) > 0 {
include := false include := false
for _, visible := range visibleDevices { for _, visible := range visibleDevices {
if visible == gpuInfo.ID { if visible == gpuInfo.ID || visible == strconv.Itoa(gpuInfo.index) {
include = true include = true
break break
} }
@ -516,3 +516,20 @@ func verifyKFDDriverAccess() error {
fd.Close() fd.Close()
return nil return nil
} }
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "rocm" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
continue
}
ids = append(ids, info.ID)
}
// There are 3 potential env vars to use to select GPUs.
// ROCR_VISIBLE_DEVICES supports UUID or numeric so is our preferred on linux
// GPU_DEVICE_ORDINAL supports numeric IDs only
// HIP_VISIBLE_DEVICES supports numeric IDs only
return "ROCR_VISIBLE_DEVICES", strings.Join(ids, ",")
}

View file

@ -43,7 +43,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
slog.Debug("error looking up amd driver version", "error", err) slog.Debug("error looking up amd driver version", "error", err)
} }
// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified // Note: the HIP library automatically handles subsetting to any *_VISIBLE_DEVICES the user specified
count := hl.HipGetDeviceCount() count := hl.HipGetDeviceCount()
if count == 0 { if count == 0 {
err := fmt.Errorf("no compatible amdgpu devices detected") err := fmt.Errorf("no compatible amdgpu devices detected")
@ -201,3 +201,20 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
} }
return nil return nil
} }
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "rocm" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
continue
}
ids = append(ids, info.ID)
}
// There are 3 potential env vars to use to select GPUs.
// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
// HIP_VISIBLE_DEVICES supports numeric IDs only
// GPU_DEVICE_ORDINAL supports numeric IDs only
return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
}

View file

@ -74,6 +74,10 @@ would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the
server. If you have an unsupported AMD GPU you can experiment using the list of server. If you have an unsupported AMD GPU you can experiment using the list of
supported types below. supported types below.
If you have multiple GPUs with different GFX versions, append the numeric device
number to the environment variable to set them individually. For example,
`HSA_OVERRIDE_GFX_VERSION_0=10.3.0` and `HSA_OVERRIDE_GFX_VERSION_1=11.0.0`
At this time, the known supported GPU types on linux are the following LLVM Targets. At this time, the known supported GPU types on linux are the following LLVM Targets.
This table shows some example GPUs that map to these LLVM targets: This table shows some example GPUs that map to these LLVM targets:
| **LLVM Target** | **An Example GPU** | | **LLVM Target** | **An Example GPU** |
@ -99,9 +103,10 @@ Reach out on [Discord](https://discord.gg/ollama) or file an
### GPU Selection ### GPU Selection
If you have multiple AMD GPUs in your system and want to limit Ollama to use a If you have multiple AMD GPUs in your system and want to limit Ollama to use a
subset, you can set `HIP_VISIBLE_DEVICES` to a comma separated list of GPUs. subset, you can set `ROCR_VISIBLE_DEVICES` to a comma separated list of GPUs.
You can see the list of devices with `rocminfo`. If you want to ignore the GPUs You can see the list of devices with `rocminfo`. If you want to ignore the GPUs
and force CPU usage, use an invalid GPU ID (e.g., "-1") and force CPU usage, use an invalid GPU ID (e.g., "-1"). When available, use the
`Uuid` to uniquely identify the device instead of numeric value.
### Container Permission ### Container Permission

View file

@ -265,9 +265,9 @@ func AsMap() map[string]EnvVar {
if runtime.GOOS != "darwin" { if runtime.GOOS != "darwin" {
ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"} ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"} ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible by numeric ID"}
ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible"} ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible by UUID or numeric ID"}
ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible"} ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible by numeric ID"}
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"} ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"} ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
} }