diff --git a/discover/amd_common.go b/discover/amd_common.go index bf969240..3c630861 100644 --- a/discover/amd_common.go +++ b/discover/amd_common.go @@ -37,19 +37,6 @@ func GetSupportedGFX(libDir string) ([]string, error) { return ret, nil } -func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { - ids := []string{} - for _, info := range gpuInfo { - if info.Library != "rocm" { - // TODO shouldn't happen if things are wired correctly... - slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library) - continue - } - ids = append(ids, info.ID) - } - return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",") -} - func commonAMDValidateLibDir() (string, error) { // Favor our bundled version diff --git a/discover/amd_hip_windows.go b/discover/amd_hip_windows.go index 12a7af0b..bf19ef06 100644 --- a/discover/amd_hip_windows.go +++ b/discover/amd_hip_windows.go @@ -64,7 +64,7 @@ func NewHipLib() (*HipLib, error) { return hl, nil } -// The hip library only evaluates the HIP_VISIBLE_DEVICES variable at startup +// The hip library only evaluates the ROCR_VISIBLE_DEVICES variable at startup // so we have to unload/reset the library after we do our initial discovery // to make sure our updates to that variable are processed by llama.cpp func (hl *HipLib) Release() { diff --git a/discover/amd_linux.go b/discover/amd_linux.go index dd8e605c..fad7b7a6 100644 --- a/discover/amd_linux.go +++ b/discover/amd_linux.go @@ -64,16 +64,13 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { // Determine if the user has already pre-selected which GPUs to look at, then ignore the others var visibleDevices []string hipVD := envconfig.HipVisibleDevices() // zero based index only - rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID, but consumer cards seem to not support UUID + rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID gpuDO := envconfig.GpuDeviceOrdinal() // zero based index switch { - // TODO is this priorty order right? - case hipVD != "": - visibleDevices = strings.Split(hipVD, ",") case rocrVD != "": visibleDevices = strings.Split(rocrVD, ",") - // TODO - since we don't yet support UUIDs, consider detecting and reporting here - // all our test systems show GPU-XX indicating UUID is not supported + case hipVD != "": + visibleDevices = strings.Split(hipVD, ",") case gpuDO != "": visibleDevices = strings.Split(gpuDO, ",") } @@ -99,7 +96,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { } return a < b }) - cpuCount := 0 + gpuCount := 0 for _, match := range matches { slog.Debug("evaluating amdgpu node " + match) fp, err := os.Open(match) @@ -108,11 +105,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { continue } defer fp.Close() - nodeID, err := strconv.Atoi(filepath.Base(filepath.Dir(match))) - if err != nil { - slog.Debug("failed to parse node ID", "error", err) - continue - } scanner := bufio.NewScanner(fp) isCPU := false @@ -186,20 +178,19 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { // do reliably report VRAM usage. if isCPU { - cpuCount++ continue } - // CPUs are always first in the list - gpuID := nodeID - cpuCount - - // Shouldn't happen, but just in case... - if gpuID < 0 { - err := fmt.Errorf("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue") - slog.Error(err.Error()) - return nil, err + // Skip over any GPUs that are masked + if major == 0 && minor == 0 && patch == 0 { + slog.Debug("skipping gpu with gfx000") + continue } + // Keep track of numeric IDs based on valid GPUs + gpuID := gpuCount + gpuCount += 1 + // Look up the memory for the current node totalMemory := uint64(0) usedMemory := uint64(0) @@ -273,6 +264,14 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { name = fmt.Sprintf("%04x:%04x", vendor, device) } + // Favor UUIDs if available to reduce possibility of getting the numeric IDs wrong + var ID string + if uniqueID != 0 { + ID = fmt.Sprintf("GPU-%016x", uniqueID) + } else { + ID = strconv.Itoa(gpuID) + } + gpuInfo := RocmGPUInfo{ GpuInfo: GpuInfo{ Library: "rocm", @@ -280,7 +279,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { TotalMemory: totalMemory, FreeMemory: (totalMemory - usedMemory), }, - ID: strconv.Itoa(gpuID), + ID: ID, Name: name, Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch), MinimumMemory: rocmMinimumMemory, @@ -288,6 +287,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { DriverMinor: driverMinor, }, usedFilepath: usedFile, + index: gpuID, } // iGPU detection, remove this check once we can support an iGPU variant of the rocm library @@ -319,7 +319,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { if len(visibleDevices) > 0 { include := false for _, visible := range visibleDevices { - if visible == gpuInfo.ID { + if visible == gpuInfo.ID || visible == strconv.Itoa(gpuInfo.index) { include = true break } @@ -516,3 +516,20 @@ func verifyKFDDriverAccess() error { fd.Close() return nil } + +func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { + ids := []string{} + for _, info := range gpuInfo { + if info.Library != "rocm" { + // TODO shouldn't happen if things are wired correctly... + slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library) + continue + } + ids = append(ids, info.ID) + } + // There are 3 potential env vars to use to select GPUs. + // ROCR_VISIBLE_DEVICES supports UUID or numeric so is our preferred on linux + // GPU_DEVICE_ORDINAL supports numeric IDs only + // HIP_VISIBLE_DEVICES supports numeric IDs only + return "ROCR_VISIBLE_DEVICES", strings.Join(ids, ",") +} diff --git a/discover/amd_windows.go b/discover/amd_windows.go index a3a6e0c3..b0c76f1e 100644 --- a/discover/amd_windows.go +++ b/discover/amd_windows.go @@ -43,7 +43,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { slog.Debug("error looking up amd driver version", "error", err) } - // Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified + // Note: the HIP library automatically handles subsetting to any *_VISIBLE_DEVICES the user specified count := hl.HipGetDeviceCount() if count == 0 { err := fmt.Errorf("no compatible amdgpu devices detected") @@ -201,3 +201,20 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error { } return nil } + +func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { + ids := []string{} + for _, info := range gpuInfo { + if info.Library != "rocm" { + // TODO shouldn't happen if things are wired correctly... + slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library) + continue + } + ids = append(ids, info.ID) + } + // There are 3 potential env vars to use to select GPUs. + // ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows + // HIP_VISIBLE_DEVICES supports numeric IDs only + // GPU_DEVICE_ORDINAL supports numeric IDs only + return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",") +} diff --git a/docs/gpu.md b/docs/gpu.md index 2913a2e2..2de5bd24 100644 --- a/docs/gpu.md +++ b/docs/gpu.md @@ -74,6 +74,10 @@ would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the server. If you have an unsupported AMD GPU you can experiment using the list of supported types below. +If you have multiple GPUs with different GFX versions, append the numeric device +number to the environment variable to set them individually. For example, +`HSA_OVERRIDE_GFX_VERSION_0=10.3.0` and `HSA_OVERRIDE_GFX_VERSION_1=11.0.0` + At this time, the known supported GPU types on linux are the following LLVM Targets. This table shows some example GPUs that map to these LLVM targets: | **LLVM Target** | **An Example GPU** | @@ -99,9 +103,10 @@ Reach out on [Discord](https://discord.gg/ollama) or file an ### GPU Selection If you have multiple AMD GPUs in your system and want to limit Ollama to use a -subset, you can set `HIP_VISIBLE_DEVICES` to a comma separated list of GPUs. +subset, you can set `ROCR_VISIBLE_DEVICES` to a comma separated list of GPUs. You can see the list of devices with `rocminfo`. If you want to ignore the GPUs -and force CPU usage, use an invalid GPU ID (e.g., "-1") +and force CPU usage, use an invalid GPU ID (e.g., "-1"). When available, use the +`Uuid` to uniquely identify the device instead of numeric value. ### Container Permission diff --git a/envconfig/config.go b/envconfig/config.go index db2a5625..e80c67ba 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -265,9 +265,9 @@ func AsMap() map[string]EnvVar { if runtime.GOOS != "darwin" { ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"} - ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"} - ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible"} - ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible"} + ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible by numeric ID"} + ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible by UUID or numeric ID"} + ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible by numeric ID"} ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"} ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"} }