Ignore AMD integrated GPUs
Detect and ignore integrated GPUs reported by ROCm.
This commit is contained in:
parent
197e420a97
commit
9d7b5d6c91
3 changed files with 35 additions and 3 deletions
26
gpu/gpu.go
26
gpu/gpu.go
|
@ -16,6 +16,7 @@ import (
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"runtime"
|
"runtime"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
|
@ -147,7 +148,28 @@ func GetGPUInfo() GpuInfo {
|
||||||
if memInfo.err != nil {
|
if memInfo.err != nil {
|
||||||
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
|
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
|
||||||
C.free(unsafe.Pointer(memInfo.err))
|
C.free(unsafe.Pointer(memInfo.err))
|
||||||
|
} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
|
||||||
|
// Only one GPU detected and it appears to be an integrated GPU - skip it
|
||||||
|
slog.Info("ROCm unsupported integrated GPU detected")
|
||||||
} else {
|
} else {
|
||||||
|
if memInfo.igpu_index >= 0 {
|
||||||
|
// We have multiple GPUs reported, and one of them is an integrated GPU
|
||||||
|
// so we have to set the env var to bypass it
|
||||||
|
// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
|
||||||
|
val := os.Getenv("ROCR_VISIBLE_DEVICES")
|
||||||
|
if val == "" {
|
||||||
|
devices := []string{}
|
||||||
|
for i := 0; i < int(memInfo.count); i++ {
|
||||||
|
if i == int(memInfo.igpu_index) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
devices = append(devices, strconv.Itoa(i))
|
||||||
|
}
|
||||||
|
val = strings.Join(devices, ",")
|
||||||
|
os.Setenv("ROCR_VISIBLE_DEVICES", val)
|
||||||
|
}
|
||||||
|
slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
|
||||||
|
}
|
||||||
resp.Library = "rocm"
|
resp.Library = "rocm"
|
||||||
var version C.rocm_version_resp_t
|
var version C.rocm_version_resp_t
|
||||||
C.rocm_get_version(*gpuHandles.rocm, &version)
|
C.rocm_get_version(*gpuHandles.rocm, &version)
|
||||||
|
@ -199,7 +221,9 @@ func CheckVRAM() (int64, error) {
|
||||||
if overhead < gpus*1024*1024*1024 {
|
if overhead < gpus*1024*1024*1024 {
|
||||||
overhead = gpus * 1024 * 1024 * 1024
|
overhead = gpus * 1024 * 1024 * 1024
|
||||||
}
|
}
|
||||||
return int64(gpuInfo.FreeMemory - overhead), nil
|
avail := int64(gpuInfo.FreeMemory - overhead)
|
||||||
|
slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
|
||||||
|
return avail, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination
|
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination
|
||||||
|
|
|
@ -42,6 +42,7 @@ typedef struct mem_info {
|
||||||
uint64_t total;
|
uint64_t total;
|
||||||
uint64_t free;
|
uint64_t free;
|
||||||
unsigned int count;
|
unsigned int count;
|
||||||
|
int igpu_index; // If >= 0, we detected an integrated GPU to ignore
|
||||||
char *err; // If non-NULL, caller responsible for freeing
|
char *err; // If non-NULL, caller responsible for freeing
|
||||||
} mem_info_t;
|
} mem_info_t;
|
||||||
|
|
||||||
|
|
|
@ -77,6 +77,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
|
||||||
|
|
||||||
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
|
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
|
||||||
resp->err = NULL;
|
resp->err = NULL;
|
||||||
|
resp->igpu_index = -1;
|
||||||
uint64_t totalMem = 0;
|
uint64_t totalMem = 0;
|
||||||
uint64_t usedMem = 0;
|
uint64_t usedMem = 0;
|
||||||
rsmi_status_t ret;
|
rsmi_status_t ret;
|
||||||
|
@ -162,8 +163,14 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
|
||||||
}
|
}
|
||||||
LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
|
LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
|
||||||
LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
|
LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
|
||||||
resp->total += totalMem;
|
if (totalMem < 1024 * 1024 * 1024) {
|
||||||
resp->free += totalMem - usedMem;
|
// Do not add up integrated GPU memory capacity, it's a bogus 512M, and actually uses system memory
|
||||||
|
LOG(h.verbose, "[%d] ROCm integrated GPU\n", i);
|
||||||
|
resp->igpu_index = i;
|
||||||
|
} else {
|
||||||
|
resp->total += totalMem;
|
||||||
|
resp->free += totalMem - usedMem;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue