diff --git a/gpu/gpu.go b/gpu/gpu.go index fb120ea5..743b27d1 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -16,6 +16,7 @@ import ( "os" "path/filepath" "runtime" + "strconv" "strings" "sync" "unsafe" @@ -147,7 +148,28 @@ func GetGPUInfo() GpuInfo { if memInfo.err != nil { slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))) C.free(unsafe.Pointer(memInfo.err)) + } else if memInfo.igpu_index >= 0 && memInfo.count == 1 { + // Only one GPU detected and it appears to be an integrated GPU - skip it + slog.Info("ROCm unsupported integrated GPU detected") } else { + if memInfo.igpu_index >= 0 { + // We have multiple GPUs reported, and one of them is an integrated GPU + // so we have to set the env var to bypass it + // If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it + val := os.Getenv("ROCR_VISIBLE_DEVICES") + if val == "" { + devices := []string{} + for i := 0; i < int(memInfo.count); i++ { + if i == int(memInfo.igpu_index) { + continue + } + devices = append(devices, strconv.Itoa(i)) + } + val = strings.Join(devices, ",") + os.Setenv("ROCR_VISIBLE_DEVICES", val) + } + slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val)) + } resp.Library = "rocm" var version C.rocm_version_resp_t C.rocm_get_version(*gpuHandles.rocm, &version) @@ -199,7 +221,9 @@ func CheckVRAM() (int64, error) { if overhead < gpus*1024*1024*1024 { overhead = gpus * 1024 * 1024 * 1024 } - return int64(gpuInfo.FreeMemory - overhead), nil + avail := int64(gpuInfo.FreeMemory - overhead) + slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024)) + return avail, nil } return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation diff --git a/gpu/gpu_info.h b/gpu/gpu_info.h index f32efa8e..e52d2066 100644 --- a/gpu/gpu_info.h +++ b/gpu/gpu_info.h @@ -42,6 +42,7 @@ typedef struct mem_info { uint64_t total; uint64_t free; unsigned int count; + int igpu_index; // If >= 0, we detected an integrated GPU to ignore char *err; // If non-nill, caller responsible for freeing } mem_info_t; diff --git a/gpu/gpu_info_rocm.c b/gpu/gpu_info_rocm.c index 59ab0817..2d1db7bb 100644 --- a/gpu/gpu_info_rocm.c +++ b/gpu/gpu_info_rocm.c @@ -77,6 +77,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) { resp->err = NULL; + resp->igpu_index = -1; uint64_t totalMem = 0; uint64_t usedMem = 0; rsmi_status_t ret; @@ -162,8 +163,14 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) { } LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem); LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem); - resp->total += totalMem; - resp->free += totalMem - usedMem; + if (totalMem < 1024 * 1024 * 1024) { + // Do not add up integrated GPU memory capacity, it's a bogus 512M, and actually uses system memory + LOG(h.verbose, "[%d] ROCm integrated GPU\n", i); + resp->igpu_index = i; + } else { + resp->total += totalMem; + resp->free += totalMem - usedMem; + } } }