From f3c8b898cde83ed5977f227fca66665846de7955 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 14 Oct 2024 16:26:45 -0700 Subject: [PATCH] Track GPU discovery failure information (#5820) * Expose GPU discovery failure information * Remove exposed API for now --- gpu/amd_linux.go | 92 ++++++++++++++++++++-------- gpu/amd_windows.go | 73 +++++++++++++---------- gpu/gpu.go | 145 ++++++++++++++++++++++++++++++++++----------- gpu/gpu_darwin.go | 12 ++++ gpu/types.go | 12 ++++ 5 files changed, 242 insertions(+), 92 deletions(-) diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go index d3f5b9fc..72dfb4db 100644 --- a/gpu/amd_linux.go +++ b/gpu/amd_linux.go @@ -47,10 +47,11 @@ var ( ) // Gather GPU information from the amdgpu driver if any supported GPUs are detected -func AMDGetGPUInfo() []RocmGPUInfo { +// Only called once during bootstrap +func AMDGetGPUInfo() ([]RocmGPUInfo, error) { resp := []RocmGPUInfo{} if !AMDDetected() { - return resp + return resp, fmt.Errorf("AMD GPUs not detected") } // Opportunistic logging of driver version to aid in troubleshooting @@ -194,13 +195,9 @@ func AMDGetGPUInfo() []RocmGPUInfo { // Shouldn't happen, but just in case... 
if gpuID < 0 { - slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue") - return nil - } - - if int(major) < RocmComputeMin { - slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch), "gpu", gpuID) - continue + err := fmt.Errorf("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue") + slog.Error(err.Error()) + return nil, err } // Look up the memory for the current node @@ -270,19 +267,12 @@ func AMDGetGPUInfo() []RocmGPUInfo { break } - // iGPU detection, remove this check once we can support an iGPU variant of the rocm library - if totalMemory < IGPUMemLimit { - slog.Info("unsupported Radeon iGPU detected skipping", "id", gpuID, "total", format.HumanBytes2(totalMemory)) - continue - } var name string // TODO - PCI ID lookup if vendor > 0 && device > 0 { name = fmt.Sprintf("%04x:%04x", vendor, device) } - slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory)) - slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory)) gpuInfo := RocmGPUInfo{ GpuInfo: GpuInfo{ Library: "rocm", @@ -300,6 +290,31 @@ func AMDGetGPUInfo() []RocmGPUInfo { usedFilepath: usedFile, } + // iGPU detection, remove this check once we can support an iGPU variant of the rocm library + if totalMemory < IGPUMemLimit { + reason := "unsupported Radeon iGPU detected skipping" + slog.Info(reason, "id", gpuID, "total", format.HumanBytes2(totalMemory)) + unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{ + GpuInfo: gpuInfo.GpuInfo, + Reason: reason, + }) + continue + } + + if int(major) < RocmComputeMin { + reason := fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch) + slog.Warn(reason, "gpu", gpuID) + unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{ + GpuInfo: gpuInfo.GpuInfo, + Reason: reason, + }) + + continue + } + + slog.Debug("amdgpu memory", "gpu", gpuID, "total", 
format.HumanBytes2(totalMemory)) + slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory)) + // If the user wants to filter to a subset of devices, filter out if we aren't a match if len(visibleDevices) > 0 { include := false @@ -310,7 +325,13 @@ func AMDGetGPUInfo() []RocmGPUInfo { } } if !include { - slog.Info("filtering out device per user request", "id", gpuInfo.ID, "visible_devices", visibleDevices) + reason := "filtering out device per user request" + slog.Info(reason, "id", gpuInfo.ID, "visible_devices", visibleDevices) + unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{ + GpuInfo: gpuInfo.GpuInfo, + Reason: reason, + }) + continue } } @@ -320,8 +341,13 @@ func AMDGetGPUInfo() []RocmGPUInfo { if libDir == "" { libDir, err = AMDValidateLibDir() if err != nil { - slog.Warn("unable to verify rocm library, will use cpu", "error", err) - return nil + err = fmt.Errorf("unable to verify rocm library: %w", err) + slog.Warn(err.Error()) + unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{ + GpuInfo: gpuInfo.GpuInfo, + Reason: err.Error(), + }) + return nil, err } } gpuInfo.DependencyPath = libDir @@ -331,14 +357,25 @@ func AMDGetGPUInfo() []RocmGPUInfo { if len(supported) == 0 { supported, err = GetSupportedGFX(libDir) if err != nil { - slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err) - return nil + err = fmt.Errorf("failed to lookup supported GFX types: %w", err) + slog.Warn(err.Error()) + unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{ + GpuInfo: gpuInfo.GpuInfo, + Reason: err.Error(), + }) + return nil, err } slog.Debug("rocm supported GPUs", "types", supported) } gfx := gpuInfo.Compute if !slices.Contains[[]string, string](supported, gfx) { - slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported) + reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported) 
+ slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir) + unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{ + GpuInfo: gpuInfo.GpuInfo, + Reason: reason, + }) + // TODO - consider discrete markdown just for ROCM troubleshooting? slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/gpu.md#overrides for HSA_OVERRIDE_GFX_VERSION usage") continue @@ -358,13 +395,16 @@ func AMDGetGPUInfo() []RocmGPUInfo { resp = append(resp, gpuInfo) } if len(resp) == 0 { - slog.Info("no compatible amdgpu devices detected") + err := fmt.Errorf("no compatible amdgpu devices detected") + slog.Info(err.Error()) + return nil, err } if err := verifyKFDDriverAccess(); err != nil { - slog.Error("amdgpu devices detected but permission problems block access", "error", err) - return nil + err = fmt.Errorf("amdgpu devices detected but permission problems block access: %w", err) + slog.Error(err.Error()) + return nil, err } - return resp + return resp, nil } // Quick check for AMD driver so we can skip amdgpu discovery if not present diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go index ef6bf830..4da6b7cc 100644 --- a/gpu/amd_windows.go +++ b/gpu/amd_windows.go @@ -3,6 +3,7 @@ package gpu import ( "bytes" "errors" + "fmt" "log/slog" "os" "path/filepath" @@ -26,12 +27,13 @@ var ( RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"} // TODO glob? 
) -func AMDGetGPUInfo() []RocmGPUInfo { +// Only called once during bootstrap +func AMDGetGPUInfo() ([]RocmGPUInfo, error) { resp := []RocmGPUInfo{} hl, err := NewHipLib() if err != nil { slog.Debug(err.Error()) - return nil + return nil, err } defer hl.Release() @@ -44,12 +46,15 @@ func AMDGetGPUInfo() []RocmGPUInfo { // Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified count := hl.HipGetDeviceCount() if count == 0 { - return nil + err := fmt.Errorf("no compatible amdgpu devices detected") + slog.Info(err.Error()) + return nil, err } libDir, err := AMDValidateLibDir() if err != nil { - slog.Warn("unable to verify rocm library, will use cpu", "error", err) - return nil + err = fmt.Errorf("unable to verify rocm library: %w", err) + slog.Warn(err.Error()) + return nil, err } var supported []string @@ -57,8 +62,9 @@ func AMDGetGPUInfo() []RocmGPUInfo { if gfxOverride == "" { supported, err = GetSupportedGFX(libDir) if err != nil { - slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err) - return nil + err = fmt.Errorf("failed to lookup supported GFX types: %w", err) + slog.Warn(err.Error()) + return nil, err } } else { slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride) @@ -87,21 +93,6 @@ func AMDGetGPUInfo() []RocmGPUInfo { slog.Debug("hip device", "id", i, "name", name, "gfx", gfx) // slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0 // TODO Why isn't props.iGPU accurate!? 
- if strings.EqualFold(name, iGPUName) { - slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx) - continue - } - if gfxOverride == "" { - // Strip off Target Features when comparing - if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) { - slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported) - // TODO - consider discrete markdown just for ROCM troubleshooting? - slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage") - continue - } else { - slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx) - } - } freeMemory, totalMemory, err := hl.HipMemGetInfo() if err != nil { @@ -109,14 +100,6 @@ func AMDGetGPUInfo() []RocmGPUInfo { continue } - // iGPU detection, remove this check once we can support an iGPU variant of the rocm library - if totalMemory < IGPUMemLimit { - slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory)) - continue - } - - slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory)) - slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory)) gpuInfo := RocmGPUInfo{ GpuInfo: GpuInfo{ Library: "rocm", @@ -138,10 +121,38 @@ func AMDGetGPUInfo() []RocmGPUInfo { index: i, } + // iGPU detection, remove this check once we can support an iGPU variant of the rocm library + if strings.EqualFold(name, iGPUName) || totalMemory < IGPUMemLimit { + reason := "unsupported Radeon iGPU detected skipping" + slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory)) + unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{ + GpuInfo: gpuInfo.GpuInfo, + Reason: reason, + }) + continue + } + + // Strip off Target Features when comparing + if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) { + reason := fmt.Sprintf("amdgpu is not supported 
(supported types:%s)", supported) + slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir) + unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{ + GpuInfo: gpuInfo.GpuInfo, + Reason: reason, + }) + // HSA_OVERRIDE_GFX_VERSION not supported on windows + continue + } else { + slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx) + } + + slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory)) + slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory)) + resp = append(resp, gpuInfo) } - return resp + return resp, nil } func AMDValidateLibDir() (string, error) { diff --git a/gpu/gpu.go b/gpu/gpu.go index db0e247b..a5c265e1 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -54,6 +54,13 @@ var ( nvmlLibPath string rocmGPUs []RocmGPUInfo oneapiGPUs []OneapiGPUInfo + + // If any discovered GPUs are incompatible, report why + unsupportedGPUs []UnsupportedGPUInfo + + // Keep track of errors during bootstrapping so that if GPUs are missing + // when they are expected to be present this may explain why + bootstrapErrors []error ) // With our current CUDA compile flags, older than 5.0 will not work properly @@ -70,16 +77,17 @@ func initCudaHandles() *cudaHandles { cHandles := &cudaHandles{} // Short Circuit if we already know which library to use + // ignore bootstrap errors in this case since we already recorded them if nvmlLibPath != "" { - cHandles.nvml, _ = LoadNVMLMgmt([]string{nvmlLibPath}) + cHandles.nvml, _, _ = loadNVMLMgmt([]string{nvmlLibPath}) return cHandles } if nvcudaLibPath != "" { - cHandles.deviceCount, cHandles.nvcuda, _ = LoadNVCUDAMgmt([]string{nvcudaLibPath}) + cHandles.deviceCount, cHandles.nvcuda, _, _ = loadNVCUDAMgmt([]string{nvcudaLibPath}) return cHandles } if cudartLibPath != "" { - cHandles.deviceCount, cHandles.cudart, _ = LoadCUDARTMgmt([]string{cudartLibPath}) + cHandles.deviceCount, cHandles.cudart, _, _ = loadCUDARTMgmt([]string{cudartLibPath}) return cHandles } @@ -102,18
+110,21 @@ func initCudaHandles() *cudaHandles { if len(NvmlGlobs) > 0 { nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs) if len(nvmlLibPaths) > 0 { - nvml, libPath := LoadNVMLMgmt(nvmlLibPaths) + nvml, libPath, err := loadNVMLMgmt(nvmlLibPaths) if nvml != nil { slog.Debug("nvidia-ml loaded", "library", libPath) cHandles.nvml = nvml nvmlLibPath = libPath } + if err != nil { + bootstrapErrors = append(bootstrapErrors, err) + } } } nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns) if len(nvcudaLibPaths) > 0 { - deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths) + deviceCount, nvcuda, libPath, err := loadNVCUDAMgmt(nvcudaLibPaths) if nvcuda != nil { slog.Debug("detected GPUs", "count", deviceCount, "library", libPath) cHandles.nvcuda = nvcuda @@ -121,11 +132,14 @@ func initCudaHandles() *cudaHandles { nvcudaLibPath = libPath return cHandles } + if err != nil { + bootstrapErrors = append(bootstrapErrors, err) + } } cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns) if len(cudartLibPaths) > 0 { - deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths) + deviceCount, cudart, libPath, err := loadCUDARTMgmt(cudartLibPaths) if cudart != nil { slog.Debug("detected GPUs", "library", libPath, "count", deviceCount) cHandles.cudart = cudart @@ -133,6 +147,9 @@ func initCudaHandles() *cudaHandles { cudartLibPath = libPath return cHandles } + if err != nil { + bootstrapErrors = append(bootstrapErrors, err) + } } return cHandles @@ -143,14 +160,19 @@ func initOneAPIHandles() *oneapiHandles { oHandles := &oneapiHandles{} // Short Circuit if we already know which library to use + // ignore bootstrap errors in this case since we already recorded them if oneapiLibPath != "" { - oHandles.deviceCount, oHandles.oneapi, _ = LoadOneapiMgmt([]string{oneapiLibPath}) + oHandles.deviceCount, oHandles.oneapi, _, _ = loadOneapiMgmt([]string{oneapiLibPath}) return oHandles } oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs) if 
len(oneapiLibPaths) > 0 { - oHandles.deviceCount, oHandles.oneapi, oneapiLibPath = LoadOneapiMgmt(oneapiLibPaths) + var err error + oHandles.deviceCount, oHandles.oneapi, oneapiLibPath, err = loadOneapiMgmt(oneapiLibPaths) + if err != nil { + bootstrapErrors = append(bootstrapErrors, err) + } } return oHandles @@ -197,6 +219,7 @@ func GetGPUInfo() GpuInfoList { if !bootstrapped { slog.Info("looking for compatible GPUs") + bootstrapErrors = []error{} needRefresh = false cpuCapability = GetCPUCapability() var memInfo C.mem_info_t @@ -221,7 +244,9 @@ func GetGPUInfo() GpuInfoList { // Fallback to CPU mode if we're lacking required vector extensions on x86 if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" { - slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability, "detected", cpuCapability) + err := fmt.Errorf("CPU does not have minimum vector extensions, GPU inference disabled. Required:%s Detected:%s", GPURunnerCPUCapability, cpuCapability) + slog.Warn(err.Error()) + bootstrapErrors = append(bootstrapErrors, err) bootstrapped = true // No need to do any GPU discovery, since we can't run on them return GpuInfoList{cpus[0].GpuInfo} @@ -253,10 +278,6 @@ func GetGPUInfo() GpuInfoList { C.free(unsafe.Pointer(memInfo.err)) continue } - if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) { - slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. 
Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor)) - continue - } gpuInfo.TotalMemory = uint64(memInfo.total) gpuInfo.FreeMemory = uint64(memInfo.free) gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) @@ -279,6 +300,15 @@ func GetGPUInfo() GpuInfoList { gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) gpuInfo.Variant = variant + if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) { + unsupportedGPUs = append(unsupportedGPUs, + UnsupportedGPUInfo{ + GpuInfo: gpuInfo.GpuInfo, + }) + slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor)) + continue + } + // query the management library as well so we can record any skew between the two // which represents overhead on the GPU we must set aside on subsequent updates if cHandles.nvml != nil { @@ -341,7 +371,10 @@ func GetGPUInfo() GpuInfoList { } } - rocmGPUs = AMDGetGPUInfo() + rocmGPUs, err = AMDGetGPUInfo() + if err != nil { + bootstrapErrors = append(bootstrapErrors, err) + } bootstrapped = true if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 { slog.Info("no compatible GPUs were discovered") @@ -526,92 +559,114 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string { return gpuLibPaths } -func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) { +// Bootstrap the runtime library +// Returns: num devices, handle, libPath, error +func loadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string, error) { var resp C.cudart_init_resp_t resp.ch.verbose = getVerboseState() + var err error for _, libPath := range cudartLibPaths { lib := C.CString(libPath) defer C.free(unsafe.Pointer(lib)) C.cudart_init(lib, &resp) if resp.err != nil { - slog.Debug("Unable to load cudart", "library", libPath, "error", C.GoString(resp.err)) + err = fmt.Errorf("Unable to load cudart library %s: %s", libPath, C.GoString(resp.err)) + 
slog.Debug(err.Error()) C.free(unsafe.Pointer(resp.err)) } else { - return int(resp.num_devices), &resp.ch, libPath + err = nil + return int(resp.num_devices), &resp.ch, libPath, err } } - return 0, nil, "" + return 0, nil, "", err } -func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) { +// Bootstrap the driver library +// Returns: num devices, handle, libPath, error +func loadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string, error) { var resp C.nvcuda_init_resp_t resp.ch.verbose = getVerboseState() + var err error for _, libPath := range nvcudaLibPaths { lib := C.CString(libPath) defer C.free(unsafe.Pointer(lib)) C.nvcuda_init(lib, &resp) if resp.err != nil { // Decide what log level based on the type of error message to help users understand why - msg := C.GoString(resp.err) switch resp.cudaErr { case C.CUDA_ERROR_INSUFFICIENT_DRIVER, C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: - slog.Warn("version mismatch between driver and cuda driver library - reboot or upgrade may be required", "library", libPath, "error", msg) + err = fmt.Errorf("version mismatch between driver and cuda driver library - reboot or upgrade may be required: library %s", libPath) + slog.Warn(err.Error()) case C.CUDA_ERROR_NO_DEVICE: - slog.Info("no nvidia devices detected", "library", libPath) + err = fmt.Errorf("no nvidia devices detected by library %s", libPath) + slog.Info(err.Error()) case C.CUDA_ERROR_UNKNOWN: - slog.Warn("unknown error initializing cuda driver library", "library", libPath, "error", msg) - slog.Warn("see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information") + err = fmt.Errorf("unknown error initializing cuda driver library %s: %s. 
see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information", libPath, C.GoString(resp.err)) + slog.Warn(err.Error()) default: + msg := C.GoString(resp.err) if strings.Contains(msg, "wrong ELF class") { slog.Debug("skipping 32bit library", "library", libPath) } else { - slog.Info("unable to load cuda driver library", "library", libPath, "error", msg) + err = fmt.Errorf("Unable to load cudart library %s: %s", libPath, C.GoString(resp.err)) + slog.Info(err.Error()) } } C.free(unsafe.Pointer(resp.err)) } else { - return int(resp.num_devices), &resp.ch, libPath + err = nil + return int(resp.num_devices), &resp.ch, libPath, err } } - return 0, nil, "" + return 0, nil, "", err } -func LoadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string) { +// Bootstrap the management library +// Returns: handle, libPath, error +func loadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string, error) { var resp C.nvml_init_resp_t resp.ch.verbose = getVerboseState() + var err error for _, libPath := range nvmlLibPaths { lib := C.CString(libPath) defer C.free(unsafe.Pointer(lib)) C.nvml_init(lib, &resp) if resp.err != nil { - slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err))) + err = fmt.Errorf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)) + slog.Info(err.Error()) C.free(unsafe.Pointer(resp.err)) } else { - return &resp.ch, libPath + err = nil + return &resp.ch, libPath, err } } - return nil, "" + return nil, "", err } -func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) { +// bootstrap the Intel GPU library +// Returns: num devices, handle, libPath, error +func loadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string, error) { var resp C.oneapi_init_resp_t num_devices := 0 resp.oh.verbose = getVerboseState() + var err error for _, libPath := range oneapiLibPaths { lib := C.CString(libPath) defer 
C.free(unsafe.Pointer(lib)) C.oneapi_init(lib, &resp) if resp.err != nil { - slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err)) + err = fmt.Errorf("Unable to load oneAPI management library %s: %s", libPath, C.GoString(resp.err)) + slog.Debug(err.Error()) C.free(unsafe.Pointer(resp.err)) } else { + err = nil for i := range resp.oh.num_drivers { num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i))) } - return num_devices, &resp.oh, libPath + return num_devices, &resp.oh, libPath, err } } - return 0, nil, "" + return 0, nil, "", err } func getVerboseState() C.uint16_t { @@ -669,3 +724,23 @@ func LibraryDir() string { slog.Warn("unable to locate gpu dependency libraries") return "" } + +func GetSystemInfo() SystemInfo { + gpus := GetGPUInfo() + gpuMutex.Lock() + defer gpuMutex.Unlock() + discoveryErrors := []string{} + for _, err := range bootstrapErrors { + discoveryErrors = append(discoveryErrors, err.Error()) + } + if len(gpus) == 1 && gpus[0].Library == "cpu" { + gpus = []GpuInfo{} + } + + return SystemInfo{ + System: cpus[0], + GPUs: gpus, + UnsupportedGPUs: unsupportedGPUs, + DiscoveryErrors: discoveryErrors, + } +} diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index 417b48df..c8623bcf 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -66,3 +66,15 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) { // No-op on darwin return "", "" } + +func GetSystemInfo() SystemInfo { + mem, _ := GetCPUMem() + return SystemInfo{ + System: CPUInfo{ + GpuInfo: GpuInfo{ + memInfo: mem, + }, + }, + GPUs: GetGPUInfo(), + } +} diff --git a/gpu/types.go b/gpu/types.go index a30e5fb3..0a038ecd 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -76,6 +76,11 @@ type OneapiGPUInfoList []OneapiGPUInfo type GpuInfoList []GpuInfo +type UnsupportedGPUInfo struct { + GpuInfo + Reason string `json:"reason"` +} + // Split up the set of gpu info's by Library and variant func (l GpuInfoList) ByLibrary() 
[]GpuInfoList { resp := []GpuInfoList{} @@ -146,3 +151,10 @@ func (c CPUCapability) String() string { return "no vector extensions" } } + +type SystemInfo struct { + System CPUInfo `json:"system"` + GPUs []GpuInfo `json:"gpus"` + UnsupportedGPUs []UnsupportedGPUInfo `json:"unsupported_gpus"` + DiscoveryErrors []string `json:"discovery_errors"` +}