Refine GPU discovery to bootstrap once

The GPU discovery routines are now called many times to refresh memory usage, so this change splits the one-time initial discovery (bootstrap) from the repeated free-memory updates.
2024-05-15 15:13:16 -07:00 · 2024-05-15 15:13:16 -07:00 · 43ed358f9a
commit 43ed358f9a
parent b32ebb4f29
9 changed files with 383 additions and 149 deletions
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@ -44,8 +44,8 @@ var (
 )

 // Gather GPU information from the amdgpu driver if any supported GPUs are detected
-func AMDGetGPUInfo() []GpuInfo {
-	resp := []GpuInfo{}
+func AMDGetGPUInfo() []RocmGPUInfo {
+	resp := []RocmGPUInfo{}
 	if !AMDDetected() {
 		return resp
 	}
@ -178,7 +178,7 @@ func AMDGetGPUInfo() []GpuInfo {
 		// Shouldn't happen, but just in case...
 		if gpuID < 0 {
 			slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
-			return []GpuInfo{}
+			return []RocmGPUInfo{}
 		}

 		if int(major) < RocmComputeMin {
@ -189,6 +189,7 @@ func AMDGetGPUInfo() []GpuInfo {
 		// Look up the memory for the current node
 		totalMemory := uint64(0)
 		usedMemory := uint64(0)
+		var usedFile string
 		mapping := []struct {
 			id       uint64
 			filename string
@ -255,22 +256,10 @@ func AMDGetGPUInfo() []GpuInfo {
 				break
 			}

-			usedFile := filepath.Join(devDir, DRMUsedMemoryFile)
-			usedFp, err := os.Open(usedFile)
+			usedFile = filepath.Join(devDir, DRMUsedMemoryFile)
+			usedMemory, err = getFreeMemory(usedFile)
 			if err != nil {
-				slog.Debug("failed to open sysfs node", "file", usedFile, "error", err)
-				break
-			}
-			defer totalFp.Close()
-			buf, err = io.ReadAll(usedFp)
-			if err != nil {
-				slog.Debug("failed to read sysfs node", "file", usedFile, "error", err)
-				break
-			}
-			usedMemory, err = strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
-			if err != nil {
-				slog.Debug("failed to parse sysfs node", "file", usedFile, "error", err)
-				break
+				slog.Debug("failed to update used memory", "error", err)
 			}
 			break
 		}
@ -288,18 +277,21 @@ func AMDGetGPUInfo() []GpuInfo {

 		slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
 		slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
-		gpuInfo := GpuInfo{
-			Library: "rocm",
-			memInfo: memInfo{
-				TotalMemory: totalMemory,
-				FreeMemory:  (totalMemory - usedMemory),
+		gpuInfo := RocmGPUInfo{
+			GpuInfo: GpuInfo{
+				Library: "rocm",
+				memInfo: memInfo{
+					TotalMemory: totalMemory,
+					FreeMemory:  (totalMemory - usedMemory),
+				},
+				ID:            fmt.Sprintf("%d", gpuID),
+				Name:          name,
+				Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
+				MinimumMemory: rocmMinimumMemory,
+				DriverMajor:   driverMajor,
+				DriverMinor:   driverMinor,
 			},
-			ID:            fmt.Sprintf("%d", gpuID),
-			Name:          name,
-			Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
-			MinimumMemory: rocmMinimumMemory,
-			DriverMajor:   driverMajor,
-			DriverMinor:   driverMinor,
+			usedFilepath: usedFile,
 		}

 		// If the user wants to filter to a subset of devices, filter out if we aren't a match
@ -323,7 +315,7 @@ func AMDGetGPUInfo() []GpuInfo {
 			libDir, err = AMDValidateLibDir()
 			if err != nil {
 				slog.Warn("unable to verify rocm library, will use cpu", "error", err)
-				return []GpuInfo{}
+				return []RocmGPUInfo{}
 			}
 		}
 		gpuInfo.DependencyPath = libDir
@ -334,7 +326,7 @@ func AMDGetGPUInfo() []GpuInfo {
 				supported, err = GetSupportedGFX(libDir)
 				if err != nil {
 					slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
-					return []GpuInfo{}
+					return []RocmGPUInfo{}
 				}
 				slog.Debug("rocm supported GPUs", "types", supported)
 			}
@ -425,3 +417,36 @@ func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
 	}
 	return driverMajor, driverMinor, nil
 }
+
+// RefreshFreeMemory re-reads the used-memory sysfs file recorded for each GPU
+// in the list and stores TotalMemory minus that value as the GPU's FreeMemory.
+// It stops at, and returns, the first read error; GPUs after the failing one
+// keep their previous FreeMemory value.
+func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
+	// Ranging over an empty list is a no-op, so no explicit length check is needed.
+	for idx := range gpus {
+		gpu := &gpus[idx]
+		used, err := getFreeMemory(gpu.usedFilepath)
+		if err != nil {
+			return err
+		}
+		free := gpu.TotalMemory - used
+		slog.Debug("updating rocm free memory", "gpu", gpu.ID, "name", gpu.Name, "before", format.HumanBytes2(gpu.FreeMemory), "now", format.HumanBytes2(free))
+		gpu.FreeMemory = free
+	}
+	return nil
+}
+
+// getFreeMemory reads the sysfs node at usedFile and returns its contents
+// parsed as a base-10 unsigned integer (surrounding whitespace is trimmed).
+//
+// Despite the name, the value returned is the amount of memory in USE:
+// callers subtract it from the GPU's total memory to obtain the free amount.
+//
+// On an open, read or parse failure it returns 0 and an error that wraps the
+// underlying cause and names the file.
+func getFreeMemory(usedFile string) (uint64, error) {
+	usedFp, err := os.Open(usedFile)
+	if err != nil {
+		return 0, fmt.Errorf("failed to open sysfs node %s: %w", usedFile, err)
+	}
+	defer usedFp.Close()
+	buf, err := io.ReadAll(usedFp)
+	if err != nil {
+		return 0, fmt.Errorf("failed to read sysfs node %s: %w", usedFile, err)
+	}
+	usedMemory, err := strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
+	if err != nil {
+		// The error is returned rather than also logged here, so it is
+		// reported exactly once by whichever caller handles it.
+		return 0, fmt.Errorf("failed to parse sysfs node %s: %w", usedFile, err)
+	}
+	return usedMemory, nil
+}
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@ -24,8 +24,8 @@ var (
 	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
 )

-func AMDGetGPUInfo() []GpuInfo {
-	resp := []GpuInfo{}
+func AMDGetGPUInfo() []RocmGPUInfo {
+	resp := []RocmGPUInfo{}
 	hl, err := NewHipLib()
 	if err != nil {
 		slog.Debug(err.Error())
@ -117,21 +117,24 @@ func AMDGetGPUInfo() []GpuInfo {
 		// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
 		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
 		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
-		gpuInfo := GpuInfo{
-			Library: "rocm",
-			memInfo: memInfo{
-				TotalMemory: totalMemory,
-				FreeMemory:  freeMemory,
-			},
-			ID:             fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
-			DependencyPath: libDir,
-			MinimumMemory:  rocmMinimumMemory,
-			Name:           name,
-			Compute:        gfx,
+		gpuInfo := RocmGPUInfo{
+			GpuInfo: GpuInfo{
+				Library: "rocm",
+				memInfo: memInfo{
+					TotalMemory: totalMemory,
+					FreeMemory:  freeMemory,
+				},
+				ID:             fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
+				DependencyPath: libDir,
+				MinimumMemory:  rocmMinimumMemory,
+				Name:           name,
+				Compute:        gfx,

-			// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
-			// DriverMajor:    driverMajor,
-			// DriverMinor:    driverMinor,
+				// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
+				// DriverMajor:    driverMajor,
+				// DriverMinor:    driverMinor,
+			},
+			index: i,
 		}

 		resp = append(resp, gpuInfo)
@ -159,3 +162,30 @@ func AMDValidateLibDir() (string, error) {
 	slog.Warn("amdgpu detected, but no compatible rocm library found.  Please install ROCm")
 	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
 }
+
+// RefreshFreeMemory asks the HIP library for the current free memory of every
+// GPU in the list and stores it in that GPU's FreeMemory field.
+//
+// If the HIP library cannot be loaded, the failure is logged at debug level
+// and nil is returned without touching the list.  A HipSetDevice failure
+// aborts the refresh and is returned; a HipMemGetInfo failure is logged as a
+// warning and only that GPU is skipped.
+func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
+	if len(gpus) == 0 {
+		return nil
+	}
+	hl, err := NewHipLib()
+	if err != nil {
+		slog.Debug(err.Error())
+		return nil
+	}
+	defer hl.Release()
+
+	for i := range gpus {
+		gpu := &gpus[i]
+		// Select the device first; the memory query below takes no device argument.
+		if setErr := hl.HipSetDevice(gpu.index); setErr != nil {
+			return setErr
+		}
+		freeMemory, _, memErr := hl.HipMemGetInfo()
+		if memErr != nil {
+			slog.Warn("get mem info", "id", i, "error", memErr)
+			continue
+		}
+		slog.Debug("updating rocm free memory", "gpu", gpu.ID, "name", gpu.Name, "before", format.HumanBytes2(gpu.FreeMemory), "now", format.HumanBytes2(freeMemory))
+		gpu.FreeMemory = freeMemory
+	}
+	return nil
+}
--- a/gpu/cpu_common.go
+++ b/gpu/cpu_common.go
@ -1,21 +1,20 @@
 package gpu

 import (
-	"log/slog"
-
 	"golang.org/x/sys/cpu"
 )

+// GetCPUVariant returns the variant string for the CPU capability reported by
+// getCPUCapability, i.e. the result of calling ToVariant on it.
 func GetCPUVariant() string {
+	return getCPUCapability().ToVariant()
+}
+
+// getCPUCapability reports the highest vector-extension level found on the
+// host CPU: CPUCapabilityAVX2 when cpu.X86.HasAVX2 is set, otherwise
+// CPUCapabilityAVX when cpu.X86.HasAVX is set, otherwise CPUCapabilityBase.
+func getCPUCapability() CPUCapability {
 	if cpu.X86.HasAVX2 {
-		slog.Debug("CPU has AVX2")
-		return "avx2"
+		return CPUCapabilityAVX2
 	}
 	if cpu.X86.HasAVX {
-		slog.Debug("CPU has AVX")
-		return "avx"
+		return CPUCapabilityAVX
 	}
-	slog.Debug("CPU does not have vector extensions")
 	// else LCD
-	return ""
+	return CPUCapabilityBase
 }
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@ -21,8 +21,8 @@ import (
 	"sync"
 	"unsafe"

-	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/format"
 )

 type handles struct {
@ -37,7 +37,18 @@ const (
 	rocmMinimumMemory = 457 * format.MebiByte
 )

-var gpuMutex sync.Mutex
+// Discovery state cached between calls to GetGPUInfo.  GetGPUInfo takes
+// gpuMutex before reading or writing any of these, and initCudaHandles
+// requires the mutex to already be held.
+var (
+	// gpuMutex serializes GPU discovery and free-memory refresh.
+	gpuMutex      sync.Mutex
+	// bootstrapped is set to true once the initial discovery pass has run.
+	bootstrapped  bool
+	// cpuCapability holds the result of getCPUCapability from the bootstrap pass.
+	cpuCapability CPUCapability
+	// cpus holds the CPU entry built during the bootstrap pass.
+	cpus          []CPUInfo
+	// cudaGPUs holds the NVIDIA GPUs accepted during the bootstrap pass.
+	cudaGPUs      []CudaGPUInfo
+	// nvcudaLibPath and cudartLibPath record the library path that loaded
+	// successfully; when set, initCudaHandles reloads that path directly
+	// instead of searching again.
+	nvcudaLibPath string
+	cudartLibPath string
+	// oneapiLibPath is recorded when the oneAPI library loads.
+	// NOTE(review): nothing in the visible code reads it back - confirm.
+	oneapiLibPath string
+	// rocmGPUs holds the result of AMDGetGPUInfo from the bootstrap pass.
+	rocmGPUs      []RocmGPUInfo
+	// oneapiGPUs holds the Intel GPUs found during the bootstrap pass.
+	// NOTE(review): the visible GetGPUInfo builds its result from cudaGPUs and
+	// rocmGPUs only, so these are never returned - confirm this is intended.
+	oneapiGPUs    []OneapiGPUInfo
+)

 // With our current CUDA compile flags, older than 5.0 will not work properly
 var CudaComputeMin = [2]C.int{5, 0}
@ -96,11 +107,22 @@ var OneapiLinuxGlobs = []string{
 var CudaTegra string = os.Getenv("JETSON_JETPACK")

 // Note: gpuMutex must already be held
-func initGPUHandles() *handles {
+func initCudaHandles() *handles {

 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing

 	gpuHandles := &handles{}
+	// Short Circuit if we already know which library to use
+	if nvcudaLibPath != "" {
+		gpuHandles.deviceCount, gpuHandles.nvcuda, _ = LoadNVCUDAMgmt([]string{nvcudaLibPath})
+		return gpuHandles
+	}
+	if cudartLibPath != "" {
+		gpuHandles.deviceCount, gpuHandles.cudart, _ = LoadCUDARTMgmt([]string{cudartLibPath})
+		return gpuHandles
+	}
+
+	slog.Debug("searching for GPU discovery libraries for NVIDIA")
 	var cudartMgmtName string
 	var cudartMgmtPatterns []string
 	var nvcudaMgmtName string
@ -136,7 +158,6 @@ func initGPUHandles() *handles {
 		return gpuHandles
 	}

-	slog.Debug("Detecting GPUs")
 	nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
 	if len(nvcudaLibPaths) > 0 {
 		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
@ -144,6 +165,7 @@ func initGPUHandles() *handles {
 			slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
 			gpuHandles.nvcuda = nvcuda
 			gpuHandles.deviceCount = deviceCount
+			nvcudaLibPath = libPath
 			return gpuHandles
 		}
 	}
@ -155,6 +177,7 @@ func initGPUHandles() *handles {
 			slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
 			gpuHandles.cudart = cudart
 			gpuHandles.deviceCount = deviceCount
+			cudartLibPath = libPath
 			return gpuHandles
 		}
 	}
@ -166,6 +189,7 @@ func initGPUHandles() *handles {
 			slog.Debug("detected Intel GPUs", "library", libPath, "count", deviceCount)
 			gpuHandles.oneapi = oneapi
 			gpuHandles.deviceCount = deviceCount
+			oneapiLibPath = libPath
 			return gpuHandles
 		}
 	}
@ -178,9 +202,12 @@ func GetGPUInfo() GpuInfoList {
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
 	gpuMutex.Lock()
 	defer gpuMutex.Unlock()
-
-	gpuHandles := initGPUHandles()
+	needRefresh := true
+	var gpuHandles *handles
 	defer func() {
+		if gpuHandles == nil {
+			return
+		}
 		if gpuHandles.cudart != nil {
 			C.cudart_release(*gpuHandles.cudart)
 		}
@ -189,97 +216,156 @@ func GetGPUInfo() GpuInfoList {
 		}
 	}()

-	// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
-	cpuVariant := GetCPUVariant()
-	if cpuVariant == "" && runtime.GOARCH == "amd64" {
-		slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
-	}
-
-	// On windows we bundle the nvidia library one level above the runner dir
-	depPath := ""
-	if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
-		depPath = filepath.Dir(envconfig.RunnersDir)
-	}
-
-	var memInfo C.mem_info_t
-	resp := []GpuInfo{}
-
-	// NVIDIA first
-	for i := range gpuHandles.deviceCount {
-		// TODO once we support CPU compilation variants of GPU libraries refine this...
-		if cpuVariant == "" && runtime.GOARCH == "amd64" {
-			continue
-		}
-		if gpuHandles.cudart != nil || gpuHandles.nvcuda != nil {
-			gpuInfo := GpuInfo{
-				Library: "cuda",
-			}
-			var driverMajor int
-			var driverMinor int
-			if gpuHandles.cudart != nil {
-				C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
-			} else {
-				C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
-				driverMajor = int(gpuHandles.nvcuda.driver_major)
-				driverMinor = int(gpuHandles.nvcuda.driver_minor)
-			}
-			if memInfo.err != nil {
-				slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
-				C.free(unsafe.Pointer(memInfo.err))
-				continue
-			}
-			if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
-				slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
-				continue
-			}
-			gpuInfo.TotalMemory = uint64(memInfo.total)
-			gpuInfo.FreeMemory = uint64(memInfo.free)
-			gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-			gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
-			gpuInfo.MinimumMemory = cudaMinimumMemory
-			gpuInfo.DependencyPath = depPath
-			gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-			gpuInfo.DriverMajor = driverMajor
-			gpuInfo.DriverMinor = driverMinor
-
-			// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
-			resp = append(resp, gpuInfo)
-		}
-		if gpuHandles.oneapi != nil {
-			gpuInfo := GpuInfo{
-				Library: "oneapi",
-			}
-			C.oneapi_check_vram(*gpuHandles.oneapi, &memInfo)
-			var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
-			memInfo.free = C.uint64_t(totalFreeMem)
-			gpuInfo.TotalMemory = uint64(memInfo.total)
-			gpuInfo.FreeMemory = uint64(memInfo.free)
-			gpuInfo.ID = strconv.Itoa(i)
-			resp = append(resp, gpuInfo)
-		}
-	}
-
-	// Then AMD
-	resp = append(resp, AMDGetGPUInfo()...)
-
-	if len(resp) == 0 {
+	if !bootstrapped {
+		slog.Debug("Detecting GPUs")
+		needRefresh = false
+		cpuCapability = getCPUCapability()
+		var memInfo C.mem_info_t
 		C.cpu_check_ram(&memInfo)
 		if memInfo.err != nil {
 			slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
 			C.free(unsafe.Pointer(memInfo.err))
-			return resp
+			return []GpuInfo{}
 		}
-		gpuInfo := GpuInfo{
-			Library: "cpu",
-			Variant: cpuVariant,
+		cpuInfo := CPUInfo{
+			GpuInfo: GpuInfo{
+				Library: "cpu",
+				Variant: cpuCapability.ToVariant(),
+			},
 		}
-		gpuInfo.TotalMemory = uint64(memInfo.total)
-		gpuInfo.FreeMemory = uint64(memInfo.free)
-		gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+		cpuInfo.TotalMemory = uint64(memInfo.total)
+		cpuInfo.FreeMemory = uint64(memInfo.free)
+		cpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+		cpus = []CPUInfo{cpuInfo}

-		resp = append(resp, gpuInfo)
+		// Fallback to CPU mode if we're lacking required vector extensions on x86
+		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
+			slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability.ToString(), "detected", cpuCapability.ToString())
+			bootstrapped = true
+			// No need to do any GPU discovery, since we can't run on them
+			return GpuInfoList{cpus[0].GpuInfo}
+		}
+
+		// TODO - implement
+
+		// TODO refine the discovery to only gather total memory
+
+		// On windows we bundle the nvidia library one level above the runner dir
+		depPath := ""
+		if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
+			depPath = filepath.Dir(envconfig.RunnersDir)
+		}
+
+		// Load ALL libraries
+		gpuHandles = initCudaHandles()
+
+		// TODO needs a refactoring pass to init oneapi handles
+
+		// NVIDIA
+		for i := range gpuHandles.deviceCount {
+			if gpuHandles.cudart != nil || gpuHandles.nvcuda != nil {
+				gpuInfo := CudaGPUInfo{
+					GpuInfo: GpuInfo{
+						Library: "cuda",
+					},
+					index: i,
+				}
+				var driverMajor int
+				var driverMinor int
+				if gpuHandles.cudart != nil {
+					C.cudart_bootstrap(*gpuHandles.cudart, C.int(i), &memInfo)
+				} else {
+					C.nvcuda_bootstrap(*gpuHandles.nvcuda, C.int(i), &memInfo)
+					driverMajor = int(gpuHandles.nvcuda.driver_major)
+					driverMinor = int(gpuHandles.nvcuda.driver_minor)
+				}
+				if memInfo.err != nil {
+					slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
+					C.free(unsafe.Pointer(memInfo.err))
+					continue
+				}
+				if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
+					slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
+					continue
+				}
+				gpuInfo.TotalMemory = uint64(memInfo.total)
+				gpuInfo.FreeMemory = uint64(memInfo.free)
+				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
+				gpuInfo.MinimumMemory = cudaMinimumMemory
+				gpuInfo.DependencyPath = depPath
+				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+				gpuInfo.DriverMajor = int(driverMajor)
+				gpuInfo.DriverMinor = int(driverMinor)
+
+				// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
+				cudaGPUs = append(cudaGPUs, gpuInfo)
+			}
+			if gpuHandles.oneapi != nil {
+				gpuInfo := OneapiGPUInfo{
+					GpuInfo: GpuInfo{
+						Library: "oneapi",
+					},
+					index: i,
+				}
+				// TODO - split bootstrapping from updating free memory
+				C.oneapi_check_vram(*gpuHandles.oneapi, &memInfo)
+				var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
+				memInfo.free = C.uint64_t(totalFreeMem)
+				gpuInfo.TotalMemory = uint64(memInfo.total)
+				gpuInfo.FreeMemory = uint64(memInfo.free)
+				gpuInfo.ID = strconv.Itoa(i)
+				oneapiGPUs = append(oneapiGPUs, gpuInfo)
+			}
+		}
+
+		rocmGPUs = AMDGetGPUInfo()
+		bootstrapped = true
 	}

+	// For detected GPUs, load library if not loaded
+
+	// Refresh free memory usage
+	if needRefresh {
+		// TODO - CPU system memory tracking/refresh
+		var memInfo C.mem_info_t
+		if gpuHandles == nil && len(cudaGPUs) > 0 {
+			gpuHandles = initCudaHandles()
+		}
+		for i, gpu := range cudaGPUs {
+			if gpuHandles.cudart != nil {
+				C.cudart_bootstrap(*gpuHandles.cudart, C.int(gpu.index), &memInfo)
+			} else {
+				C.nvcuda_get_free(*gpuHandles.nvcuda, C.int(gpu.index), &memInfo.free)
+			}
+			if memInfo.err != nil {
+				slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
+				C.free(unsafe.Pointer(memInfo.err))
+				continue
+			}
+			if memInfo.free == 0 {
+				slog.Warn("error looking up nvidia GPU memory")
+				continue
+			}
+			slog.Debug("updating cuda free memory", "gpu", gpu.ID, "name", gpu.Name, "before", format.HumanBytes2(gpu.FreeMemory), "now", format.HumanBytes2(uint64(memInfo.free)))
+			cudaGPUs[i].FreeMemory = uint64(memInfo.free)
+		}
+		err := RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
+		if err != nil {
+			slog.Debug("problem refreshing ROCm free memory", "error", err)
+		}
+	}
+
+	resp := []GpuInfo{}
+	for _, gpu := range cudaGPUs {
+		resp = append(resp, gpu.GpuInfo)
+	}
+	for _, gpu := range rocmGPUs {
+		resp = append(resp, gpu.GpuInfo)
+	}
+	if len(resp) == 0 {
+		resp = append(resp, cpus[0].GpuInfo)
+	}
 	return resp
 }

--- a/gpu/gpu_info_cudart.c
+++ b/gpu/gpu_info_cudart.c
@ -94,7 +94,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
 }


-void cudart_check_vram(cudart_handle_t h, int i, mem_info_t *resp) {
+void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
  resp->err = NULL;
  cudartMemory_t memInfo = {0,0,0};
  cudartReturn_t ret;
--- a/gpu/gpu_info_cudart.h
+++ b/gpu/gpu_info_cudart.h
@ -140,7 +140,8 @@ typedef struct cudart_init_resp {
 } cudart_init_resp_t;

 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
-void cudart_check_vram(cudart_handle_t ch, int device_id, mem_info_t *resp);
+void cudart_bootstrap(cudart_handle_t ch, int device_id, mem_info_t *resp);
+// TODO - if we keep this library longer term, add cudart_get_free
 void cudart_release(cudart_handle_t ch);

 #endif  // __GPU_INFO_CUDART_H__
--- a/gpu/gpu_info_nvcuda.c
+++ b/gpu/gpu_info_nvcuda.c
@ -96,7 +96,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
 }

 const int buflen = 256;
-void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
+void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
  resp->err = NULL;
  nvcudaMemory_t memInfo = {0,0};
  CUresult ret;
@ -168,7 +168,7 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
  // To get memory we have to set (and release) a context
  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  if (ret != CUDA_SUCCESS) {
-    snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
+    snprintf(buf, buflen, "nvcuda failed to get device context %d", ret);
    resp->err = strdup(buf);
    return;
  }
@ -193,7 +193,42 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {

  ret = (*h.cuCtxDestroy)(ctx);
  if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda failed to release primary device context %d", ret);
+    LOG(1, "nvcuda failed to release device context %d", ret);
+  }
+}
+
+// Looks up the free memory of CUDA device index i and stores it in *free.
+//
+// *free is set to 0 before any driver call is made and no error is returned
+// to the caller: each failure is only reported through LOG.  If the device
+// lookup or context creation fails, *free is therefore left at 0.  A context
+// is created for the query and destroyed again before returning, including on
+// the cuMemGetInfo failure path.
+void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free) {
+  CUresult ret;
+  CUcontext ctx = NULL;
+  CUdevice device = -1;
+  *free = 0;
+  uint64_t total = 0;
+
+  ret = (*h.cuDeviceGet)(&device, i);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda device failed to initialize");
+    return;
+  }
+
+
+  // To get memory we have to set (and release) a context
+  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda failed to get device context %d", ret);
+    return;
+  }
+
+  // total is required by the call signature but is not used afterwards
+  ret = (*h.cuMemGetInfo_v2)(free, &total);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda device memory info lookup failure %d", ret);
+    // Best effort on failure...
+    (*h.cuCtxDestroy)(ctx);
+    return;
+  }
+
+  ret = (*h.cuCtxDestroy)(ctx);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda failed to release device context %d", ret);
   }
 }

--- a/gpu/gpu_info_nvcuda.h
+++ b/gpu/gpu_info_nvcuda.h
@ -67,7 +67,8 @@ typedef struct nvcuda_init_resp {
 } nvcuda_init_resp_t;

 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
-void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
+void nvcuda_bootstrap(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
+void nvcuda_get_free(nvcuda_handle_t ch,  int device_id, uint64_t *free);
 void nvcuda_release(nvcuda_handle_t ch);

 #endif  // __GPU_INFO_NVCUDA_H__
--- a/gpu/types.go
+++ b/gpu/types.go
@ -38,6 +38,29 @@ type GpuInfo struct {
 	// TODO other performance capability info to help in scheduling decisions
 }

+// CPUInfo describes the host CPU using the same fields as a GPU entry by
+// embedding GpuInfo.
+type CPUInfo struct {
+	GpuInfo
+}
+
+// CudaGPUInfo is a GpuInfo for an NVIDIA GPU plus the device index that is
+// passed back to the CUDA library when its free memory is refreshed.
+type CudaGPUInfo struct {
+	GpuInfo
+	index int // device index
+}
+
+// CudaGPUInfoList is a list of CudaGPUInfo entries.
+type CudaGPUInfoList []CudaGPUInfo
+
+// RocmGPUInfo is a GpuInfo for an AMD GPU plus the platform-specific handle
+// that RefreshFreeMemory uses to look the device up again: the sysfs
+// used-memory file on linux, the HIP device index on windows.
+type RocmGPUInfo struct {
+	GpuInfo
+	usedFilepath string // linux
+	index        int    // device index on windows
+}
+
+// RocmGPUInfoList is a list of RocmGPUInfo entries; RefreshFreeMemory is
+// defined on it.
+type RocmGPUInfoList []RocmGPUInfo
+
+// OneapiGPUInfo is a GpuInfo for an Intel oneAPI GPU plus its device index.
+type OneapiGPUInfo struct {
+	GpuInfo
+	index int // device index
+}
+
+// OneapiGPUInfoList is a list of OneapiGPUInfo entries.
+type OneapiGPUInfoList []OneapiGPUInfo
+
 type GpuInfoList []GpuInfo

 // Split up the set of gpu info's by Library and variant
@ -86,3 +109,37 @@ type ByFreeMemory []GpuInfo
 func (a ByFreeMemory) Len() int           { return len(a) }
 func (a ByFreeMemory) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
+
+// CPUCapability identifies the vector-extension level of a CPU.  The values
+// are ordered, so two capabilities can be compared with < and >.
+type CPUCapability uint32
+
+const (
+	CPUCapabilityBase CPUCapability = iota
+	CPUCapabilityAVX
+	CPUCapabilityAVX2
+	// TODO AVX512
+)
+
+// GPURunnerCPUCapability is the capability level compared against the
+// detected CPU capability before GPU discovery is attempted.
+// Override at build time when building base GPU runners
+var GPURunnerCPUCapability = CPUCapabilityAVX
+
+// ToString returns a human-readable name for the capability; any value other
+// than AVX or AVX2 is reported as "no vector extensions".
+func (c CPUCapability) ToString() string {
+	if c == CPUCapabilityAVX2 {
+		return "AVX2"
+	}
+	if c == CPUCapabilityAVX {
+		return "AVX"
+	}
+	return "no vector extensions"
+}
+
+// ToVariant returns the lower-case variant name for the capability ("avx" or
+// "avx2"), or the empty string for any other value.
+func (c CPUCapability) ToVariant() string {
+	// Indexed by capability; CPUCapabilityBase is left as the zero value "".
+	variants := [...]string{
+		CPUCapabilityAVX:  "avx",
+		CPUCapabilityAVX2: "avx2",
+	}
+	if uint64(c) < uint64(len(variants)) {
+		return variants[c]
+	}
+	return ""
+}