Refine GPU discovery to bootstrap once

Now that we call the GPU discovery routines many times to refresh
free memory, this splits the initial discovery from the free-memory
updates.
Daniel Hiltgen 2024-05-15 15:13:16 -07:00
parent b32ebb4f29
commit 43ed358f9a
9 changed files with 383 additions and 149 deletions
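
The core change is a package-level bootstrap flag guarded by the existing gpuMutex: the expensive library probing and total-memory discovery run once, and subsequent calls only refresh free memory. A standalone Go sketch of that shape (hypothetical names and values, not the ollama code itself):

package main

import (
	"fmt"
	"sync"
)

type device struct {
	ID          string
	TotalMemory uint64 // gathered once at bootstrap
	FreeMemory  uint64 // refreshed on every call
}

var (
	mu           sync.Mutex
	bootstrapped bool
	devices      []device
)

// getDevices mirrors the new GetGPUInfo flow: full (expensive) discovery runs
// only on the first call; later calls just refresh the volatile free-memory
// figure for the devices found earlier.
func getDevices() []device {
	mu.Lock()
	defer mu.Unlock()
	if !bootstrapped {
		// stand-in for probing CUDA/ROCm/sysfs and recording totals
		devices = []device{{ID: "GPU-0", TotalMemory: 8 << 30}}
		bootstrapped = true
	}
	for i := range devices {
		// stand-in for the cudart/nvcuda/amdgpu free-memory queries
		devices[i].FreeMemory = devices[i].TotalMemory / 2
	}
	return devices
}

func main() {
	fmt.Println(getDevices()) // bootstrap + refresh
	fmt.Println(getDevices()) // refresh only
}

After the first call only the per-device refresh loop runs, which is what makes calling discovery many times cheap.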

View file

@@ -44,8 +44,8 @@ var (
 )

 // Gather GPU information from the amdgpu driver if any supported GPUs are detected
-func AMDGetGPUInfo() []GpuInfo {
-	resp := []GpuInfo{}
+func AMDGetGPUInfo() []RocmGPUInfo {
+	resp := []RocmGPUInfo{}
 	if !AMDDetected() {
 		return resp
 	}
@@ -178,7 +178,7 @@ func AMDGetGPUInfo() []GpuInfo {
 		// Shouldn't happen, but just in case...
 		if gpuID < 0 {
 			slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
-			return []GpuInfo{}
+			return []RocmGPUInfo{}
 		}

 		if int(major) < RocmComputeMin {
@@ -189,6 +189,7 @@ func AMDGetGPUInfo() []GpuInfo {
 		// Look up the memory for the current node
 		totalMemory := uint64(0)
 		usedMemory := uint64(0)
+		var usedFile string
 		mapping := []struct {
 			id       uint64
 			filename string
@@ -255,22 +256,10 @@ func AMDGetGPUInfo() []GpuInfo {
 				break
 			}
-			usedFile := filepath.Join(devDir, DRMUsedMemoryFile)
-			usedFp, err := os.Open(usedFile)
+			usedFile = filepath.Join(devDir, DRMUsedMemoryFile)
+			usedMemory, err = getFreeMemory(usedFile)
 			if err != nil {
-				slog.Debug("failed to open sysfs node", "file", usedFile, "error", err)
-				break
-			}
-			defer totalFp.Close()
-			buf, err = io.ReadAll(usedFp)
-			if err != nil {
-				slog.Debug("failed to read sysfs node", "file", usedFile, "error", err)
-				break
-			}
-			usedMemory, err = strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
-			if err != nil {
-				slog.Debug("failed to parse sysfs node", "file", usedFile, "error", err)
-				break
+				slog.Debug("failed to update used memory", "error", err)
 			}
 			break
 		}
@@ -288,7 +277,8 @@ func AMDGetGPUInfo() []GpuInfo {
 		slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
 		slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
-		gpuInfo := GpuInfo{
+		gpuInfo := RocmGPUInfo{
+			GpuInfo: GpuInfo{
 				Library: "rocm",
 				memInfo: memInfo{
 					TotalMemory: totalMemory,
@@ -300,6 +290,8 @@ func AMDGetGPUInfo() []GpuInfo {
 				MinimumMemory: rocmMinimumMemory,
 				DriverMajor:   driverMajor,
 				DriverMinor:   driverMinor,
+			},
+			usedFilepath: usedFile,
 		}

 		// If the user wants to filter to a subset of devices, filter out if we aren't a match
@@ -323,7 +315,7 @@ func AMDGetGPUInfo() []GpuInfo {
 			libDir, err = AMDValidateLibDir()
 			if err != nil {
 				slog.Warn("unable to verify rocm library, will use cpu", "error", err)
-				return []GpuInfo{}
+				return []RocmGPUInfo{}
 			}
 		}
 		gpuInfo.DependencyPath = libDir
@@ -334,7 +326,7 @@ func AMDGetGPUInfo() []GpuInfo {
 			supported, err = GetSupportedGFX(libDir)
 			if err != nil {
 				slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
-				return []GpuInfo{}
+				return []RocmGPUInfo{}
 			}
 			slog.Debug("rocm supported GPUs", "types", supported)
 		}
@@ -425,3 +417,36 @@ func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
 	}
 	return driverMajor, driverMinor, nil
 }
+
+func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
+	if len(gpus) == 0 {
+		return nil
+	}
+	for i := range gpus {
+		usedMemory, err := getFreeMemory(gpus[i].usedFilepath)
+		if err != nil {
+			return err
+		}
+		slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(gpus[i].TotalMemory-usedMemory))
+		gpus[i].FreeMemory = gpus[i].TotalMemory - usedMemory
+	}
+	return nil
+}
+
+func getFreeMemory(usedFile string) (uint64, error) {
+	usedFp, err := os.Open(usedFile)
+	if err != nil {
+		return 0, fmt.Errorf("failed to open sysfs node %s %w", usedFile, err)
+	}
+	defer usedFp.Close()
+	buf, err := io.ReadAll(usedFp)
+	if err != nil {
+		return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err)
+	}
+	usedMemory, err := strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
+	if err != nil {
+		slog.Debug("failed to parse sysfs node", "file", usedFile, "error", err)
+		return 0, fmt.Errorf("failed to parse sysfs node %s %w", usedFile, err)
+	}
+	return usedMemory, nil
+}
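
The new getFreeMemory helper reduces the per-refresh work on Linux to a single sysfs read plus ParseUint. A minimal standalone sketch of the same read, assuming a hypothetical amdgpu DRM node path (in the real code the path is resolved per device and kept in usedFilepath):

package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

// readSysfsUint64 reads a single decimal counter from a sysfs node, the same
// pattern getFreeMemory uses (os.ReadFile is a shortcut for Open + ReadAll).
func readSysfsUint64(path string) (uint64, error) {
	buf, err := os.ReadFile(path)
	if err != nil {
		return 0, fmt.Errorf("failed to read sysfs node %s: %w", path, err)
	}
	return strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
}

func main() {
	// hypothetical node path; adjust to a real card/device on your system
	used, err := readSysfsUint64("/sys/class/drm/card0/device/mem_info_vram_used")
	if err != nil {
		fmt.Println("read failed:", err)
		return
	}
	fmt.Println("VRAM used (bytes):", used)
}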

View file

@@ -24,8 +24,8 @@ var (
 	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
 )

-func AMDGetGPUInfo() []GpuInfo {
-	resp := []GpuInfo{}
+func AMDGetGPUInfo() []RocmGPUInfo {
+	resp := []RocmGPUInfo{}
 	hl, err := NewHipLib()
 	if err != nil {
 		slog.Debug(err.Error())
@@ -117,7 +117,8 @@ func AMDGetGPUInfo() []GpuInfo {
 		// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
 		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
 		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
-		gpuInfo := GpuInfo{
+		gpuInfo := RocmGPUInfo{
+			GpuInfo: GpuInfo{
 				Library: "rocm",
 				memInfo: memInfo{
 					TotalMemory: totalMemory,
@@ -132,6 +133,8 @@ func AMDGetGPUInfo() []GpuInfo {
 				// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
 				// DriverMajor: driverMajor,
 				// DriverMinor: driverMinor,
+			},
+			index: i,
 		}

 		resp = append(resp, gpuInfo)
@@ -159,3 +162,30 @@ func AMDValidateLibDir() (string, error) {
 	slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
 	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
 }
+
+func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
+	if len(gpus) == 0 {
+		return nil
+	}
+	hl, err := NewHipLib()
+	if err != nil {
+		slog.Debug(err.Error())
+		return nil
+	}
+	defer hl.Release()
+	for i := range gpus {
+		err := hl.HipSetDevice(gpus[i].index)
+		if err != nil {
+			return err
+		}
+		freeMemory, _, err := hl.HipMemGetInfo()
+		if err != nil {
+			slog.Warn("get mem info", "id", i, "error", err)
+			continue
+		}
+		slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(freeMemory))
+		gpus[i].FreeMemory = freeMemory
+	}
+	return nil
+}
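
On Windows there is no sysfs path to re-read, so RefreshFreeMemory keeps the device index, reopens the HIP management library, selects each device, and queries free memory. A sketch of that control flow using a hypothetical management-library interface in place of the package's internal HipLib bindings:

package main

import (
	"fmt"
	"log/slog"
)

// mgmtLib is a hypothetical stand-in for the HIP management library handle.
type mgmtLib interface {
	SetDevice(index int) error
	MemGetInfo() (free, total uint64, err error)
	Release()
}

type gpu struct {
	ID         string
	index      int
	FreeMemory uint64
}

// refreshFreeMemory mirrors the new Windows refresh loop: select each device
// by its saved index, query free memory, and skip (with a warning) on
// per-device errors rather than aborting the whole refresh.
func refreshFreeMemory(lib mgmtLib, gpus []gpu) error {
	defer lib.Release()
	for i := range gpus {
		if err := lib.SetDevice(gpus[i].index); err != nil {
			return err
		}
		free, _, err := lib.MemGetInfo()
		if err != nil {
			slog.Warn("get mem info", "id", i, "error", err)
			continue
		}
		gpus[i].FreeMemory = free
	}
	return nil
}

// fakeLib lets the sketch run without any GPU present.
type fakeLib struct{}

func (fakeLib) SetDevice(int) error                 { return nil }
func (fakeLib) MemGetInfo() (uint64, uint64, error) { return 4 << 30, 8 << 30, nil }
func (fakeLib) Release()                            {}

func main() {
	gpus := []gpu{{ID: "0", index: 0}}
	if err := refreshFreeMemory(fakeLib{}, gpus); err != nil {
		fmt.Println(err)
	}
	fmt.Println(gpus[0].FreeMemory)
}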

View file

@@ -1,21 +1,20 @@
 package gpu

 import (
-	"log/slog"
-
 	"golang.org/x/sys/cpu"
 )

 func GetCPUVariant() string {
+	return getCPUCapability().ToVariant()
+}
+
+func getCPUCapability() CPUCapability {
 	if cpu.X86.HasAVX2 {
-		slog.Debug("CPU has AVX2")
-		return "avx2"
+		return CPUCapabilityAVX2
 	}
 	if cpu.X86.HasAVX {
-		slog.Debug("CPU has AVX")
-		return "avx"
+		return CPUCapabilityAVX
 	}
-	slog.Debug("CPU does not have vector extensions")
 	// else LCD
-	return ""
+	return CPUCapabilityBase
 }
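
Returning an ordered CPUCapability constant instead of a string is what allows the later check cpuCapability < GPURunnerCPUCapability in gpu.go. A self-contained sketch of the same idea, using the real golang.org/x/sys/cpu feature flags but local, hypothetical constant names:

package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

// capability is ordered: a larger value is a strict superset of features,
// so "do we meet the minimum?" becomes a plain < comparison.
type capability uint32

const (
	capBase capability = iota // no vector extensions
	capAVX
	capAVX2
)

func detect() capability {
	if cpu.X86.HasAVX2 {
		return capAVX2
	}
	if cpu.X86.HasAVX {
		return capAVX
	}
	return capBase
}

func main() {
	const required = capAVX // mirrors GPURunnerCPUCapability
	got := detect()
	if got < required {
		fmt.Println("below minimum, GPU runners disabled")
		return
	}
	fmt.Println("capability ok:", got)
}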

View file

@@ -21,8 +21,8 @@ import (
 	"sync"
 	"unsafe"

-	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/format"
 )

 type handles struct {
@@ -37,7 +37,18 @@ const (
 	rocmMinimumMemory = 457 * format.MebiByte
 )

-var gpuMutex sync.Mutex
+var (
+	gpuMutex      sync.Mutex
+	bootstrapped  bool
+	cpuCapability CPUCapability
+	cpus          []CPUInfo
+	cudaGPUs      []CudaGPUInfo
+	nvcudaLibPath string
+	cudartLibPath string
+	oneapiLibPath string
+	rocmGPUs      []RocmGPUInfo
+	oneapiGPUs    []OneapiGPUInfo
+)

 // With our current CUDA compile flags, older than 5.0 will not work properly
 var CudaComputeMin = [2]C.int{5, 0}
@@ -96,11 +107,22 @@ var OneapiLinuxGlobs = []string{
 var CudaTegra string = os.Getenv("JETSON_JETPACK")

 // Note: gpuMutex must already be held
-func initGPUHandles() *handles {
+func initCudaHandles() *handles {
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
 	gpuHandles := &handles{}
+	// Short Circuit if we already know which library to use
+	if nvcudaLibPath != "" {
+		gpuHandles.deviceCount, gpuHandles.nvcuda, _ = LoadNVCUDAMgmt([]string{nvcudaLibPath})
+		return gpuHandles
+	}
+	if cudartLibPath != "" {
+		gpuHandles.deviceCount, gpuHandles.cudart, _ = LoadCUDARTMgmt([]string{cudartLibPath})
+		return gpuHandles
+	}
+
+	slog.Debug("searching for GPU discovery libraries for NVIDIA")
 	var cudartMgmtName string
 	var cudartMgmtPatterns []string
 	var nvcudaMgmtName string
@@ -136,7 +158,6 @@ func initGPUHandles() *handles {
 		return gpuHandles
 	}

-	slog.Debug("Detecting GPUs")
 	nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
 	if len(nvcudaLibPaths) > 0 {
 		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
@@ -144,6 +165,7 @@ func initGPUHandles() *handles {
 			slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
 			gpuHandles.nvcuda = nvcuda
 			gpuHandles.deviceCount = deviceCount
+			nvcudaLibPath = libPath
 			return gpuHandles
 		}
 	}
@@ -155,6 +177,7 @@ func initGPUHandles() *handles {
 			slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
 			gpuHandles.cudart = cudart
 			gpuHandles.deviceCount = deviceCount
+			cudartLibPath = libPath
 			return gpuHandles
 		}
 	}
@@ -166,6 +189,7 @@ func initGPUHandles() *handles {
 			slog.Debug("detected Intel GPUs", "library", libPath, "count", deviceCount)
 			gpuHandles.oneapi = oneapi
 			gpuHandles.deviceCount = deviceCount
+			oneapiLibPath = libPath
 			return gpuHandles
 		}
 	}
@@ -178,9 +202,12 @@ func GetGPUInfo() GpuInfoList {
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
 	gpuMutex.Lock()
 	defer gpuMutex.Unlock()
+	needRefresh := true

-	gpuHandles := initGPUHandles()
+	var gpuHandles *handles
 	defer func() {
+		if gpuHandles == nil {
+			return
+		}
 		if gpuHandles.cudart != nil {
 			C.cudart_release(*gpuHandles.cudart)
 		}
@@ -189,11 +216,39 @@ func GetGPUInfo() GpuInfoList {
 		}
 	}()

-	// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
-	cpuVariant := GetCPUVariant()
-	if cpuVariant == "" && runtime.GOARCH == "amd64" {
-		slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
+	if !bootstrapped {
+		slog.Debug("Detecting GPUs")
+		needRefresh = false
+		cpuCapability = getCPUCapability()
+		var memInfo C.mem_info_t
+		C.cpu_check_ram(&memInfo)
+		if memInfo.err != nil {
+			slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
+			C.free(unsafe.Pointer(memInfo.err))
+			return []GpuInfo{}
 		}
+		cpuInfo := CPUInfo{
+			GpuInfo: GpuInfo{
+				Library: "cpu",
+				Variant: cpuCapability.ToVariant(),
+			},
+		}
+		cpuInfo.TotalMemory = uint64(memInfo.total)
+		cpuInfo.FreeMemory = uint64(memInfo.free)
+		cpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+		cpus = []CPUInfo{cpuInfo}
+
+		// Fallback to CPU mode if we're lacking required vector extensions on x86
+		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
+			slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability.ToString(), "detected", cpuCapability.ToString())
+			bootstrapped = true
+			// No need to do any GPU discovery, since we can't run on them
+			return GpuInfoList{cpus[0].GpuInfo}
+		}
+
+		// TODO - implement
+		// TODO refine the discovery to only gather total memory

 		// On windows we bundle the nvidia library one level above the runner dir
 		depPath := ""
@@ -201,25 +256,26 @@ func GetGPUInfo() GpuInfoList {
 			depPath = filepath.Dir(envconfig.RunnersDir)
 		}

-	var memInfo C.mem_info_t
-	resp := []GpuInfo{}
-	// NVIDIA first
+		// Load ALL libraries
+		gpuHandles = initCudaHandles()
+		// TODO needs a refactoring pass to init oneapi handles
+
+		// NVIDIA
 		for i := range gpuHandles.deviceCount {
-			// TODO once we support CPU compilation variants of GPU libraries refine this...
-			if cpuVariant == "" && runtime.GOARCH == "amd64" {
-				continue
-			}
 			if gpuHandles.cudart != nil || gpuHandles.nvcuda != nil {
-				gpuInfo := GpuInfo{
+				gpuInfo := CudaGPUInfo{
+					GpuInfo: GpuInfo{
 						Library: "cuda",
+					},
+					index: i,
 				}
 				var driverMajor int
 				var driverMinor int
 				if gpuHandles.cudart != nil {
-					C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
+					C.cudart_bootstrap(*gpuHandles.cudart, C.int(i), &memInfo)
 				} else {
-					C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
+					C.nvcuda_bootstrap(*gpuHandles.nvcuda, C.int(i), &memInfo)
 					driverMajor = int(gpuHandles.nvcuda.driver_major)
 					driverMinor = int(gpuHandles.nvcuda.driver_minor)
 				}
@@ -239,47 +295,77 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.MinimumMemory = cudaMinimumMemory
 				gpuInfo.DependencyPath = depPath
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-				gpuInfo.DriverMajor = driverMajor
-				gpuInfo.DriverMinor = driverMinor
+				gpuInfo.DriverMajor = int(driverMajor)
+				gpuInfo.DriverMinor = int(driverMinor)

 				// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
-				resp = append(resp, gpuInfo)
+				cudaGPUs = append(cudaGPUs, gpuInfo)
 			}
 			if gpuHandles.oneapi != nil {
-				gpuInfo := GpuInfo{
+				gpuInfo := OneapiGPUInfo{
+					GpuInfo: GpuInfo{
 						Library: "oneapi",
+					},
+					index: i,
 				}
+				// TODO - split bootstrapping from updating free memory
 				C.oneapi_check_vram(*gpuHandles.oneapi, &memInfo)
 				var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
 				memInfo.free = C.uint64_t(totalFreeMem)
 				gpuInfo.TotalMemory = uint64(memInfo.total)
 				gpuInfo.FreeMemory = uint64(memInfo.free)
 				gpuInfo.ID = strconv.Itoa(i)
-				resp = append(resp, gpuInfo)
+				oneapiGPUs = append(oneapiGPUs, gpuInfo)
 			}
 		}

-	// Then AMD
-	resp = append(resp, AMDGetGPUInfo()...)
-
-	if len(resp) == 0 {
-		C.cpu_check_ram(&memInfo)
-		if memInfo.err != nil {
-			slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
-			C.free(unsafe.Pointer(memInfo.err))
-			return resp
-		}
-		gpuInfo := GpuInfo{
-			Library: "cpu",
-			Variant: cpuVariant,
-		}
-		gpuInfo.TotalMemory = uint64(memInfo.total)
-		gpuInfo.FreeMemory = uint64(memInfo.free)
-		gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-		resp = append(resp, gpuInfo)
-	}
+		rocmGPUs = AMDGetGPUInfo()
+		bootstrapped = true
+	}
+
+	// For detected GPUs, load library if not loaded
+
+	// Refresh free memory usage
+	if needRefresh {
+		// TODO - CPU system memory tracking/refresh
+		var memInfo C.mem_info_t
+		if gpuHandles == nil && len(cudaGPUs) > 0 {
+			gpuHandles = initCudaHandles()
+		}
+		for i, gpu := range cudaGPUs {
+			if gpuHandles.cudart != nil {
+				C.cudart_bootstrap(*gpuHandles.cudart, C.int(gpu.index), &memInfo)
+			} else {
+				C.nvcuda_get_free(*gpuHandles.nvcuda, C.int(gpu.index), &memInfo.free)
+			}
+			if memInfo.err != nil {
+				slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
+				C.free(unsafe.Pointer(memInfo.err))
+				continue
+			}
+			if memInfo.free == 0 {
+				slog.Warn("error looking up nvidia GPU memory")
+				continue
+			}
+			slog.Debug("updating cuda free memory", "gpu", gpu.ID, "name", gpu.Name, "before", format.HumanBytes2(gpu.FreeMemory), "now", format.HumanBytes2(uint64(memInfo.free)))
+			cudaGPUs[i].FreeMemory = uint64(memInfo.free)
+		}
+		err := RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
+		if err != nil {
+			slog.Debug("problem refreshing ROCm free memory", "error", err)
+		}
+	}
+
+	resp := []GpuInfo{}
+	for _, gpu := range cudaGPUs {
+		resp = append(resp, gpu.GpuInfo)
+	}
+	for _, gpu := range rocmGPUs {
+		resp = append(resp, gpu.GpuInfo)
+	}
+	if len(resp) == 0 {
+		resp = append(resp, cpus[0].GpuInfo)
+	}

 	return resp
 }
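
The per-backend slices (cudaGPUs, rocmGPUs, oneapiGPUs) keep private refresh state such as the device index or sysfs path, and the public GpuInfoList is rebuilt from the embedded GpuInfo values on every call. A small sketch of that embedding-and-flattening pattern with simplified, hypothetical local types:

package main

import "fmt"

// common is the shared, public view (standing in for GpuInfo).
type common struct {
	ID         string
	Library    string
	FreeMemory uint64
}

// cudaDev embeds the common view and adds private refresh state, the same
// way CudaGPUInfo carries its device index.
type cudaDev struct {
	common
	index int
}

// rocmDev does the same with a linux sysfs path used for refreshes.
type rocmDev struct {
	common
	usedFilepath string
}

func main() {
	cuda := []cudaDev{{common{ID: "GPU-0", Library: "cuda"}, 0}}
	rocm := []rocmDev{{common{ID: "0", Library: "rocm"}, "/sys/class/drm/card1/device/mem_info_vram_used"}}

	// Flatten to the shared type, as GetGPUInfo does for its response.
	resp := []common{}
	for _, g := range cuda {
		resp = append(resp, g.common)
	}
	for _, g := range rocm {
		resp = append(resp, g.common)
	}
	fmt.Println(resp)
}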

View file

@@ -94,7 +94,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
 }

-void cudart_check_vram(cudart_handle_t h, int i, mem_info_t *resp) {
+void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
   resp->err = NULL;
   cudartMemory_t memInfo = {0,0,0};
   cudartReturn_t ret;

View file

@@ -140,7 +140,8 @@ typedef struct cudart_init_resp {
 } cudart_init_resp_t;

 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
-void cudart_check_vram(cudart_handle_t ch, int device_id, mem_info_t *resp);
+void cudart_bootstrap(cudart_handle_t ch, int device_id, mem_info_t *resp);
+// TODO - if we keep this library longer term, add cudart_get_free
 void cudart_release(cudart_handle_t ch);

 #endif  // __GPU_INFO_CUDART_H__

View file

@@ -96,7 +96,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
 }

 const int buflen = 256;
-void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
+void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
   resp->err = NULL;
   nvcudaMemory_t memInfo = {0,0};
   CUresult ret;
@@ -168,7 +168,7 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
   // To get memory we have to set (and release) a context
   ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
   if (ret != CUDA_SUCCESS) {
-    snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
+    snprintf(buf, buflen, "nvcuda failed to get device context %d", ret);
     resp->err = strdup(buf);
     return;
   }
@@ -193,7 +193,42 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
   ret = (*h.cuCtxDestroy)(ctx);
   if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda failed to release primary device context %d", ret);
+    LOG(1, "nvcuda failed to release device context %d", ret);
   }
 }
+
+void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free) {
+  CUresult ret;
+  CUcontext ctx = NULL;
+  CUdevice device = -1;
+  *free = 0;
+  uint64_t total = 0;
+
+  ret = (*h.cuDeviceGet)(&device, i);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda device failed to initialize");
+    return;
+  }
+
+  // To get memory we have to set (and release) a context
+  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda failed to get device context %d", ret);
+    return;
+  }
+
+  ret = (*h.cuMemGetInfo_v2)(free, &total);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda device memory info lookup failure %d", ret);
+    // Best effort on failure...
+    (*h.cuCtxDestroy)(ctx);
+    return;
+  }
+
+  ret = (*h.cuCtxDestroy)(ctx);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda failed to release device context %d", ret);
+  }
+}

View file

@@ -67,7 +67,8 @@ typedef struct nvcuda_init_resp {
 } nvcuda_init_resp_t;

 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
-void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
+void nvcuda_bootstrap(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
+void nvcuda_get_free(nvcuda_handle_t ch, int device_id, uint64_t *free);
 void nvcuda_release(nvcuda_handle_t ch);

 #endif  // __GPU_INFO_NVCUDA_H__

View file

@@ -38,6 +38,29 @@ type GpuInfo struct {
 	// TODO other performance capability info to help in scheduling decisions
 }

+type CPUInfo struct {
+	GpuInfo
+}
+
+type CudaGPUInfo struct {
+	GpuInfo
+	index int // device index
+}
+type CudaGPUInfoList []CudaGPUInfo
+
+type RocmGPUInfo struct {
+	GpuInfo
+	usedFilepath string // linux
+	index        int    // device index on windows
+}
+type RocmGPUInfoList []RocmGPUInfo
+
+type OneapiGPUInfo struct {
+	GpuInfo
+	index int // device index
+}
+type OneapiGPUInfoList []OneapiGPUInfo
+
 type GpuInfoList []GpuInfo

 // Split up the set of gpu info's by Library and variant
@@ -86,3 +109,37 @@ type ByFreeMemory []GpuInfo
 func (a ByFreeMemory) Len() int           { return len(a) }
 func (a ByFreeMemory) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
+
+type CPUCapability uint32
+
+// Override at build time when building base GPU runners
+var GPURunnerCPUCapability = CPUCapabilityAVX
+
+const (
+	CPUCapabilityBase CPUCapability = iota
+	CPUCapabilityAVX
+	CPUCapabilityAVX2
+	// TODO AVX512
+)
+
+func (c CPUCapability) ToString() string {
+	switch c {
+	case CPUCapabilityAVX:
+		return "AVX"
+	case CPUCapabilityAVX2:
+		return "AVX2"
+	default:
+		return "no vector extensions"
+	}
+}
+
+func (c CPUCapability) ToVariant() string {
+	switch c {
+	case CPUCapabilityAVX:
+		return "avx"
+	case CPUCapabilityAVX2:
+		return "avx2"
+	default:
+		return ""
+	}
+}
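
The ByFreeMemory adapter shown in the surrounding context implements sort.Interface, so a caller can re-sort devices on the refreshed FreeMemory values after each call. A standalone sketch with simplified local types:

package main

import (
	"fmt"
	"sort"
)

type gpuInfo struct {
	ID         string
	FreeMemory uint64
}

// byFreeMemory mirrors the ByFreeMemory adapter: ascending free memory.
type byFreeMemory []gpuInfo

func (a byFreeMemory) Len() int           { return len(a) }
func (a byFreeMemory) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a byFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }

func main() {
	gpus := []gpuInfo{
		{ID: "1", FreeMemory: 6 << 30},
		{ID: "0", FreeMemory: 2 << 30},
	}
	sort.Sort(byFreeMemory(gpus))
	fmt.Println(gpus) // least free memory first
}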