Refine GPU discovery to bootstrap once

The GPU discovery routines are now called repeatedly to keep free-memory
readings current, so this change splits the expensive one-time discovery
(loading management libraries, enumerating devices) from the lightweight
free-memory refresh.
Daniel Hiltgen 2024-05-15 15:13:16 -07:00
parent b32ebb4f29
commit 43ed358f9a
9 changed files with 383 additions and 149 deletions
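
The core pattern of the change, sketched below in isolation: discovery runs once under the lock and caches the device list (plus which management library worked), and every later call only re-reads the volatile free-memory figures. The names in this sketch are illustrative, not the ollama API.

    package main

    import (
        "fmt"
        "sync"
    )

    type device struct {
        id   string
        free uint64 // volatile: refreshed on every query
    }

    var (
        mu           sync.Mutex
        bootstrapped bool // set once the expensive discovery has run
        devices      []device
    )

    // discover stands in for the expensive path: loading management
    // libraries and enumerating devices.
    func discover() []device {
        return []device{{id: "GPU-0", free: 8 << 30}}
    }

    // refreshFree stands in for the cheap path: re-reading free memory
    // for devices that are already known.
    func refreshFree(ds []device) {
        for i := range ds {
            ds[i].free = 7 << 30 // pretend the driver reported a new value
        }
    }

    func getDevices() []device {
        mu.Lock()
        defer mu.Unlock()
        if !bootstrapped {
            devices = discover() // discovery also fills in free memory
            bootstrapped = true
            return devices
        }
        refreshFree(devices) // later calls skip discovery entirely
        return devices
    }

    func main() {
        fmt.Println(getDevices()) // first call bootstraps
        fmt.Println(getDevices()) // second call only refreshes
    }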

gpu/amd_linux.go

@@ -44,8 +44,8 @@ var (
 )
 
 // Gather GPU information from the amdgpu driver if any supported GPUs are detected
-func AMDGetGPUInfo() []GpuInfo {
-	resp := []GpuInfo{}
+func AMDGetGPUInfo() []RocmGPUInfo {
+	resp := []RocmGPUInfo{}
 	if !AMDDetected() {
 		return resp
 	}
@@ -178,7 +178,7 @@ func AMDGetGPUInfo() []GpuInfo {
 		// Shouldn't happen, but just in case...
 		if gpuID < 0 {
 			slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
-			return []GpuInfo{}
+			return []RocmGPUInfo{}
 		}
 
 		if int(major) < RocmComputeMin {
@@ -189,6 +189,7 @@ func AMDGetGPUInfo() []GpuInfo {
 		// Look up the memory for the current node
 		totalMemory := uint64(0)
 		usedMemory := uint64(0)
+		var usedFile string
 		mapping := []struct {
 			id       uint64
 			filename string
@@ -255,22 +256,10 @@ func AMDGetGPUInfo() []GpuInfo {
 				break
 			}
 
-		usedFile := filepath.Join(devDir, DRMUsedMemoryFile)
-		usedFp, err := os.Open(usedFile)
+		usedFile = filepath.Join(devDir, DRMUsedMemoryFile)
+		usedMemory, err = getFreeMemory(usedFile)
 		if err != nil {
-			slog.Debug("failed to open sysfs node", "file", usedFile, "error", err)
-			break
-		}
-		defer totalFp.Close()
-		buf, err = io.ReadAll(usedFp)
-		if err != nil {
-			slog.Debug("failed to read sysfs node", "file", usedFile, "error", err)
-			break
-		}
-		usedMemory, err = strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
-		if err != nil {
-			slog.Debug("failed to parse sysfs node", "file", usedFile, "error", err)
-			break
+			slog.Debug("failed to update used memory", "error", err)
 		}
 		break
 	}
@@ -288,7 +277,8 @@ func AMDGetGPUInfo() []GpuInfo {
 
 		slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
 		slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
-		gpuInfo := GpuInfo{
+		gpuInfo := RocmGPUInfo{
+			GpuInfo: GpuInfo{
 				Library: "rocm",
 				memInfo: memInfo{
 					TotalMemory: totalMemory,
@@ -300,6 +290,8 @@ func AMDGetGPUInfo() []GpuInfo {
 				MinimumMemory: rocmMinimumMemory,
 				DriverMajor:   driverMajor,
 				DriverMinor:   driverMinor,
+			},
+			usedFilepath: usedFile,
 		}
 
 		// If the user wants to filter to a subset of devices, filter out if we aren't a match
@@ -323,7 +315,7 @@ func AMDGetGPUInfo() []GpuInfo {
 			libDir, err = AMDValidateLibDir()
 			if err != nil {
 				slog.Warn("unable to verify rocm library, will use cpu", "error", err)
-				return []GpuInfo{}
+				return []RocmGPUInfo{}
 			}
 		}
 		gpuInfo.DependencyPath = libDir
@@ -334,7 +326,7 @@ func AMDGetGPUInfo() []GpuInfo {
 			supported, err = GetSupportedGFX(libDir)
 			if err != nil {
 				slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
-				return []GpuInfo{}
+				return []RocmGPUInfo{}
 			}
 			slog.Debug("rocm supported GPUs", "types", supported)
 		}
@@ -425,3 +417,36 @@ func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
 	}
 	return driverMajor, driverMinor, nil
 }
+
+func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
+	if len(gpus) == 0 {
+		return nil
+	}
+	for i := range gpus {
+		usedMemory, err := getFreeMemory(gpus[i].usedFilepath)
+		if err != nil {
+			return err
+		}
+		slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(gpus[i].TotalMemory-usedMemory))
+		gpus[i].FreeMemory = gpus[i].TotalMemory - usedMemory
+	}
+	return nil
+}
+
+func getFreeMemory(usedFile string) (uint64, error) {
+	usedFp, err := os.Open(usedFile)
+	if err != nil {
+		return 0, fmt.Errorf("failed to open sysfs node %s %w", usedFile, err)
+	}
+	defer usedFp.Close()
+	buf, err := io.ReadAll(usedFp)
+	if err != nil {
+		return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err)
+	}
+	usedMemory, err := strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
+	if err != nil {
+		slog.Debug("failed to parse sysfs node", "file", usedFile, "error", err)
+		return 0, fmt.Errorf("failed to parse sysfs node %s %w", usedFile, err)
+	}
+	return usedMemory, nil
+}
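
On Linux the refresh path is now just a file read: RefreshFreeMemory re-reads the sysfs node whose path was cached at discovery time. Below is a standalone, condensed version of that read; the card0 path is illustrative, since the real code works out the right DRM device during discovery.

    package main

    import (
        "fmt"
        "os"
        "strconv"
        "strings"
    )

    // readSysfsUint reads a single decimal value from an amdgpu sysfs node.
    // os.ReadFile is equivalent to the Open + ReadAll pair in getFreeMemory.
    func readSysfsUint(path string) (uint64, error) {
        buf, err := os.ReadFile(path)
        if err != nil {
            return 0, fmt.Errorf("failed to read sysfs node %s: %w", path, err)
        }
        return strconv.ParseUint(strings.TrimSpace(string(buf)), 10, 64)
    }

    func main() {
        used, err := readSysfsUint("/sys/class/drm/card0/device/mem_info_vram_used")
        if err != nil {
            fmt.Println("lookup failed:", err)
            return
        }
        fmt.Printf("VRAM used: %d bytes\n", used)
    }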

gpu/amd_windows.go

@@ -24,8 +24,8 @@ var (
 	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
 )
 
-func AMDGetGPUInfo() []GpuInfo {
-	resp := []GpuInfo{}
+func AMDGetGPUInfo() []RocmGPUInfo {
+	resp := []RocmGPUInfo{}
 	hl, err := NewHipLib()
 	if err != nil {
 		slog.Debug(err.Error())
@@ -117,7 +117,8 @@ func AMDGetGPUInfo() []GpuInfo {
 		// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
 		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
 		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
-		gpuInfo := GpuInfo{
+		gpuInfo := RocmGPUInfo{
+			GpuInfo: GpuInfo{
 				Library: "rocm",
 				memInfo: memInfo{
 					TotalMemory: totalMemory,
@@ -132,6 +133,8 @@ func AMDGetGPUInfo() []GpuInfo {
 				// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
 				// DriverMajor: driverMajor,
 				// DriverMinor: driverMinor,
+			},
+			index: i,
 		}
 
 		resp = append(resp, gpuInfo)
@@ -159,3 +162,30 @@ func AMDValidateLibDir() (string, error) {
 	slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
 	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
 }
+
+func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
+	if len(gpus) == 0 {
+		return nil
+	}
+	hl, err := NewHipLib()
+	if err != nil {
+		slog.Debug(err.Error())
+		return nil
+	}
+	defer hl.Release()
+	for i := range gpus {
+		err := hl.HipSetDevice(gpus[i].index)
+		if err != nil {
+			return err
+		}
+		freeMemory, _, err := hl.HipMemGetInfo()
+		if err != nil {
+			slog.Warn("get mem info", "id", i, "error", err)
+			continue
+		}
+		slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(freeMemory))
+		gpus[i].FreeMemory = freeMemory
+	}
+	return nil
+}
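
Worth noting how the two ports cache different refresh keys: on Linux each RocmGPUInfo carries the sysfs usedFilepath so a refresh is a plain file read, while on Windows it carries the HIP device index and refreshes through HipSetDevice and HipMemGetInfo. The Windows refresh also degrades gracefully: if NewHipLib fails, it logs and returns nil rather than an error.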

gpu/cpu_common.go

@@ -1,21 +1,20 @@
 package gpu
 
 import (
-	"log/slog"
-
 	"golang.org/x/sys/cpu"
 )
 
 func GetCPUVariant() string {
+	return getCPUCapability().ToVariant()
+}
+
+func getCPUCapability() CPUCapability {
 	if cpu.X86.HasAVX2 {
-		slog.Debug("CPU has AVX2")
-		return "avx2"
+		return CPUCapabilityAVX2
 	}
 	if cpu.X86.HasAVX {
-		slog.Debug("CPU has AVX")
-		return "avx"
+		return CPUCapabilityAVX
 	}
-	slog.Debug("CPU does not have vector extensions")
 	// else LCD
-	return ""
+	return CPUCapabilityBase
 }
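
The string variant is now derived from an ordered capability level, which lets callers compare capabilities numerically instead of matching strings. A self-contained sketch of the same idea follows; GPURunnerCPUCapability mirrors the repo's build-time default of AVX.

    package main

    import (
        "fmt"

        "golang.org/x/sys/cpu"
    )

    // CPUCapability levels are declared in ascending order so that < and >
    // comparisons reflect feature strength.
    type CPUCapability uint32

    const (
        CPUCapabilityBase CPUCapability = iota
        CPUCapabilityAVX
        CPUCapabilityAVX2
    )

    func (c CPUCapability) String() string {
        switch c {
        case CPUCapabilityAVX:
            return "AVX"
        case CPUCapabilityAVX2:
            return "AVX2"
        default:
            return "no vector extensions"
        }
    }

    func getCPUCapability() CPUCapability {
        if cpu.X86.HasAVX2 {
            return CPUCapabilityAVX2
        }
        if cpu.X86.HasAVX {
            return CPUCapabilityAVX
        }
        return CPUCapabilityBase
    }

    func main() {
        required := CPUCapabilityAVX // the GPU runners are built with AVX enabled
        if detected := getCPUCapability(); detected < required {
            fmt.Println("falling back to CPU-only mode, detected:", detected)
        } else {
            fmt.Println("vector extensions sufficient:", detected)
        }
    }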

gpu/gpu.go

@@ -21,8 +21,8 @@ import (
 	"sync"
 	"unsafe"
 
-	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/format"
 )
 
 type handles struct {
@@ -37,7 +37,18 @@ const (
 	rocmMinimumMemory = 457 * format.MebiByte
 )
 
-var gpuMutex sync.Mutex
+var (
+	gpuMutex      sync.Mutex
+	bootstrapped  bool
+	cpuCapability CPUCapability
+	cpus          []CPUInfo
+	cudaGPUs      []CudaGPUInfo
+	nvcudaLibPath string
+	cudartLibPath string
+	oneapiLibPath string
+	rocmGPUs      []RocmGPUInfo
+	oneapiGPUs    []OneapiGPUInfo
+)
 
 // With our current CUDA compile flags, older than 5.0 will not work properly
 var CudaComputeMin = [2]C.int{5, 0}
@@ -96,11 +107,22 @@ var OneapiLinuxGlobs = []string{
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
 
 // Note: gpuMutex must already be held
-func initGPUHandles() *handles {
+func initCudaHandles() *handles {
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
 	gpuHandles := &handles{}
+
+	// Short Circuit if we already know which library to use
+	if nvcudaLibPath != "" {
+		gpuHandles.deviceCount, gpuHandles.nvcuda, _ = LoadNVCUDAMgmt([]string{nvcudaLibPath})
+		return gpuHandles
+	}
+	if cudartLibPath != "" {
+		gpuHandles.deviceCount, gpuHandles.cudart, _ = LoadCUDARTMgmt([]string{cudartLibPath})
+		return gpuHandles
+	}
+
 	slog.Debug("searching for GPU discovery libraries for NVIDIA")
 	var cudartMgmtName string
 	var cudartMgmtPatterns []string
 	var nvcudaMgmtName string
@@ -136,7 +158,6 @@ func initGPUHandles() *handles {
 		return gpuHandles
 	}
 
-	slog.Debug("Detecting GPUs")
 	nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
 	if len(nvcudaLibPaths) > 0 {
 		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
@@ -144,6 +165,7 @@ func initGPUHandles() *handles {
 			slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
 			gpuHandles.nvcuda = nvcuda
 			gpuHandles.deviceCount = deviceCount
+			nvcudaLibPath = libPath
 			return gpuHandles
 		}
 	}
@@ -155,6 +177,7 @@ func initGPUHandles() *handles {
 			slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
 			gpuHandles.cudart = cudart
 			gpuHandles.deviceCount = deviceCount
+			cudartLibPath = libPath
 			return gpuHandles
 		}
 	}
@@ -166,6 +189,7 @@ func initGPUHandles() *handles {
 			slog.Debug("detected Intel GPUs", "library", libPath, "count", deviceCount)
 			gpuHandles.oneapi = oneapi
 			gpuHandles.deviceCount = deviceCount
+			oneapiLibPath = libPath
 			return gpuHandles
 		}
 	}
@@ -178,9 +202,12 @@ func GetGPUInfo() GpuInfoList {
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
 	gpuMutex.Lock()
 	defer gpuMutex.Unlock()
-	gpuHandles := initGPUHandles()
+	needRefresh := true
+	var gpuHandles *handles
 	defer func() {
+		if gpuHandles == nil {
+			return
+		}
 		if gpuHandles.cudart != nil {
 			C.cudart_release(*gpuHandles.cudart)
 		}
@@ -189,11 +216,39 @@ func GetGPUInfo() GpuInfoList {
 		}
 	}()
 
-	// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
-	cpuVariant := GetCPUVariant()
-	if cpuVariant == "" && runtime.GOARCH == "amd64" {
-		slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
-	}
+	if !bootstrapped {
+		slog.Debug("Detecting GPUs")
+		needRefresh = false
+		cpuCapability = getCPUCapability()
+		var memInfo C.mem_info_t
+		C.cpu_check_ram(&memInfo)
+		if memInfo.err != nil {
+			slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
+			C.free(unsafe.Pointer(memInfo.err))
+			return []GpuInfo{}
+		}
+		cpuInfo := CPUInfo{
+			GpuInfo: GpuInfo{
+				Library: "cpu",
+				Variant: cpuCapability.ToVariant(),
+			},
+		}
+		cpuInfo.TotalMemory = uint64(memInfo.total)
+		cpuInfo.FreeMemory = uint64(memInfo.free)
+		cpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+		cpus = []CPUInfo{cpuInfo}
+
+		// Fallback to CPU mode if we're lacking required vector extensions on x86
+		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
+			slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability.ToString(), "detected", cpuCapability.ToString())
+			bootstrapped = true
+			// No need to do any GPU discovery, since we can't run on them
+			return GpuInfoList{cpus[0].GpuInfo}
+		}
+
+		// TODO - implement
+
+		// TODO refine the discovery to only gather total memory
 
 	// On windows we bundle the nvidia library one level above the runner dir
 	depPath := ""
@@ -201,25 +256,26 @@ func GetGPUInfo() GpuInfoList {
 		depPath = filepath.Dir(envconfig.RunnersDir)
 	}
 
-	var memInfo C.mem_info_t
-	resp := []GpuInfo{}
+	// Load ALL libraries
+	gpuHandles = initCudaHandles()
 
-	// NVIDIA first
+	// TODO needs a refactoring pass to init oneapi handles
+	// NVIDIA
 	for i := range gpuHandles.deviceCount {
-		// TODO once we support CPU compilation variants of GPU libraries refine this...
-		if cpuVariant == "" && runtime.GOARCH == "amd64" {
-			continue
-		}
 		if gpuHandles.cudart != nil || gpuHandles.nvcuda != nil {
-			gpuInfo := GpuInfo{
+			gpuInfo := CudaGPUInfo{
+				GpuInfo: GpuInfo{
 					Library: "cuda",
+				},
+				index: i,
 			}
 			var driverMajor int
 			var driverMinor int
 			if gpuHandles.cudart != nil {
-				C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
+				C.cudart_bootstrap(*gpuHandles.cudart, C.int(i), &memInfo)
 			} else {
-				C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
+				C.nvcuda_bootstrap(*gpuHandles.nvcuda, C.int(i), &memInfo)
 				driverMajor = int(gpuHandles.nvcuda.driver_major)
 				driverMinor = int(gpuHandles.nvcuda.driver_minor)
 			}
@@ -239,47 +295,77 @@ func GetGPUInfo() GpuInfoList {
 			gpuInfo.MinimumMemory = cudaMinimumMemory
 			gpuInfo.DependencyPath = depPath
 			gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-			gpuInfo.DriverMajor = driverMajor
-			gpuInfo.DriverMinor = driverMinor
+			gpuInfo.DriverMajor = int(driverMajor)
+			gpuInfo.DriverMinor = int(driverMinor)
 
 			// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
-			resp = append(resp, gpuInfo)
+			cudaGPUs = append(cudaGPUs, gpuInfo)
 		}
 		if gpuHandles.oneapi != nil {
-			gpuInfo := GpuInfo{
+			gpuInfo := OneapiGPUInfo{
+				GpuInfo: GpuInfo{
 					Library: "oneapi",
+				},
+				index: i,
 			}
+			// TODO - split bootstrapping from updating free memory
 			C.oneapi_check_vram(*gpuHandles.oneapi, &memInfo)
 			var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
 			memInfo.free = C.uint64_t(totalFreeMem)
 			gpuInfo.TotalMemory = uint64(memInfo.total)
 			gpuInfo.FreeMemory = uint64(memInfo.free)
 			gpuInfo.ID = strconv.Itoa(i)
-			resp = append(resp, gpuInfo)
+			oneapiGPUs = append(oneapiGPUs, gpuInfo)
 		}
 	}
 
-	// Then AMD
-	resp = append(resp, AMDGetGPUInfo()...)
+	rocmGPUs = AMDGetGPUInfo()
+	bootstrapped = true
+	}
 
-	if len(resp) == 0 {
-		C.cpu_check_ram(&memInfo)
+	// For detected GPUs, load library if not loaded
+
+	// Refresh free memory usage
+	if needRefresh {
+		// TODO - CPU system memory tracking/refresh
+		var memInfo C.mem_info_t
+		if gpuHandles == nil && len(cudaGPUs) > 0 {
+			gpuHandles = initCudaHandles()
+		}
+		for i, gpu := range cudaGPUs {
+			if gpuHandles.cudart != nil {
+				C.cudart_bootstrap(*gpuHandles.cudart, C.int(gpu.index), &memInfo)
+			} else {
+				C.nvcuda_get_free(*gpuHandles.nvcuda, C.int(gpu.index), &memInfo.free)
+			}
 			if memInfo.err != nil {
-				slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
+				slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
 				C.free(unsafe.Pointer(memInfo.err))
-			return resp
+				continue
 			}
-		gpuInfo := GpuInfo{
-			Library: "cpu",
-			Variant: cpuVariant,
+			if memInfo.free == 0 {
+				slog.Warn("error looking up nvidia GPU memory")
+				continue
 			}
+			slog.Debug("updating cuda free memory", "gpu", gpu.ID, "name", gpu.Name, "before", format.HumanBytes2(gpu.FreeMemory), "now", format.HumanBytes2(uint64(memInfo.free)))
+			cudaGPUs[i].FreeMemory = uint64(memInfo.free)
+		}
+		err := RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
+		if err != nil {
+			slog.Debug("problem refreshing ROCm free memory", "error", err)
+		}
-		gpuInfo.TotalMemory = uint64(memInfo.total)
-		gpuInfo.FreeMemory = uint64(memInfo.free)
-		gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-		resp = append(resp, gpuInfo)
 	}
 
+	resp := []GpuInfo{}
+	for _, gpu := range cudaGPUs {
+		resp = append(resp, gpu.GpuInfo)
+	}
+	for _, gpu := range rocmGPUs {
+		resp = append(resp, gpu.GpuInfo)
+	}
+	if len(resp) == 0 {
+		resp = append(resp, cpus[0].GpuInfo)
+	}
 	return resp
 }
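
A detail that is easy to miss in the new GetGPUInfo: gpuHandles starts out nil and is only initialized on paths that actually need the CUDA libraries, so the deferred release has to tolerate nil. A minimal sketch of that lifecycle, with illustrative names:

    package main

    import "fmt"

    // handles stands in for a loaded GPU management library.
    type handles struct{ name string }

    func (h *handles) release() { fmt.Println("released", h.name) }

    func initHandles() *handles { return &handles{name: "nvcuda"} }

    // getGPUInfo mirrors the new control flow: the handle pointer starts nil,
    // is populated only when there is GPU work to do, and the deferred
    // cleanup checks for nil so early returns on CPU-only paths stay safe.
    func getGPUInfo(haveCudaGPUs bool) {
        var h *handles
        defer func() {
            if h == nil {
                return // nothing was loaded on this path
            }
            h.release()
        }()
        if !haveCudaGPUs {
            return // e.g. CPU fallback: h stays nil, defer is a no-op
        }
        h = initHandles()
        fmt.Println("refreshing free memory via", h.name)
    }

    func main() {
        getGPUInfo(false)
        getGPUInfo(true)
    }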

gpu/gpu_info_cudart.c

@@ -94,7 +94,7 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
 }
 
-void cudart_check_vram(cudart_handle_t h, int i, mem_info_t *resp) {
+void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
   resp->err = NULL;
   cudartMemory_t memInfo = {0,0,0};
   cudartReturn_t ret;

gpu/gpu_info_cudart.h

@@ -140,7 +140,8 @@ typedef struct cudart_init_resp {
 } cudart_init_resp_t;
 
 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
-void cudart_check_vram(cudart_handle_t ch, int device_id, mem_info_t *resp);
+void cudart_bootstrap(cudart_handle_t ch, int device_id, mem_info_t *resp);
+// TODO - if we keep this library longer term, add cudart_get_free
 void cudart_release(cudart_handle_t ch);
 
 #endif  // __GPU_INFO_CUDART_H__

gpu/gpu_info_nvcuda.c

@@ -96,7 +96,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
 }
 
 const int buflen = 256;
-void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
+void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
   resp->err = NULL;
   nvcudaMemory_t memInfo = {0,0};
   CUresult ret;
@@ -168,7 +168,7 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
   // To get memory we have to set (and release) a context
   ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
   if (ret != CUDA_SUCCESS) {
-    snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
+    snprintf(buf, buflen, "nvcuda failed to get device context %d", ret);
     resp->err = strdup(buf);
     return;
   }
@@ -193,7 +193,42 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
   ret = (*h.cuCtxDestroy)(ctx);
   if (ret != CUDA_SUCCESS) {
-    LOG(1, "nvcuda failed to release primary device context %d", ret);
+    LOG(1, "nvcuda failed to release device context %d", ret);
   }
 }
+
+void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free) {
+  CUresult ret;
+  CUcontext ctx = NULL;
+  CUdevice device = -1;
+  *free = 0;
+  uint64_t total = 0;
+
+  ret = (*h.cuDeviceGet)(&device, i);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda device failed to initialize");
+    return;
+  }
+
+  // To get memory we have to set (and release) a context
+  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda failed to get device context %d", ret);
+    return;
+  }
+
+  ret = (*h.cuMemGetInfo_v2)(free, &total);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda device memory info lookup failure %d", ret);
+    // Best effort on failure...
+    (*h.cuCtxDestroy)(ctx);
+    return;
+  }
+
+  ret = (*h.cuCtxDestroy)(ctx);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda failed to release device context %d", ret);
+  }
+}
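
nvcuda_get_free repeats the context create/query/destroy sequence from the bootstrap path because the driver API only answers cuMemGetInfo for a current context. On any failure it destroys the context best-effort and leaves *free at zero, which the Go caller treats as a failed lookup and skips the update for that device.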

gpu/gpu_info_nvcuda.h

@@ -67,7 +67,8 @@ typedef struct nvcuda_init_resp {
 } nvcuda_init_resp_t;
 
 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
-void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
+void nvcuda_bootstrap(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
+void nvcuda_get_free(nvcuda_handle_t ch, int device_id, uint64_t *free);
 void nvcuda_release(nvcuda_handle_t ch);
 
 #endif  // __GPU_INFO_NVCUDA_H__

gpu/types.go

@@ -38,6 +38,29 @@ type GpuInfo struct {
 	// TODO other performance capability info to help in scheduling decisions
 }
 
+type CPUInfo struct {
+	GpuInfo
+}
+
+type CudaGPUInfo struct {
+	GpuInfo
+	index int // device index
+}
+type CudaGPUInfoList []CudaGPUInfo
+
+type RocmGPUInfo struct {
+	GpuInfo
+	usedFilepath string // linux
+	index        int    // device index on windows
+}
+type RocmGPUInfoList []RocmGPUInfo
+
+type OneapiGPUInfo struct {
+	GpuInfo
+	index int // device index
+}
+type OneapiGPUInfoList []OneapiGPUInfo
+
 type GpuInfoList []GpuInfo
 
 // Split up the set of gpu info's by Library and variant
@@ -86,3 +109,37 @@ type ByFreeMemory []GpuInfo
 func (a ByFreeMemory) Len() int           { return len(a) }
 func (a ByFreeMemory) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
+
+type CPUCapability uint32
+
+// Override at build time when building base GPU runners
+var GPURunnerCPUCapability = CPUCapabilityAVX
+
+const (
+	CPUCapabilityBase CPUCapability = iota
+	CPUCapabilityAVX
+	CPUCapabilityAVX2
+	// TODO AVX512
+)
+
+func (c CPUCapability) ToString() string {
+	switch c {
+	case CPUCapabilityAVX:
+		return "AVX"
+	case CPUCapabilityAVX2:
+		return "AVX2"
+	default:
+		return "no vector extensions"
+	}
+}
+
+func (c CPUCapability) ToVariant() string {
+	switch c {
+	case CPUCapabilityAVX:
+		return "avx"
+	case CPUCapabilityAVX2:
+		return "avx2"
+	default:
+		return ""
+	}
+}
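
Because the capability constants are declared in ascending iota order, checks like cpuCapability < GPURunnerCPUCapability in gpu.go compare feature levels directly instead of parsing variant strings, and the pending AVX512 support only needs one more constant after CPUCapabilityAVX2.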