Record more GPU information

This cleans up the logging for GPU discovery a bit, and can
serve as a foundation to report GPU information in a future UX.
This commit is contained in:
Daniel Hiltgen 2024-05-07 14:54:26 -07:00
parent d0425f26cf
commit 8727a9c140
10 changed files with 150 additions and 96 deletions

View file

@ -3,7 +3,6 @@ package gpu
import ( import (
"fmt" "fmt"
"log/slog" "log/slog"
"strconv"
"syscall" "syscall"
"unsafe" "unsafe"
@ -74,16 +73,22 @@ func (hl *HipLib) Release() {
hl.dll = 0 hl.dll = 0
} }
func (hl *HipLib) AMDDriverVersion() (string, error) { func (hl *HipLib) AMDDriverVersion() (driverMajor, driverMinor int, err error) {
if hl.dll == 0 { if hl.dll == 0 {
return "", fmt.Errorf("dll has been unloaded") return 0, 0, fmt.Errorf("dll has been unloaded")
} }
var version int var version int
status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version))) status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version)))
if status != hipSuccess { if status != hipSuccess {
return "", fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err) return 0, 0, fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
} }
return strconv.Itoa(version), nil
slog.Debug("hipDriverGetVersion", "version", version)
// TODO - this isn't actually right, but the docs claim hipDriverGetVersion isn't accurate anyway...
driverMajor = version / 1000
driverMinor = (version - (driverMajor * 1000)) / 10
return driverMajor, driverMinor, nil
} }
func (hl *HipLib) HipGetDeviceCount() int { func (hl *HipLib) HipGetDeviceCount() int {

View file

@ -8,6 +8,7 @@ import (
"log/slog" "log/slog"
"os" "os"
"path/filepath" "path/filepath"
"regexp"
"slices" "slices"
"strconv" "strconv"
"strings" "strings"
@ -41,10 +42,8 @@ func AMDGetGPUInfo() []GpuInfo {
} }
// Opportunistic logging of driver version to aid in troubleshooting // Opportunistic logging of driver version to aid in troubleshooting
ver, err := AMDDriverVersion() driverMajor, driverMinor, err := AMDDriverVersion()
if err == nil { if err != nil {
slog.Info("AMD Driver: " + ver)
} else {
// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU // TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err) slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
} }
@ -91,6 +90,7 @@ func AMDGetGPUInfo() []GpuInfo {
scanner := bufio.NewScanner(fp) scanner := bufio.NewScanner(fp)
isCPU := false isCPU := false
var major, minor, patch uint64 var major, minor, patch uint64
var vendor, device uint64
for scanner.Scan() { for scanner.Scan() {
line := strings.TrimSpace(scanner.Text()) line := strings.TrimSpace(scanner.Text())
// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs // Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
@ -118,6 +118,26 @@ func AMDGetGPUInfo() []GpuInfo {
slog.Debug("malformed int " + line) slog.Debug("malformed int " + line)
continue continue
} }
} else if strings.HasPrefix(line, "vendor_id") {
ver := strings.Fields(line)
if len(ver) != 2 {
slog.Debug("malformed vendor_id", "vendor_id", line)
continue
}
vendor, err = strconv.ParseUint(ver[1], 10, 32)
if err != nil {
slog.Debug("malformed vendor_id" + line)
}
} else if strings.HasPrefix(line, "device_id") {
ver := strings.Fields(line)
if len(ver) != 2 {
slog.Debug("malformed device_id", "device_id", line)
continue
}
device, err = strconv.ParseUint(ver[1], 10, 32)
if err != nil {
slog.Debug("malformed device_id" + line)
}
} }
// TODO - any other properties we want to extract and record? // TODO - any other properties we want to extract and record?
@ -140,7 +160,7 @@ func AMDGetGPUInfo() []GpuInfo {
} }
if int(major) < RocmComputeMin { if int(major) < RocmComputeMin {
slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%d%x", major, minor, patch), "gpu", gpuID) slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch), "gpu", gpuID)
continue continue
} }
@ -210,12 +230,17 @@ func AMDGetGPUInfo() []GpuInfo {
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library // iGPU detection, remove this check once we can support an iGPU variant of the rocm library
if totalMemory < IGPUMemLimit { if totalMemory < IGPUMemLimit {
slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", gpuID, "total", format.HumanBytes2(totalMemory)) slog.Info("unsupported Radeon iGPU detected skipping", "id", gpuID, "total", format.HumanBytes2(totalMemory))
continue continue
} }
var name string
// TODO - PCI ID lookup
if vendor > 0 && device > 0 {
name = fmt.Sprintf("%04x:%04x", vendor, device)
}
slog.Info("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory)) slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
slog.Info("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory)) slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
gpuInfo := GpuInfo{ gpuInfo := GpuInfo{
Library: "rocm", Library: "rocm",
memInfo: memInfo{ memInfo: memInfo{
@ -223,11 +248,11 @@ func AMDGetGPUInfo() []GpuInfo {
FreeMemory: (totalMemory - usedMemory), FreeMemory: (totalMemory - usedMemory),
}, },
ID: fmt.Sprintf("%d", gpuID), ID: fmt.Sprintf("%d", gpuID),
// Name: not exposed in sysfs directly, would require pci device id lookup Name: name,
Major: int(major), Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
Minor: int(minor),
Patch: int(patch),
MinimumMemory: rocmMinimumMemory, MinimumMemory: rocmMinimumMemory,
DriverMajor: driverMajor,
DriverMinor: driverMinor,
} }
// If the user wants to filter to a subset of devices, filter out if we aren't a match // If the user wants to filter to a subset of devices, filter out if we aren't a match
@ -266,7 +291,7 @@ func AMDGetGPUInfo() []GpuInfo {
} }
slog.Debug("rocm supported GPUs", "types", supported) slog.Debug("rocm supported GPUs", "types", supported)
} }
gfx := fmt.Sprintf("gfx%d%d%x", gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch) gfx := gpuInfo.Compute
if !slices.Contains[[]string, string](supported, gfx) { if !slices.Contains[[]string, string](supported, gfx) {
slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported) slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported)
// TODO - consider discrete markdown just for ROCM troubleshooting? // TODO - consider discrete markdown just for ROCM troubleshooting?
@ -276,7 +301,7 @@ func AMDGetGPUInfo() []GpuInfo {
slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx) slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx)
} }
} else { } else {
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride) slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
} }
// The GPU has passed all the verification steps and is supported // The GPU has passed all the verification steps and is supported
@ -322,19 +347,34 @@ func AMDValidateLibDir() (string, error) {
return "", fmt.Errorf("no suitable rocm found, falling back to CPU") return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
} }
func AMDDriverVersion() (string, error) { func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
_, err := os.Stat(DriverVersionFile) _, err = os.Stat(DriverVersionFile)
if err != nil { if err != nil {
return "", fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err) return 0, 0, fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
} }
fp, err := os.Open(DriverVersionFile) fp, err := os.Open(DriverVersionFile)
if err != nil { if err != nil {
return "", err return 0, 0, err
} }
defer fp.Close() defer fp.Close()
verString, err := io.ReadAll(fp) verString, err := io.ReadAll(fp)
if err != nil { if err != nil {
return "", err return 0, 0, err
} }
return strings.TrimSpace(string(verString)), nil
pattern := `\A(\d+)\.(\d+).*`
regex := regexp.MustCompile(pattern)
match := regex.FindStringSubmatch(string(verString))
if len(match) < 2 {
return 0, 0, fmt.Errorf("malformed version string %s", string(verString))
}
driverMajor, err = strconv.Atoi(match[1])
if err != nil {
return 0, 0, err
}
driverMinor, err = strconv.Atoi(match[2])
if err != nil {
return 0, 0, err
}
return driverMajor, driverMinor, nil
} }

View file

@ -7,7 +7,6 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"slices" "slices"
"strconv"
"strings" "strings"
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
@ -34,13 +33,12 @@ func AMDGetGPUInfo() []GpuInfo {
} }
defer hl.Release() defer hl.Release()
ver, err := hl.AMDDriverVersion() // TODO - this reports incorrect version information, so omitting for now
if err == nil { // driverMajor, driverMinor, err := hl.AMDDriverVersion()
slog.Info("AMD Driver: " + ver) // if err != nil {
} else { // // For now this is benign, but we may eventually need to fail compatibility checks
// For now this is benign, but we may eventually need to fail compatibility checks // slog.Debug("error looking up amd driver version", "error", err)
slog.Debug("error looking up amd driver version", "error", err) // }
}
// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified // Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
count := hl.HipGetDeviceCount() count := hl.HipGetDeviceCount()
@ -62,10 +60,10 @@ func AMDGetGPUInfo() []GpuInfo {
return nil return nil
} }
} else { } else {
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride) slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
} }
slog.Info("detected hip devices", "count", count) slog.Debug("detected hip devices", "count", count)
// TODO how to determine the underlying device ID when visible devices is causing this to subset? // TODO how to determine the underlying device ID when visible devices is causing this to subset?
for i := 0; i < count; i++ { for i := 0; i < count; i++ {
err = hl.HipSetDevice(i) err = hl.HipSetDevice(i)
@ -85,18 +83,11 @@ func AMDGetGPUInfo() []GpuInfo {
// Can luid be used on windows for setting visible devices (and is it actually set?) // Can luid be used on windows for setting visible devices (and is it actually set?)
n = bytes.IndexByte(props.GcnArchName[:], 0) n = bytes.IndexByte(props.GcnArchName[:], 0)
gfx := string(props.GcnArchName[:n]) gfx := string(props.GcnArchName[:n])
slog.Info("hip device", "id", i, "name", name, "gfx", gfx) slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
var major, minor, patch string
switch len(gfx) {
case 6:
major, minor, patch = gfx[3:4], gfx[4:5], gfx[5:]
case 7:
major, minor, patch = gfx[3:5], gfx[5:6], gfx[6:]
}
//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0 //slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
// TODO Why isn't props.iGPU accurate!? // TODO Why isn't props.iGPU accurate!?
if strings.EqualFold(name, iGPUName) { if strings.EqualFold(name, iGPUName) {
slog.Info("iGPU detected skipping", "id", i) slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx)
continue continue
} }
if gfxOverride == "" { if gfxOverride == "" {
@ -106,7 +97,7 @@ func AMDGetGPUInfo() []GpuInfo {
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage") slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
continue continue
} else { } else {
slog.Info("amdgpu is supported", "gpu", i, "gpu_type", gfx) slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
} }
} }
@ -124,8 +115,8 @@ func AMDGetGPUInfo() []GpuInfo {
// TODO revisit this once ROCm v6 is available on windows. // TODO revisit this once ROCm v6 is available on windows.
// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable // v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
slog.Info("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory)) slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
slog.Info("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory)) slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
gpuInfo := GpuInfo{ gpuInfo := GpuInfo{
Library: "rocm", Library: "rocm",
memInfo: memInfo{ memInfo: memInfo{
@ -135,31 +126,12 @@ func AMDGetGPUInfo() []GpuInfo {
ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
DependencyPath: libDir, DependencyPath: libDir,
MinimumMemory: rocmMinimumMemory, MinimumMemory: rocmMinimumMemory,
} Name: name,
if major != "" { Compute: gfx,
gpuInfo.Major, err = strconv.Atoi(major)
if err != nil { // TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
slog.Info("failed to parse version", "version", gfx, "error", err) // DriverMajor: driverMajor,
} // DriverMinor: driverMinor,
}
if minor != "" {
gpuInfo.Minor, err = strconv.Atoi(minor)
if err != nil {
slog.Info("failed to parse version", "version", gfx, "error", err)
}
}
if patch != "" {
// Patch rev is hex; e.g. gfx90a
p, err := strconv.ParseInt(patch, 16, 0)
if err != nil {
slog.Info("failed to parse version", "version", gfx, "error", err)
} else {
gpuInfo.Patch = int(p)
}
}
if gpuInfo.Major < RocmComputeMin {
slog.Warn(fmt.Sprintf("amdgpu [%s] too old gfx%d%d%x", gpuInfo.ID, gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch))
continue
} }
resp = append(resp, gpuInfo) resp = append(resp, gpuInfo)

View file

@ -119,12 +119,12 @@ func initGPUHandles() *handles {
return gpuHandles return gpuHandles
} }
slog.Info("Detecting GPUs") slog.Debug("Detecting GPUs")
nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns) nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
if len(nvcudaLibPaths) > 0 { if len(nvcudaLibPaths) > 0 {
deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths) deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
if nvcuda != nil { if nvcuda != nil {
slog.Info("detected GPUs", "count", deviceCount, "library", libPath) slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
gpuHandles.nvcuda = nvcuda gpuHandles.nvcuda = nvcuda
gpuHandles.deviceCount = deviceCount gpuHandles.deviceCount = deviceCount
return gpuHandles return gpuHandles
@ -135,7 +135,7 @@ func initGPUHandles() *handles {
if len(cudartLibPaths) > 0 { if len(cudartLibPaths) > 0 {
deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths) deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
if cudart != nil { if cudart != nil {
slog.Info("detected GPUs", "library", libPath, "count", deviceCount) slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
gpuHandles.cudart = cudart gpuHandles.cudart = cudart
gpuHandles.deviceCount = deviceCount gpuHandles.deviceCount = deviceCount
return gpuHandles return gpuHandles
@ -184,10 +184,14 @@ func GetGPUInfo() GpuInfoList {
gpuInfo := GpuInfo{ gpuInfo := GpuInfo{
Library: "cuda", Library: "cuda",
} }
var driverMajor int
var driverMinor int
if gpuHandles.cudart != nil { if gpuHandles.cudart != nil {
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo) C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
} else { } else {
C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo) C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
driverMajor = int(gpuHandles.nvcuda.driver_major)
driverMinor = int(gpuHandles.nvcuda.driver_minor)
} }
if memInfo.err != nil { if memInfo.err != nil {
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err)) slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
@ -201,10 +205,12 @@ func GetGPUInfo() GpuInfoList {
gpuInfo.TotalMemory = uint64(memInfo.total) gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free) gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Major = int(memInfo.major) gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
gpuInfo.Minor = int(memInfo.minor)
gpuInfo.MinimumMemory = cudaMinimumMemory gpuInfo.MinimumMemory = cudaMinimumMemory
gpuInfo.DependencyPath = depPath gpuInfo.DependencyPath = depPath
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DriverMajor = int(driverMajor)
gpuInfo.DriverMinor = int(driverMinor)
// TODO potentially sort on our own algorithm instead of what the underlying GPU library does... // TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
resp = append(resp, gpuInfo) resp = append(resp, gpuInfo)

View file

@ -39,16 +39,19 @@ extern "C" {
#endif #endif
#define GPU_ID_LEN 64 #define GPU_ID_LEN 64
#define GPU_NAME_LEN 96
typedef struct mem_info { typedef struct mem_info {
char *err; // If non-nill, caller responsible for freeing char *err; // If non-nill, caller responsible for freeing
char gpu_id[GPU_ID_LEN]; char gpu_id[GPU_ID_LEN];
char gpu_name[GPU_NAME_LEN];
uint64_t total; uint64_t total;
uint64_t free; uint64_t free;
// Compute Capability // Compute Capability
int major; int major;
int minor; int minor;
int patch;
} mem_info_t; } mem_info_t;
void cpu_check_ram(mem_info_t *resp); void cpu_check_ram(mem_info_t *resp);

View file

@ -10,8 +10,6 @@ void cpu_check_ram(mem_info_t *resp) {
if (GlobalMemoryStatusEx(&info) != 0) { if (GlobalMemoryStatusEx(&info) != 0) {
resp->total = info.ullTotalPhys; resp->total = info.ullTotalPhys;
resp->free = info.ullAvailPhys; resp->free = info.ullAvailPhys;
resp->major = 0;
resp->minor = 0;
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0"); snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
} else { } else {
resp->err = LOAD_ERR(); resp->err = LOAD_ERR();
@ -31,8 +29,6 @@ void cpu_check_ram(mem_info_t *resp) {
} else { } else {
resp->total = info.totalram * info.mem_unit; resp->total = info.totalram * info.mem_unit;
resp->free = info.freeram * info.mem_unit; resp->free = info.freeram * info.mem_unit;
resp->major = 0;
resp->minor = 0;
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0"); snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
} }
return; return;

View file

@ -22,6 +22,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
{"cuDeviceGet", (void *)&resp->ch.cuDeviceGet}, {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
{"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute}, {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
{"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid}, {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
{"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName},
{"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3}, {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
{"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2}, {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
{"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy}, {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
@ -70,18 +71,17 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
} }
int version = 0; int version = 0;
nvcudaDriverVersion_t driverVersion; resp->ch.driver_major = 0;
driverVersion.major = 0; resp->ch.driver_minor = 0;
driverVersion.minor = 0;
// Report driver version if we're in verbose mode, ignore errors // Report driver version if we're in verbose mode, ignore errors
ret = (*resp->ch.cuDriverGetVersion)(&version); ret = (*resp->ch.cuDriverGetVersion)(&version);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {
LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret); LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
} else { } else {
driverVersion.major = version / 1000; resp->ch.driver_major = version / 1000;
driverVersion.minor = (version - (driverVersion.major * 1000)) / 10; resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor); LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
} }
ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices); ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
@ -117,8 +117,6 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
return; return;
} }
resp->major = 0;
resp->minor = 0;
int major = 0; int major = 0;
int minor = 0; int minor = 0;
ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
@ -161,6 +159,12 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
); );
} }
ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device);
if (ret != CUDA_SUCCESS) {
LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret);
resp->gpu_name[0] = '\0';
}
// To get memory we have to set (and release) a context // To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device); ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) { if (ret != CUDA_SUCCESS) {

View file

@ -44,12 +44,15 @@ typedef void* CUcontext;
typedef struct nvcuda_handle { typedef struct nvcuda_handle {
void *handle; void *handle;
uint16_t verbose; uint16_t verbose;
int driver_major;
int driver_minor;
CUresult (*cuInit)(unsigned int Flags); CUresult (*cuInit)(unsigned int Flags);
CUresult (*cuDriverGetVersion)(int *driverVersion); CUresult (*cuDriverGetVersion)(int *driverVersion);
CUresult (*cuDeviceGetCount)(int *); CUresult (*cuDeviceGetCount)(int *);
CUresult (*cuDeviceGet)(CUdevice* device, int ordinal); CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2 CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
CUresult (*cuDeviceGetName)(char *name, int len, CUdevice dev);
// Context specific aspects // Context specific aspects
CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev); CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);

View file

@ -1,5 +1,12 @@
package gpu package gpu
import (
"fmt"
"log/slog"
"github.com/ollama/ollama/format"
)
type memInfo struct { type memInfo struct {
TotalMemory uint64 `json:"total_memory,omitempty"` TotalMemory uint64 `json:"total_memory,omitempty"`
FreeMemory uint64 `json:"free_memory,omitempty"` FreeMemory uint64 `json:"free_memory,omitempty"`
@ -22,9 +29,11 @@ type GpuInfo struct {
// GPU information // GPU information
ID string `json:"gpu_id"` // string to use for selection of this specific GPU ID string `json:"gpu_id"` // string to use for selection of this specific GPU
Name string `json:"name"` // user friendly name if available Name string `json:"name"` // user friendly name if available
Major int `json:"major,omitempty"` // Major compatibility version (CC or gfx) Compute string `json:"compute"` // Compute Capability or gfx
Minor int `json:"minor,omitempty"` // Minor compatibility version (CC or gfx)
Patch int `json:"patch,omitempty"` // Patch compatibility only matters on AMD // Driver Information - TODO no need to put this on each GPU
DriverMajor int `json:"driver_major,omitempty"`
DriverMinor int `json:"driver_minor,omitempty"`
// TODO other performance capability info to help in scheduling decisions // TODO other performance capability info to help in scheduling decisions
} }
@ -56,6 +65,21 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
return resp return resp
} }
// Report the GPU information into the log an Info level
func (l GpuInfoList) LogDetails() {
for _, g := range l {
slog.Info("inference compute",
"id", g.ID,
"library", g.Library,
"compute", g.Compute,
"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
"name", g.Name,
"total", format.HumanBytes2(g.TotalMemory),
"available", format.HumanBytes2(g.FreeMemory),
)
}
}
// Sort by Free Space // Sort by Free Space
type ByFreeMemory []GpuInfo type ByFreeMemory []GpuInfo

View file

@ -1065,7 +1065,8 @@ func Serve(ln net.Listener) error {
// At startup we retrieve GPU information so we can get log messages before loading a model // At startup we retrieve GPU information so we can get log messages before loading a model
// This will log warnings to the log in case we have problems with detected GPUs // This will log warnings to the log in case we have problems with detected GPUs
_ = gpu.GetGPUInfo() gpus := gpu.GetGPUInfo()
gpus.LogDetails()
return srvr.Serve(ln) return srvr.Serve(ln)
} }