Report more information about GPUs in verbose mode
This adds additional calls to both CUDA and ROCm management libraries to discover additional attributes about the GPU(s) detected in the system, and wires up runtime verbosity selection. When users hit problems with GPUs we can ask them to run with `OLLAMA_DEBUG=1 ollama serve` and share the results.
This commit is contained in:
parent
0759d8996e
commit
987c16b2f7
6 changed files with 171 additions and 30 deletions
|
@ -259,6 +259,7 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
|
|||
|
||||
func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
|
||||
var resp C.cuda_init_resp_t
|
||||
resp.ch.verbose = getVerboseState()
|
||||
for _, libPath := range cudaLibPaths {
|
||||
lib := C.CString(libPath)
|
||||
defer C.free(unsafe.Pointer(lib))
|
||||
|
@ -275,6 +276,7 @@ func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
|
|||
|
||||
func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
|
||||
var resp C.rocm_init_resp_t
|
||||
resp.rh.verbose = getVerboseState()
|
||||
for _, libPath := range rocmLibPaths {
|
||||
lib := C.CString(libPath)
|
||||
defer C.free(unsafe.Pointer(lib))
|
||||
|
@ -288,3 +290,10 @@ func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
|
|||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func getVerboseState() C.uint16_t {
|
||||
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
|
||||
return C.uint16_t(1)
|
||||
}
|
||||
return C.uint16_t(0)
|
||||
}
|
||||
|
|
|
@ -27,6 +27,13 @@
|
|||
|
||||
#endif
|
||||
|
||||
#define LOG(verbose, ...) \
|
||||
do { \
|
||||
if (verbose) { \
|
||||
fprintf(stderr, __VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
#include <string.h>
|
||||
|
||||
#define CUDA_LOOKUP_SIZE 6
|
||||
#define CUDA_LOOKUP_SIZE 12
|
||||
|
||||
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
|
||||
nvmlReturn_t ret;
|
||||
|
@ -23,6 +23,12 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
|
|||
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
|
||||
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
|
||||
{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
|
||||
{"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
|
||||
{"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
|
||||
{"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
|
||||
{"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
|
||||
{"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
|
||||
{"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
|
||||
};
|
||||
|
||||
resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
|
||||
|
@ -58,7 +64,13 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
|
|||
resp->err = strdup(buf);
|
||||
}
|
||||
|
||||
return;
|
||||
// Report driver version if we're in verbose mode, ignore errors
|
||||
ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
|
||||
if (ret != NVML_SUCCESS) {
|
||||
LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
|
||||
}
|
||||
}
|
||||
|
||||
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
|
||||
|
@ -98,6 +110,44 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
|
|||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
if (h.verbose) {
|
||||
nvmlBrandType_t brand = 0;
|
||||
// When in verbose mode, report more information about
|
||||
// the card we discover, but don't fail on error
|
||||
ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
|
||||
}
|
||||
ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
|
||||
}
|
||||
ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
|
||||
}
|
||||
ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
|
||||
}
|
||||
ret = (*h.nvmlDeviceGetBrand)(device, &brand);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
|
||||
}
|
||||
}
|
||||
|
||||
LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
|
||||
LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.free);
|
||||
|
||||
resp->total += memInfo.total;
|
||||
resp->free += memInfo.free;
|
||||
|
|
|
@ -15,14 +15,26 @@ typedef struct nvmlMemory_st {
|
|||
unsigned long long used;
|
||||
} nvmlMemory_t;
|
||||
|
||||
typedef enum nvmlBrandType_enum
|
||||
{
|
||||
NVML_BRAND_UNKNOWN = 0,
|
||||
} nvmlBrandType_t;
|
||||
|
||||
typedef struct cuda_handle {
|
||||
void *handle;
|
||||
uint16_t verbose;
|
||||
nvmlReturn_t (*initFn)(void);
|
||||
nvmlReturn_t (*shutdownFn)(void);
|
||||
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
|
||||
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
|
||||
nvmlReturn_t (*getCount)(unsigned int *);
|
||||
nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
|
||||
nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int length);
|
||||
nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int length);
|
||||
nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int length);
|
||||
nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int length);
|
||||
nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int length);
|
||||
nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
|
||||
} cuda_handle_t;
|
||||
|
||||
typedef struct cuda_init_resp {
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
#include <string.h>
|
||||
|
||||
#define ROCM_LOOKUP_SIZE 5
|
||||
#define ROCM_LOOKUP_SIZE 14
|
||||
|
||||
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
|
||||
rsmi_status_t ret;
|
||||
|
@ -21,7 +21,15 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
|
|||
{"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
|
||||
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
|
||||
{"rsmi_version_get", (void *)&resp->rh.versionGetFn},
|
||||
// { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
|
||||
{"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
|
||||
{"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
|
||||
{"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
|
||||
{"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
|
||||
{"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
|
||||
{"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
|
||||
{"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
|
||||
{"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
|
||||
{"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
|
||||
};
|
||||
|
||||
resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
|
||||
|
@ -62,8 +70,6 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
|
|||
|
||||
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
|
||||
resp->err = NULL;
|
||||
// uint32_t num_devices;
|
||||
// uint16_t device;
|
||||
uint64_t totalMem = 0;
|
||||
uint64_t usedMem = 0;
|
||||
rsmi_status_t ret;
|
||||
|
@ -76,34 +82,82 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
|
|||
return;
|
||||
}
|
||||
|
||||
// TODO - iterate through devices... ret =
|
||||
// rsmi_num_monitor_devices(&num_devices);
|
||||
ret = (*h.rsmi_num_monitor_devices)(&resp->count);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
snprintf(buf, buflen, "unable to get device count: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);
|
||||
|
||||
// ret = (*h.getHandle)(0, &device);
|
||||
// if (ret != RSMI_STATUS_SUCCESS) {
|
||||
// printf("rocm vram device lookup failure: %d\n", ret);
|
||||
// return -1;
|
||||
// }
|
||||
resp->total = 0;
|
||||
resp->free = 0;
|
||||
for (i = 0; i < resp->count; i++) {
|
||||
if (h.verbose) {
|
||||
// When in verbose mode, report more information about
|
||||
// the card we discover, but don't fail on error
|
||||
ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
|
||||
}
|
||||
ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
|
||||
}
|
||||
ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
|
||||
}
|
||||
ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
|
||||
}
|
||||
ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
|
||||
}
|
||||
ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
|
||||
}
|
||||
ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
|
||||
} else {
|
||||
LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
|
||||
}
|
||||
}
|
||||
|
||||
// Get total memory - used memory for available memory
|
||||
ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
|
||||
ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
|
||||
ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO: set this to the actual number of devices
|
||||
resp->count = 1;
|
||||
resp->total = totalMem;
|
||||
resp->free = totalMem - usedMem;
|
||||
return;
|
||||
LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
|
||||
LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
|
||||
resp->total += totalMem;
|
||||
resp->free += totalMem - usedMem;
|
||||
}
|
||||
}
|
||||
|
||||
void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
|
||||
|
|
|
@ -24,12 +24,21 @@ typedef enum rsmi_memory_type {
|
|||
|
||||
typedef struct rocm_handle {
|
||||
void *handle;
|
||||
uint16_t verbose;
|
||||
rsmi_status_t (*initFn)(uint64_t);
|
||||
rsmi_status_t (*shutdownFn)(void);
|
||||
rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
|
||||
rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
|
||||
rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
|
||||
// rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
|
||||
rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
|
||||
rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
|
||||
rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);
|
||||
rsmi_status_t (*rsmi_dev_brand_get) (uint32_t, char *, uint32_t);
|
||||
rsmi_status_t (*rsmi_dev_vendor_name_get) (uint32_t, char *, uint32_t);
|
||||
rsmi_status_t (*rsmi_dev_vram_vendor_get) (uint32_t, char *, uint32_t);
|
||||
rsmi_status_t (*rsmi_dev_serial_number_get) (uint32_t, char *, uint32_t);
|
||||
rsmi_status_t (*rsmi_dev_subsystem_name_get) (uint32_t, char *, uint32_t);
|
||||
rsmi_status_t (*rsmi_dev_vbios_version_get) (uint32_t, char *, uint32_t);
|
||||
} rocm_handle_t;
|
||||
|
||||
typedef struct rocm_init_resp {
|
||||
|
|
Loading…
Reference in a new issue