Merge pull request #2162 from dhiltgen/rocm_real_gpus
Report more information about GPUs in verbose mode
This commit is contained in:
commit
f63dc2db5c
6 changed files with 171 additions and 30 deletions
|
@ -259,6 +259,7 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
|
||||||
|
|
||||||
func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
|
func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
|
||||||
var resp C.cuda_init_resp_t
|
var resp C.cuda_init_resp_t
|
||||||
|
resp.ch.verbose = getVerboseState()
|
||||||
for _, libPath := range cudaLibPaths {
|
for _, libPath := range cudaLibPaths {
|
||||||
lib := C.CString(libPath)
|
lib := C.CString(libPath)
|
||||||
defer C.free(unsafe.Pointer(lib))
|
defer C.free(unsafe.Pointer(lib))
|
||||||
|
@ -275,6 +276,7 @@ func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
|
||||||
|
|
||||||
func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
|
func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
|
||||||
var resp C.rocm_init_resp_t
|
var resp C.rocm_init_resp_t
|
||||||
|
resp.rh.verbose = getVerboseState()
|
||||||
for _, libPath := range rocmLibPaths {
|
for _, libPath := range rocmLibPaths {
|
||||||
lib := C.CString(libPath)
|
lib := C.CString(libPath)
|
||||||
defer C.free(unsafe.Pointer(lib))
|
defer C.free(unsafe.Pointer(lib))
|
||||||
|
@ -288,3 +290,10 @@ func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getVerboseState() C.uint16_t {
|
||||||
|
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
|
||||||
|
return C.uint16_t(1)
|
||||||
|
}
|
||||||
|
return C.uint16_t(0)
|
||||||
|
}
|
||||||
|
|
|
@ -27,6 +27,13 @@
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// LOG(verbose, fmt, ...): write a printf-style message to stderr, but only
// when `verbose` is non-zero. The format arguments are not evaluated when
// `verbose` is zero. The do/while(0) wrapper makes the macro usable as a
// single statement (e.g. in an unbraced if/else).
#define LOG(verbose, ...)             \
  do {                                \
    if (verbose)                      \
      fprintf(stderr, __VA_ARGS__);   \
  } while (0)
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#define CUDA_LOOKUP_SIZE 6
|
#define CUDA_LOOKUP_SIZE 12
|
||||||
|
|
||||||
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
|
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
|
||||||
nvmlReturn_t ret;
|
nvmlReturn_t ret;
|
||||||
|
@ -23,6 +23,12 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
|
||||||
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
|
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
|
||||||
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
|
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
|
||||||
{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
|
{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
|
||||||
|
{"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
|
||||||
|
{"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
|
||||||
|
{"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
|
||||||
|
{"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
|
||||||
|
{"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
|
||||||
|
{"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
|
||||||
};
|
};
|
||||||
|
|
||||||
resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
|
resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
|
||||||
|
@ -58,7 +64,13 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
|
||||||
resp->err = strdup(buf);
|
resp->err = strdup(buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
return;
|
// Report driver version if we're in verbose mode, ignore errors
|
||||||
|
ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
|
||||||
|
if (ret != NVML_SUCCESS) {
|
||||||
|
LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
|
||||||
|
} else {
|
||||||
|
LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
|
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
|
||||||
|
@ -98,6 +110,44 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
|
||||||
resp->err = strdup(buf);
|
resp->err = strdup(buf);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (h.verbose) {
|
||||||
|
nvmlBrandType_t brand = 0;
|
||||||
|
// When in verbose mode, report more information about
|
||||||
|
// the card we discover, but don't fail on error
|
||||||
|
ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
|
||||||
|
if (ret != RSMI_STATUS_SUCCESS) {
|
||||||
|
LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
|
||||||
|
} else {
|
||||||
|
LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
|
||||||
|
}
|
||||||
|
ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
|
||||||
|
if (ret != RSMI_STATUS_SUCCESS) {
|
||||||
|
LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
|
||||||
|
} else {
|
||||||
|
LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
|
||||||
|
}
|
||||||
|
ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
|
||||||
|
if (ret != RSMI_STATUS_SUCCESS) {
|
||||||
|
LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
|
||||||
|
} else {
|
||||||
|
LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
|
||||||
|
}
|
||||||
|
ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
|
||||||
|
if (ret != RSMI_STATUS_SUCCESS) {
|
||||||
|
LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
|
||||||
|
} else {
|
||||||
|
LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
|
||||||
|
}
|
||||||
|
ret = (*h.nvmlDeviceGetBrand)(device, &brand);
|
||||||
|
if (ret != RSMI_STATUS_SUCCESS) {
|
||||||
|
LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
|
||||||
|
} else {
|
||||||
|
LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
|
||||||
|
LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.free);
|
||||||
|
|
||||||
resp->total += memInfo.total;
|
resp->total += memInfo.total;
|
||||||
resp->free += memInfo.free;
|
resp->free += memInfo.free;
|
||||||
|
|
|
@ -15,14 +15,26 @@ typedef struct nvmlMemory_st {
|
||||||
unsigned long long used;
|
unsigned long long used;
|
||||||
} nvmlMemory_t;
|
} nvmlMemory_t;
|
||||||
|
|
||||||
|
// Local stand-in for the NVML brand enumeration. Only the "unknown" value
// is declared here; it is used as the type of the out-parameter of the
// nvmlDeviceGetBrand function pointer and the result is logged numerically.
typedef enum nvmlBrandType_enum {
  NVML_BRAND_UNKNOWN = 0,
} nvmlBrandType_t;
|
||||||
|
|
||||||
typedef struct cuda_handle {
|
typedef struct cuda_handle {
|
||||||
void *handle;
|
void *handle;
|
||||||
|
uint16_t verbose;
|
||||||
nvmlReturn_t (*initFn)(void);
|
nvmlReturn_t (*initFn)(void);
|
||||||
nvmlReturn_t (*shutdownFn)(void);
|
nvmlReturn_t (*shutdownFn)(void);
|
||||||
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
|
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
|
||||||
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
|
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
|
||||||
nvmlReturn_t (*getCount)(unsigned int *);
|
nvmlReturn_t (*getCount)(unsigned int *);
|
||||||
nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
|
nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
|
||||||
|
nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int length);
|
||||||
|
nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int length);
|
||||||
|
nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int length);
|
||||||
|
nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int length);
|
||||||
|
nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int length);
|
||||||
|
nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
|
||||||
} cuda_handle_t;
|
} cuda_handle_t;
|
||||||
|
|
||||||
typedef struct cuda_init_resp {
|
typedef struct cuda_init_resp {
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#define ROCM_LOOKUP_SIZE 5
|
#define ROCM_LOOKUP_SIZE 14
|
||||||
|
|
||||||
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
|
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
|
||||||
rsmi_status_t ret;
|
rsmi_status_t ret;
|
||||||
|
@ -21,7 +21,15 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
|
||||||
{"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
|
{"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
|
||||||
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
|
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
|
||||||
{"rsmi_version_get", (void *)&resp->rh.versionGetFn},
|
{"rsmi_version_get", (void *)&resp->rh.versionGetFn},
|
||||||
// { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
|
{"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
|
||||||
|
{"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
|
||||||
|
{"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
|
||||||
|
{"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
|
||||||
|
{"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
|
||||||
|
{"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
|
||||||
|
{"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
|
||||||
|
{"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
|
||||||
|
{"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
|
||||||
};
|
};
|
||||||
|
|
||||||
resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
|
resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
|
||||||
|
@ -62,8 +70,6 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
|
||||||
|
|
||||||
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
|
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
|
||||||
resp->err = NULL;
|
resp->err = NULL;
|
||||||
// uint32_t num_devices;
|
|
||||||
// uint16_t device;
|
|
||||||
uint64_t totalMem = 0;
|
uint64_t totalMem = 0;
|
||||||
uint64_t usedMem = 0;
|
uint64_t usedMem = 0;
|
||||||
rsmi_status_t ret;
|
rsmi_status_t ret;
|
||||||
|
@ -76,34 +82,82 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO - iterate through devices... ret =
|
ret = (*h.rsmi_num_monitor_devices)(&resp->count);
|
||||||
// rsmi_num_monitor_devices(&num_devices);
|
if (ret != RSMI_STATUS_SUCCESS) {
|
||||||
|
snprintf(buf, buflen, "unable to get device count: %d", ret);
|
||||||
|
resp->err = strdup(buf);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);
|
||||||
|
|
||||||
// ret = (*h.getHandle)(0, &device);
|
resp->total = 0;
|
||||||
// if (ret != RSMI_STATUS_SUCCESS) {
|
resp->free = 0;
|
||||||
// printf("rocm vram device lookup failure: %d\n", ret);
|
for (i = 0; i < resp->count; i++) {
|
||||||
// return -1;
|
if (h.verbose) {
|
||||||
// }
|
// When in verbose mode, report more information about
|
||||||
|
// the card we discover, but don't fail on error
|
||||||
|
ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
|
||||||
|
if (ret != RSMI_STATUS_SUCCESS) {
|
||||||
|
LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
|
||||||
|
} else {
|
||||||
|
LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
|
||||||
|
}
|
||||||
|
ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
|
||||||
|
if (ret != RSMI_STATUS_SUCCESS) {
|
||||||
|
LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
|
||||||
|
} else {
|
||||||
|
LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
|
||||||
|
}
|
||||||
|
ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
|
||||||
|
if (ret != RSMI_STATUS_SUCCESS) {
|
||||||
|
LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
|
||||||
|
} else {
|
||||||
|
LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
|
||||||
|
}
|
||||||
|
ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
|
||||||
|
if (ret != RSMI_STATUS_SUCCESS) {
|
||||||
|
LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
|
||||||
|
} else {
|
||||||
|
LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
|
||||||
|
}
|
||||||
|
ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
|
||||||
|
if (ret != RSMI_STATUS_SUCCESS) {
|
||||||
|
LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
|
||||||
|
} else {
|
||||||
|
LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
|
||||||
|
}
|
||||||
|
ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
|
||||||
|
if (ret != RSMI_STATUS_SUCCESS) {
|
||||||
|
LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
|
||||||
|
} else {
|
||||||
|
LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
|
||||||
|
}
|
||||||
|
ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
|
||||||
|
if (ret != RSMI_STATUS_SUCCESS) {
|
||||||
|
LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
|
||||||
|
} else {
|
||||||
|
LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Get total memory - used memory for available memory
|
// Get total memory - used memory for available memory
|
||||||
ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
|
ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
|
||||||
if (ret != RSMI_STATUS_SUCCESS) {
|
if (ret != RSMI_STATUS_SUCCESS) {
|
||||||
snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
|
snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
|
||||||
resp->err = strdup(buf);
|
resp->err = strdup(buf);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
|
ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
|
||||||
if (ret != RSMI_STATUS_SUCCESS) {
|
if (ret != RSMI_STATUS_SUCCESS) {
|
||||||
snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
|
snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
|
||||||
resp->err = strdup(buf);
|
resp->err = strdup(buf);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
|
||||||
// TODO: set this to the actual number of devices
|
LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
|
||||||
resp->count = 1;
|
resp->total += totalMem;
|
||||||
resp->total = totalMem;
|
resp->free += totalMem - usedMem;
|
||||||
resp->free = totalMem - usedMem;
|
}
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
|
void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
|
||||||
|
|
|
@ -24,12 +24,21 @@ typedef enum rsmi_memory_type {
|
||||||
|
|
||||||
typedef struct rocm_handle {
|
typedef struct rocm_handle {
|
||||||
void *handle;
|
void *handle;
|
||||||
|
uint16_t verbose;
|
||||||
rsmi_status_t (*initFn)(uint64_t);
|
rsmi_status_t (*initFn)(uint64_t);
|
||||||
rsmi_status_t (*shutdownFn)(void);
|
rsmi_status_t (*shutdownFn)(void);
|
||||||
rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
|
rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
|
||||||
rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
|
rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
|
||||||
rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
|
rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
|
||||||
// rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
|
rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
|
||||||
|
rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
|
||||||
|
rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);
|
||||||
|
rsmi_status_t (*rsmi_dev_brand_get) (uint32_t, char *, uint32_t);
|
||||||
|
rsmi_status_t (*rsmi_dev_vendor_name_get) (uint32_t, char *, uint32_t);
|
||||||
|
rsmi_status_t (*rsmi_dev_vram_vendor_get) (uint32_t, char *, uint32_t);
|
||||||
|
rsmi_status_t (*rsmi_dev_serial_number_get) (uint32_t, char *, uint32_t);
|
||||||
|
rsmi_status_t (*rsmi_dev_subsystem_name_get) (uint32_t, char *, uint32_t);
|
||||||
|
rsmi_status_t (*rsmi_dev_vbios_version_get) (uint32_t, char *, uint32_t);
|
||||||
} rocm_handle_t;
|
} rocm_handle_t;
|
||||||
|
|
||||||
typedef struct rocm_init_resp {
|
typedef struct rocm_init_resp {
|
||||||
|
|
Loading…
Reference in a new issue