// 2024-03-25 15:07:44 +00:00
# ifndef __APPLE__
# ifndef __GPU_INFO_CUDART_H__
# define __GPU_INFO_CUDART_H__
# include "gpu_info.h"
// Just enough typedefs to dlopen/dlsym for memory information
// Status codes returned by the CUDA runtime entry points declared in this
// header. Only a subset of the runtime's codes is named here; the runtime may
// return values that have no enumerator in this list.
typedef enum cudartReturn_enum {
  CUDART_SUCCESS = 0,
  CUDART_ERROR_INVALID_VALUE = 1,
  CUDART_ERROR_MEMORY_ALLOCATION = 2,
  CUDART_ERROR_INSUFFICIENT_DRIVER = 35,
  // Other values omitted for now...
} cudartReturn_t;
// Attribute selectors for the `attr` argument of the cudaDeviceGetAttribute
// entry point. The numeric values are the ones the CUDA runtime expects and
// must not be renumbered.
typedef enum cudartDeviceAttr_enum {
  cudartDevAttrComputeCapabilityMajor = 75,
  cudartDevAttrComputeCapabilityMinor = 76,
  // TODO - not yet wired up but may be useful for Jetson or other
  // integrated GPU scenarios with shared memory
  // NOTE(review): this enumerator uses a `cuda` prefix while its siblings use
  // `cudart`; the name is kept as-is so existing references keep compiling.
  cudaDevAttrIntegrated = 18
} cudartDeviceAttr_t;
// Device handle type. It is treated as opaque in this header, so a plain
// void pointer is sufficient.
typedef void *cudartDevice_t;
// Memory accounting record with total, free and used figures.
// NOTE(review): units are not stated here - presumably bytes; confirm
// against the code that fills this in.
struct cudartMemory_st {
  size_t total;
  size_t free;
  size_t used;
};
typedef struct cudartMemory_st cudartMemory_t;
// Driver version split into its major and minor components.
struct cudartDriverVersion {
  int major;
  int minor;
};
typedef struct cudartDriverVersion cudartDriverVersion_t;
// 2024-03-30 16:50:05 +00:00
// 16-byte identifier, stored as raw bytes. Used as the `uuid` member of
// cudaDeviceProp_t.
struct cudaUUID {
  unsigned char bytes[16];
};
typedef struct cudaUUID cudaUUID_t;
// Device properties structure. A pointer to it is passed to the
// cudaGetDeviceProperties entry point declared in cudart_handle_t.
// NOTE(review): the field order and types presumably have to match the
// cudaDeviceProp layout of the CUDA runtime library that is loaded at run
// time - confirm which CUDA version this copy mirrors before adding,
// removing or reordering any member.
typedef struct cudaDeviceProp {
char name [ 256 ] ; /**< ASCII string identifying device */
cudaUUID_t uuid ; /**< 16-byte unique identifier */
char luid [ 8 ] ; /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */
unsigned int luidDeviceNodeMask ; /**< LUID device node mask. Value is undefined on TCC and non-Windows platforms */
size_t totalGlobalMem ; /**< Global memory available on device in bytes */
size_t sharedMemPerBlock ; /**< Shared memory available per block in bytes */
int regsPerBlock ; /**< 32-bit registers available per block */
int warpSize ; /**< Warp size in threads */
size_t memPitch ; /**< Maximum pitch in bytes allowed by memory copies */
int maxThreadsPerBlock ; /**< Maximum number of threads per block */
int maxThreadsDim [ 3 ] ; /**< Maximum size of each dimension of a block */
int maxGridSize [ 3 ] ; /**< Maximum size of each dimension of a grid */
int clockRate ; /**< Clock frequency in kilohertz */
size_t totalConstMem ; /**< Constant memory available on device in bytes */
int major ; /**< Major compute capability */
int minor ; /**< Minor compute capability */
size_t textureAlignment ; /**< Alignment requirement for textures */
size_t texturePitchAlignment ; /**< Pitch alignment requirement for texture references bound to pitched memory */
int deviceOverlap ; /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
int multiProcessorCount ; /**< Number of multiprocessors on device */
int kernelExecTimeoutEnabled ; /**< Specifies whether there is a run time limit on kernels */
int integrated ; /**< Device is integrated as opposed to discrete */
int canMapHostMemory ; /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
int computeMode ; /**< Compute mode (See ::cudaComputeMode) */
int maxTexture1D ; /**< Maximum 1D texture size */
int maxTexture1DMipmap ; /**< Maximum 1D mipmapped texture size */
int maxTexture1DLinear ; /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
int maxTexture2D [ 2 ] ; /**< Maximum 2D texture dimensions */
int maxTexture2DMipmap [ 2 ] ; /**< Maximum 2D mipmapped texture dimensions */
int maxTexture2DLinear [ 3 ] ; /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
int maxTexture2DGather [ 2 ] ; /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
int maxTexture3D [ 3 ] ; /**< Maximum 3D texture dimensions */
int maxTexture3DAlt [ 3 ] ; /**< Maximum alternate 3D texture dimensions */
int maxTextureCubemap ; /**< Maximum Cubemap texture dimensions */
int maxTexture1DLayered [ 2 ] ; /**< Maximum 1D layered texture dimensions */
int maxTexture2DLayered [ 3 ] ; /**< Maximum 2D layered texture dimensions */
int maxTextureCubemapLayered [ 2 ] ; /**< Maximum Cubemap layered texture dimensions */
int maxSurface1D ; /**< Maximum 1D surface size */
int maxSurface2D [ 2 ] ; /**< Maximum 2D surface dimensions */
int maxSurface3D [ 3 ] ; /**< Maximum 3D surface dimensions */
int maxSurface1DLayered [ 2 ] ; /**< Maximum 1D layered surface dimensions */
int maxSurface2DLayered [ 3 ] ; /**< Maximum 2D layered surface dimensions */
int maxSurfaceCubemap ; /**< Maximum Cubemap surface dimensions */
int maxSurfaceCubemapLayered [ 2 ] ; /**< Maximum Cubemap layered surface dimensions */
size_t surfaceAlignment ; /**< Alignment requirements for surfaces */
int concurrentKernels ; /**< Device can possibly execute multiple kernels concurrently */
int ECCEnabled ; /**< Device has ECC support enabled */
int pciBusID ; /**< PCI bus ID of the device */
int pciDeviceID ; /**< PCI device ID of the device */
int pciDomainID ; /**< PCI domain ID of the device */
int tccDriver ; /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
int asyncEngineCount ; /**< Number of asynchronous engines */
int unifiedAddressing ; /**< Device shares a unified address space with the host */
int memoryClockRate ; /**< Peak memory clock frequency in kilohertz */
int memoryBusWidth ; /**< Global memory bus width in bits */
int l2CacheSize ; /**< Size of L2 cache in bytes */
int persistingL2CacheMaxSize ; /**< Device's maximum l2 persisting lines capacity setting in bytes */
int maxThreadsPerMultiProcessor ; /**< Maximum resident threads per multiprocessor */
int streamPrioritiesSupported ; /**< Device supports stream priorities */
int globalL1CacheSupported ; /**< Device supports caching globals in L1 */
int localL1CacheSupported ; /**< Device supports caching locals in L1 */
size_t sharedMemPerMultiprocessor ; /**< Shared memory available per multiprocessor in bytes */
int regsPerMultiprocessor ; /**< 32-bit registers available per multiprocessor */
int managedMemory ; /**< Device supports allocating managed memory on this system */
int isMultiGpuBoard ; /**< Device is on a multi-GPU board */
int multiGpuBoardGroupID ; /**< Unique identifier for a group of devices on the same multi-GPU board */
int hostNativeAtomicSupported ; /**< Link between the device and the host supports native atomic operations */
int singleToDoublePrecisionPerfRatio ; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
int pageableMemoryAccess ; /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
int concurrentManagedAccess ; /**< Device can coherently access managed memory concurrently with the CPU */
int computePreemptionSupported ; /**< Device supports Compute Preemption */
int canUseHostPointerForRegisteredMem ; /**< Device can access host registered memory at the same virtual address as the CPU */
int cooperativeLaunch ; /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */
int cooperativeMultiDeviceLaunch ; /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */
size_t sharedMemPerBlockOptin ; /**< Per device maximum shared memory per block usable by special opt in */
int pageableMemoryAccessUsesHostPageTables ; /**< Device accesses pageable memory via the host's page tables */
int directManagedMemAccessFromHost ; /**< Host can directly access managed memory on the device without migration. */
int maxBlocksPerMultiProcessor ; /**< Maximum number of resident blocks per multiprocessor */
int accessPolicyMaxWindowSize ; /**< The maximum value of ::cudaAccessPolicyWindow::num_bytes. */
size_t reservedSharedMemPerBlock ; /**< Shared memory reserved by CUDA driver per block in bytes */
} cudaDeviceProp_t ;
// 2024-03-25 15:07:44 +00:00
// Handle bundling a loaded CUDA runtime library with the entry points that
// are called through it. Each function pointer member is named after the
// CUDA runtime symbol it is expected to hold.
typedef struct cudart_handle {
  // Library handle. NOTE(review): presumably the value returned by
  // dlopen/LoadLibrary - confirm against cudart_init's implementation.
  void *handle;
  // Verbosity setting. NOTE(review): exact meaning of the values is not
  // visible in this header.
  uint16_t verbose;
  cudartReturn_t (*cudaSetDevice)(int device);
  cudartReturn_t (*cudaDeviceSynchronize)(void);
  cudartReturn_t (*cudaDeviceReset)(void);
  // Per the CUDA Runtime API the two outputs are (free, total), in that order.
  cudartReturn_t (*cudaMemGetInfo)(size_t *, size_t *);
  cudartReturn_t (*cudaGetDeviceCount)(int *);
  cudartReturn_t (*cudaDeviceGetAttribute)(int *value, cudartDeviceAttr_t attr, int device);
  cudartReturn_t (*cudaDriverGetVersion)(int *driverVersion);
  cudartReturn_t (*cudaGetDeviceProperties)(cudaDeviceProp_t *prop, int device);
} cudart_handle_t;
// Response structure written by cudart_init.
typedef struct cudart_init_resp {
  char *err;  // If err is non-null handle is invalid
  cudart_handle_t ch;
  // NOTE(review): presumably the number of CUDA devices found during
  // initialization - confirm against cudart_init's implementation.
  int num_devices;
} cudart_init_resp_t;
void cudart_init ( char * cudart_lib_path , cudart_init_resp_t * resp ) ;
2024-05-15 22:13:16 +00:00
void cudart_bootstrap ( cudart_handle_t ch , int device_id , mem_info_t * resp ) ;
// TODO - if we keep this library longer term, add cudart_get_free
2024-03-30 22:34:21 +00:00
void cudart_release ( cudart_handle_t ch ) ;
// 2024-03-25 15:07:44 +00:00
# endif // __GPU_INFO_CUDART_H__
# endif // __APPLE__