// ollama/gpu/gpu_info_cudart.h

#ifndef __APPLE__
#ifndef __GPU_INFO_CUDART_H__
#define __GPU_INFO_CUDART_H__
#include "gpu_info.h"

// Just enough typedefs to dlopen/dlsym for memory information
// Status codes produced by the cudart entry points declared in this header.
// Only the few codes listed here are mirrored; all other codes are
// deliberately left out for now.
typedef enum cudartReturn_enum {
  CUDART_SUCCESS                   = 0,
  CUDART_ERROR_INVALID_VALUE       = 1,
  CUDART_ERROR_MEMORY_ALLOCATION   = 2,
  CUDART_ERROR_INSUFFICIENT_DRIVER = 35
} cudartReturn_t;

// Attribute selectors for the `attr` argument of cudaDeviceGetAttribute.
// Enumerators are listed in ascending numeric order.
typedef enum cudartDeviceAttr_enum {
  // TODO - not yet wired up, but may be useful for Jetson or other
  // integrated GPU scenarios with shared memory.
  // NOTE(review): this enumerator uses a "cuda" prefix while its siblings
  // use "cudart".
  cudaDevAttrIntegrated               = 18,

  cudartDevAttrComputeCapabilityMajor = 75,
  cudartDevAttrComputeCapabilityMinor = 76
} cudartDeviceAttr_t;

// Opaque device handle type; nothing in this header looks inside it.
typedef void *cudartDevice_t;  // Opaque is sufficient
// Memory accounting record with three size fields.
// NOTE(review): the values are presumably byte counts - confirm against the
// code that populates this struct.
typedef struct cudartMemory_st {
  size_t total;  // overall amount
  size_t free;   // amount not in use
  size_t used;   // amount in use
} cudartMemory_t;

// Driver version split into its two integer components.
typedef struct cudartDriverVersion {
  int major;  // major component
  int minor;  // minor component
} cudartDriverVersion_t;

// 16 raw bytes of identifier; this is the type of the `uuid` member of
// cudaDeviceProp_t.
typedef struct cudaUUID {
  unsigned char bytes[16];
} cudaUUID_t;
// Local declaration of the device-properties record. The
// cudaGetDeviceProperties entry point in cudart_handle_t receives a pointer
// to one of these.
// NOTE(review): the member order and types presumably have to match the
// layout used by the loaded CUDA runtime library - do not reorder, insert, or
// remove members without confirming against the targeted runtime version.
typedef struct cudaDeviceProp {
    char         name[256];                  /**< ASCII string identifying device */
    cudaUUID_t   uuid;                       /**< 16-byte unique identifier */
    char         luid[8];                    /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */
    unsigned int luidDeviceNodeMask;         /**< LUID device node mask. Value is undefined on TCC and non-Windows platforms */
    size_t       totalGlobalMem;             /**< Global memory available on device in bytes */
    size_t       sharedMemPerBlock;          /**< Shared memory available per block in bytes */
    int          regsPerBlock;               /**< 32-bit registers available per block */
    int          warpSize;                   /**< Warp size in threads */
    size_t       memPitch;                   /**< Maximum pitch in bytes allowed by memory copies */
    int          maxThreadsPerBlock;         /**< Maximum number of threads per block */
    int          maxThreadsDim[3];           /**< Maximum size of each dimension of a block */
    int          maxGridSize[3];             /**< Maximum size of each dimension of a grid */
    int          clockRate;                  /**< Clock frequency in kilohertz */
    size_t       totalConstMem;              /**< Constant memory available on device in bytes */
    int          major;                      /**< Major compute capability */
    int          minor;                      /**< Minor compute capability */
    size_t       textureAlignment;           /**< Alignment requirement for textures */
    size_t       texturePitchAlignment;      /**< Pitch alignment requirement for texture references bound to pitched memory */
    int          deviceOverlap;              /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
    int          multiProcessorCount;        /**< Number of multiprocessors on device */
    int          kernelExecTimeoutEnabled;   /**< Specified whether there is a run time limit on kernels */
    int          integrated;                 /**< Device is integrated as opposed to discrete */
    int          canMapHostMemory;           /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
    int          computeMode;                /**< Compute mode (See ::cudaComputeMode) */
    int          maxTexture1D;               /**< Maximum 1D texture size */
    int          maxTexture1DMipmap;         /**< Maximum 1D mipmapped texture size */
    int          maxTexture1DLinear;         /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
    int          maxTexture2D[2];            /**< Maximum 2D texture dimensions */
    int          maxTexture2DMipmap[2];      /**< Maximum 2D mipmapped texture dimensions */
    int          maxTexture2DLinear[3];      /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
    int          maxTexture2DGather[2];      /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
    int          maxTexture3D[3];            /**< Maximum 3D texture dimensions */
    int          maxTexture3DAlt[3];         /**< Maximum alternate 3D texture dimensions */
    int          maxTextureCubemap;          /**< Maximum Cubemap texture dimensions */
    int          maxTexture1DLayered[2];     /**< Maximum 1D layered texture dimensions */
    int          maxTexture2DLayered[3];     /**< Maximum 2D layered texture dimensions */
    int          maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */
    int          maxSurface1D;               /**< Maximum 1D surface size */
    int          maxSurface2D[2];            /**< Maximum 2D surface dimensions */
    int          maxSurface3D[3];            /**< Maximum 3D surface dimensions */
    int          maxSurface1DLayered[2];     /**< Maximum 1D layered surface dimensions */
    int          maxSurface2DLayered[3];     /**< Maximum 2D layered surface dimensions */
    int          maxSurfaceCubemap;          /**< Maximum Cubemap surface dimensions */
    int          maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */
    size_t       surfaceAlignment;           /**< Alignment requirements for surfaces */
    int          concurrentKernels;          /**< Device can possibly execute multiple kernels concurrently */
    int          ECCEnabled;                 /**< Device has ECC support enabled */
    int          pciBusID;                   /**< PCI bus ID of the device */
    int          pciDeviceID;                /**< PCI device ID of the device */
    int          pciDomainID;                /**< PCI domain ID of the device */
    int          tccDriver;                  /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
    int          asyncEngineCount;           /**< Number of asynchronous engines */
    int          unifiedAddressing;          /**< Device shares a unified address space with the host */
    int          memoryClockRate;            /**< Peak memory clock frequency in kilohertz */
    int          memoryBusWidth;             /**< Global memory bus width in bits */
    int          l2CacheSize;                /**< Size of L2 cache in bytes */
    int          persistingL2CacheMaxSize;   /**< Device's maximum l2 persisting lines capacity setting in bytes */
    int          maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */
    int          streamPrioritiesSupported;  /**< Device supports stream priorities */
    int          globalL1CacheSupported;     /**< Device supports caching globals in L1 */
    int          localL1CacheSupported;      /**< Device supports caching locals in L1 */
    size_t       sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */
    int          regsPerMultiprocessor;      /**< 32-bit registers available per multiprocessor */
    int          managedMemory;              /**< Device supports allocating managed memory on this system */
    int          isMultiGpuBoard;            /**< Device is on a multi-GPU board */
    int          multiGpuBoardGroupID;       /**< Unique identifier for a group of devices on the same multi-GPU board */
    int          hostNativeAtomicSupported;  /**< Link between the device and the host supports native atomic operations */
    int          singleToDoublePrecisionPerfRatio; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
    int          pageableMemoryAccess;       /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
    int          concurrentManagedAccess;    /**< Device can coherently access managed memory concurrently with the CPU */
    int          computePreemptionSupported; /**< Device supports Compute Preemption */
    int          canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */
    int          cooperativeLaunch;          /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */
    int          cooperativeMultiDeviceLaunch; /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */
    size_t       sharedMemPerBlockOptin;     /**< Per device maximum shared memory per block usable by special opt in */
    int          pageableMemoryAccessUsesHostPageTables; /**< Device accesses pageable memory via the host's page tables */
    int          directManagedMemAccessFromHost; /**< Host can directly access managed memory on the device without migration. */
    int          maxBlocksPerMultiProcessor; /**< Maximum number of resident blocks per multiprocessor */
    int          accessPolicyMaxWindowSize;  /**< The maximum value of ::cudaAccessPolicyWindow::num_bytes. */
    size_t       reservedSharedMemPerBlock;  /**< Shared memory reserved by CUDA driver per block in bytes */
  } cudaDeviceProp_t;

// State for one loaded cudart library: the library handle, a verbosity
// setting, and the table of entry points used by this project. Every entry
// point reports its status as a cudartReturn_t.
typedef struct cudart_handle {
  void *handle;      // library handle (see the dlopen/dlsym note above)
  uint16_t verbose;  // verbosity setting

  // Entry-point table.
  cudartReturn_t (*cudaSetDevice)(int device);
  cudartReturn_t (*cudaDeviceSynchronize)(void);
  cudartReturn_t (*cudaDeviceReset)(void);
  cudartReturn_t (*cudaMemGetInfo)(size_t *free, size_t *total);
  cudartReturn_t (*cudaGetDeviceCount)(int *count);
  cudartReturn_t (*cudaDeviceGetAttribute)(int *value, cudartDeviceAttr_t attr,
                                           int device);
  cudartReturn_t (*cudaDriverGetVersion)(int *driverVersion);
  cudartReturn_t (*cudaGetDeviceProperties)(cudaDeviceProp_t *prop, int device);
} cudart_handle_t;

// Output record written by cudart_init.
typedef struct cudart_init_resp {
  char *err;           // when non-null, `ch` below is invalid
  cudart_handle_t ch;  // handle; only meaningful when `err` is null
  int num_devices;     // device count (NOTE(review): presumably devices found - confirm)
} cudart_init_resp_t;

// Entry points declared here and implemented elsewhere in the project.
//
// cudart_init: takes a library path and writes its outcome into `resp`; a
// non-null resp->err means resp->ch is invalid (see cudart_init_resp_t).
// NOTE(review): presumably dlopen()s the library at cudart_lib_path - confirm
// in the implementation.
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
// cudart_check_vram: takes a handle (by value) and a device index and reports
// through `resp`.
// NOTE(review): presumably fills in memory figures for that device - confirm.
void cudart_check_vram(cudart_handle_t ch, int device_id, mem_info_t *resp);
// cudart_release: takes a handle by value.
// NOTE(review): presumably unloads the library behind ch.handle - confirm.
void cudart_release(cudart_handle_t ch);

#endif  // __GPU_INFO_CUDART_H__
#endif  // __APPLE__
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`#ifndef __APPLE__`
			`#ifndef __GPU_INFO_CUDART_H__`
			`#define __GPU_INFO_CUDART_H__`
			`#include "gpu_info.h"`

			`// Just enough typedef's to dlopen/dlsym for memory information`
			`typedef enum cudartReturn_enum {`
			`CUDART_SUCCESS = 0,`
Add CUDA Driver API for GPU discovery We're seeing some corner cases with cudart which might be resolved by switching to the driver API which comes bundled with the driver package 2024-04-30 23:42:48 +00:00			`CUDART_ERROR_INVALID_VALUE = 1,`
			`CUDART_ERROR_MEMORY_ALLOCATION = 2,`
			`CUDART_ERROR_INSUFFICIENT_DRIVER = 35,`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`// Other values omitted for now...`
			`} cudartReturn_t;`

			`typedef enum cudartDeviceAttr_enum {`
			`cudartDevAttrComputeCapabilityMajor = 75,`
			`cudartDevAttrComputeCapabilityMinor = 76,`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00
			`// TODO - not yet wired up but may be useful for Jetson or other`
			`// integrated GPU scenarios with shared memory`
			`cudaDevAttrIntegrated = 18`

add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`} cudartDeviceAttr_t;`

			`typedef void *cudartDevice_t; // Opaque is sufficient`
			`typedef struct cudartMemory_st {`
			`size_t total;`
			`size_t free;`
			`size_t used;`
			`} cudartMemory_t;`

			`typedef struct cudartDriverVersion {`
			`int major;`
			`int minor;`
			`} cudartDriverVersion_t;`

Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`typedef struct cudaUUID {`
			`unsigned char bytes[16];`
			`} cudaUUID_t;`
			`typedef struct cudaDeviceProp {`
			`char name[256]; /*< ASCII string identifying device /`
			`cudaUUID_t uuid; /*< 16-byte unique identifier /`
			`char luid[8]; /*< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms /`
			`unsigned int luidDeviceNodeMask; /*< LUID device node mask. Value is undefined on TCC and non-Windows platforms /`
			`size_t totalGlobalMem; /*< Global memory available on device in bytes /`
			`size_t sharedMemPerBlock; /*< Shared memory available per block in bytes /`
			`int regsPerBlock; /*< 32-bit registers available per block /`
			`int warpSize; /*< Warp size in threads /`
			`size_t memPitch; /*< Maximum pitch in bytes allowed by memory copies /`
			`int maxThreadsPerBlock; /*< Maximum number of threads per block /`
			`int maxThreadsDim[3]; /*< Maximum size of each dimension of a block /`
			`int maxGridSize[3]; /*< Maximum size of each dimension of a grid /`
			`int clockRate; /*< Clock frequency in kilohertz /`
			`size_t totalConstMem; /*< Constant memory available on device in bytes /`
			`int major; /*< Major compute capability /`
			`int minor; /*< Minor compute capability /`
			`size_t textureAlignment; /*< Alignment requirement for textures /`
			`size_t texturePitchAlignment; /*< Pitch alignment requirement for texture references bound to pitched memory /`
			`int deviceOverlap; /*< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. /`
			`int multiProcessorCount; /*< Number of multiprocessors on device /`
			`int kernelExecTimeoutEnabled; /*< Specified whether there is a run time limit on kernels /`
			`int integrated; /*< Device is integrated as opposed to discrete /`
			`int canMapHostMemory; /*< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer /`
			`int computeMode; /*< Compute mode (See ::cudaComputeMode) /`
			`int maxTexture1D; /*< Maximum 1D texture size /`
			`int maxTexture1DMipmap; /*< Maximum 1D mipmapped texture size /`
			`int maxTexture1DLinear; /*< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. /`
			`int maxTexture2D[2]; /*< Maximum 2D texture dimensions /`
			`int maxTexture2DMipmap[2]; /*< Maximum 2D mipmapped texture dimensions /`
			`int maxTexture2DLinear[3]; /*< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory /`
			`int maxTexture2DGather[2]; /*< Maximum 2D texture dimensions if texture gather operations have to be performed /`
			`int maxTexture3D[3]; /*< Maximum 3D texture dimensions /`
			`int maxTexture3DAlt[3]; /*< Maximum alternate 3D texture dimensions /`
			`int maxTextureCubemap; /*< Maximum Cubemap texture dimensions /`
			`int maxTexture1DLayered[2]; /*< Maximum 1D layered texture dimensions /`
			`int maxTexture2DLayered[3]; /*< Maximum 2D layered texture dimensions /`
			`int maxTextureCubemapLayered[2];/*< Maximum Cubemap layered texture dimensions /`
			`int maxSurface1D; /*< Maximum 1D surface size /`
			`int maxSurface2D[2]; /*< Maximum 2D surface dimensions /`
			`int maxSurface3D[3]; /*< Maximum 3D surface dimensions /`
			`int maxSurface1DLayered[2]; /*< Maximum 1D layered surface dimensions /`
			`int maxSurface2DLayered[3]; /*< Maximum 2D layered surface dimensions /`
			`int maxSurfaceCubemap; /*< Maximum Cubemap surface dimensions /`
			`int maxSurfaceCubemapLayered[2];/*< Maximum Cubemap layered surface dimensions /`
			`size_t surfaceAlignment; /*< Alignment requirements for surfaces /`
			`int concurrentKernels; /*< Device can possibly execute multiple kernels concurrently /`
			`int ECCEnabled; /*< Device has ECC support enabled /`
			`int pciBusID; /*< PCI bus ID of the device /`
			`int pciDeviceID; /*< PCI device ID of the device /`
			`int pciDomainID; /*< PCI domain ID of the device /`
			`int tccDriver; /*< 1 if device is a Tesla device using TCC driver, 0 otherwise /`
			`int asyncEngineCount; /*< Number of asynchronous engines /`
			`int unifiedAddressing; /*< Device shares a unified address space with the host /`
			`int memoryClockRate; /*< Peak memory clock frequency in kilohertz /`
			`int memoryBusWidth; /*< Global memory bus width in bits /`
			`int l2CacheSize; /*< Size of L2 cache in bytes /`
			`int persistingL2CacheMaxSize; /*< Device's maximum l2 persisting lines capacity setting in bytes /`
			`int maxThreadsPerMultiProcessor;/*< Maximum resident threads per multiprocessor /`
			`int streamPrioritiesSupported; /*< Device supports stream priorities /`
			`int globalL1CacheSupported; /*< Device supports caching globals in L1 /`
			`int localL1CacheSupported; /*< Device supports caching locals in L1 /`
			`size_t sharedMemPerMultiprocessor; /*< Shared memory available per multiprocessor in bytes /`
			`int regsPerMultiprocessor; /*< 32-bit registers available per multiprocessor /`
			`int managedMemory; /*< Device supports allocating managed memory on this system /`
			`int isMultiGpuBoard; /*< Device is on a multi-GPU board /`
			`int multiGpuBoardGroupID; /*< Unique identifier for a group of devices on the same multi-GPU board /`
			`int hostNativeAtomicSupported; /*< Link between the device and the host supports native atomic operations /`
			`int singleToDoublePrecisionPerfRatio; /*< Ratio of single precision performance (in floating-point operations per second) to double precision performance /`
			`int pageableMemoryAccess; /*< Device supports coherently accessing pageable memory without calling cudaHostRegister on it /`
			`int concurrentManagedAccess; /*< Device can coherently access managed memory concurrently with the CPU /`
			`int computePreemptionSupported; /*< Device supports Compute Preemption /`
			`int canUseHostPointerForRegisteredMem; /*< Device can access host registered memory at the same virtual address as the CPU /`
			`int cooperativeLaunch; /*< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel /`
			`int cooperativeMultiDeviceLaunch; /*< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. /`
			`size_t sharedMemPerBlockOptin; /*< Per device maximum shared memory per block usable by special opt in /`
			`int pageableMemoryAccessUsesHostPageTables; /*< Device accesses pageable memory via the host's page tables /`
			`int directManagedMemAccessFromHost; /*< Host can directly access managed memory on the device without migration. /`
			`int maxBlocksPerMultiProcessor; /*< Maximum number of resident blocks per multiprocessor /`
			`int accessPolicyMaxWindowSize; /*< The maximum value of ::cudaAccessPolicyWindow::num_bytes. /`
			`size_t reservedSharedMemPerBlock; /*< Shared memory reserved by CUDA driver per block in bytes /`
			`} cudaDeviceProp_t;`

add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`typedef struct cudart_handle {`
			`void *handle;`
			`uint16_t verbose;`
			`cudartReturn_t (*cudaSetDevice)(int device);`
			`cudartReturn_t (*cudaDeviceSynchronize)(void);`
			`cudartReturn_t (*cudaDeviceReset)(void);`
			`cudartReturn_t (*cudaMemGetInfo)(size_t *, size_t *);`
			`cudartReturn_t (*cudaGetDeviceCount)(int *);`
			`cudartReturn_t (*cudaDeviceGetAttribute)(int* value, cudartDeviceAttr_t attr, int device);`
			`cudartReturn_t (*cudaDriverGetVersion) (int *driverVersion);`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`cudartReturn_t (*cudaGetDeviceProperties) (cudaDeviceProp_t* prop, int device);`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`} cudart_handle_t;`

			`typedef struct cudart_init_resp {`
			`char *err; // If err is non-null handle is invalid`
			`cudart_handle_t ch;`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`int num_devices;`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`} cudart_init_resp_t;`

			`void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`void cudart_check_vram(cudart_handle_t ch, int device_id, mem_info_t *resp);`
Release gpu discovery library after use Leaving the cudart library loaded kept ~30m of memory pinned in the GPU in the main process. This change ensures we don't hold GPU resources when idle. 2024-03-30 22:34:21 +00:00			`void cudart_release(cudart_handle_t ch);`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00
			`#endif // __GPU_INFO_CUDART_H__`
			`#endif // __APPLE__`