34b9db5afc
This change adds support for multiple concurrent requests, as well as loading multiple models, by spawning multiple runners. The defaults are currently 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS.
147 lines
9.9 KiB
C
// CUDA runtime (cudart) discovery header — not built on macOS, where
// there is no NVIDIA CUDA support.
#ifndef __APPLE__
#ifndef __GPU_INFO_CUDART_H__
#define __GPU_INFO_CUDART_H__

#include "gpu_info.h"

// Just enough typedef's to dlopen/dlsym for memory information
// Subset of the cudart error codes — only the values this wrapper
// distinguishes. Must match the numeric values in the CUDA runtime API.
typedef enum cudartReturn_enum {
  CUDART_SUCCESS = 0,
  CUDA_ERROR_INVALID_VALUE = 1,
  CUDA_ERROR_MEMORY_ALLOCATION = 2,
  CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
  // Other values omitted for now...
} cudartReturn_t;
// Device attribute IDs passed to cudaDeviceGetAttribute. Values must
// match the CUDA runtime's cudaDeviceAttr enum.
typedef enum cudartDeviceAttr_enum {
  cudartDevAttrComputeCapabilityMajor = 75,
  cudartDevAttrComputeCapabilityMinor = 76,

  // TODO - not yet wired up but may be useful for Jetson or other
  // integrated GPU scenarios with shared memory
  // (note: name keeps the runtime's cudaDevAttr prefix rather than the
  // local cudartDevAttr convention)
  cudaDevAttrIntegrated = 18

} cudartDeviceAttr_t;
typedef void *cudartDevice_t; // Opaque is sufficient
// Device memory figures in bytes (total/free/used).
typedef struct cudartMemory_st {
  size_t total;
  size_t free;
  size_t used;
} cudartMemory_t;
// Driver version split into major/minor components — presumably decoded
// from the packed int returned by cudaDriverGetVersion; confirm in the
// implementation.
typedef struct cudartDriverVersion {
  int major;
  int minor;
} cudartDriverVersion_t;
// 16-byte device UUID, embedded in cudaDeviceProp below.
typedef struct cudaUUID {
  unsigned char bytes[16];
} cudaUUID_t;
// Mirror of the CUDA runtime's cudaDeviceProp, filled in by the
// dlsym'd cudaGetDeviceProperties (see cudart_handle below). Field
// order, types, and sizes must match the runtime's definition exactly
// or the returned data will be misinterpreted.
typedef struct cudaDeviceProp {
  char name[256];                  /**< ASCII string identifying device */
  cudaUUID_t uuid;                 /**< 16-byte unique identifier */
  char luid[8];                    /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */
  unsigned int luidDeviceNodeMask; /**< LUID device node mask. Value is undefined on TCC and non-Windows platforms */
  size_t totalGlobalMem;           /**< Global memory available on device in bytes */
  size_t sharedMemPerBlock;        /**< Shared memory available per block in bytes */
  int regsPerBlock;                /**< 32-bit registers available per block */
  int warpSize;                    /**< Warp size in threads */
  size_t memPitch;                 /**< Maximum pitch in bytes allowed by memory copies */
  int maxThreadsPerBlock;          /**< Maximum number of threads per block */
  int maxThreadsDim[3];            /**< Maximum size of each dimension of a block */
  int maxGridSize[3];              /**< Maximum size of each dimension of a grid */
  int clockRate;                   /**< Clock frequency in kilohertz */
  size_t totalConstMem;            /**< Constant memory available on device in bytes */
  int major;                       /**< Major compute capability */
  int minor;                       /**< Minor compute capability */
  size_t textureAlignment;         /**< Alignment requirement for textures */
  size_t texturePitchAlignment;    /**< Pitch alignment requirement for texture references bound to pitched memory */
  int deviceOverlap;               /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
  int multiProcessorCount;         /**< Number of multiprocessors on device */
  int kernelExecTimeoutEnabled;    /**< Specified whether there is a run time limit on kernels */
  int integrated;                  /**< Device is integrated as opposed to discrete */
  int canMapHostMemory;            /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
  int computeMode;                 /**< Compute mode (See ::cudaComputeMode) */
  int maxTexture1D;                /**< Maximum 1D texture size */
  int maxTexture1DMipmap;          /**< Maximum 1D mipmapped texture size */
  int maxTexture1DLinear;          /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
  int maxTexture2D[2];             /**< Maximum 2D texture dimensions */
  int maxTexture2DMipmap[2];       /**< Maximum 2D mipmapped texture dimensions */
  int maxTexture2DLinear[3];       /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
  int maxTexture2DGather[2];       /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
  int maxTexture3D[3];             /**< Maximum 3D texture dimensions */
  int maxTexture3DAlt[3];          /**< Maximum alternate 3D texture dimensions */
  int maxTextureCubemap;           /**< Maximum Cubemap texture dimensions */
  int maxTexture1DLayered[2];      /**< Maximum 1D layered texture dimensions */
  int maxTexture2DLayered[3];      /**< Maximum 2D layered texture dimensions */
  int maxTextureCubemapLayered[2]; /**< Maximum Cubemap layered texture dimensions */
  int maxSurface1D;                /**< Maximum 1D surface size */
  int maxSurface2D[2];             /**< Maximum 2D surface dimensions */
  int maxSurface3D[3];             /**< Maximum 3D surface dimensions */
  int maxSurface1DLayered[2];      /**< Maximum 1D layered surface dimensions */
  int maxSurface2DLayered[3];      /**< Maximum 2D layered surface dimensions */
  int maxSurfaceCubemap;           /**< Maximum Cubemap surface dimensions */
  int maxSurfaceCubemapLayered[2]; /**< Maximum Cubemap layered surface dimensions */
  size_t surfaceAlignment;         /**< Alignment requirements for surfaces */
  int concurrentKernels;           /**< Device can possibly execute multiple kernels concurrently */
  int ECCEnabled;                  /**< Device has ECC support enabled */
  int pciBusID;                    /**< PCI bus ID of the device */
  int pciDeviceID;                 /**< PCI device ID of the device */
  int pciDomainID;                 /**< PCI domain ID of the device */
  int tccDriver;                   /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
  int asyncEngineCount;            /**< Number of asynchronous engines */
  int unifiedAddressing;           /**< Device shares a unified address space with the host */
  int memoryClockRate;             /**< Peak memory clock frequency in kilohertz */
  int memoryBusWidth;              /**< Global memory bus width in bits */
  int l2CacheSize;                 /**< Size of L2 cache in bytes */
  int persistingL2CacheMaxSize;    /**< Device's maximum l2 persisting lines capacity setting in bytes */
  int maxThreadsPerMultiProcessor; /**< Maximum resident threads per multiprocessor */
  int streamPrioritiesSupported;   /**< Device supports stream priorities */
  int globalL1CacheSupported;      /**< Device supports caching globals in L1 */
  int localL1CacheSupported;       /**< Device supports caching locals in L1 */
  size_t sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */
  int regsPerMultiprocessor;       /**< 32-bit registers available per multiprocessor */
  int managedMemory;               /**< Device supports allocating managed memory on this system */
  int isMultiGpuBoard;             /**< Device is on a multi-GPU board */
  int multiGpuBoardGroupID;        /**< Unique identifier for a group of devices on the same multi-GPU board */
  int hostNativeAtomicSupported;   /**< Link between the device and the host supports native atomic operations */
  int singleToDoublePrecisionPerfRatio; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
  int pageableMemoryAccess;        /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
  int concurrentManagedAccess;     /**< Device can coherently access managed memory concurrently with the CPU */
  int computePreemptionSupported;  /**< Device supports Compute Preemption */
  int canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */
  int cooperativeLaunch;           /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */
  int cooperativeMultiDeviceLaunch; /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */
  size_t sharedMemPerBlockOptin;   /**< Per device maximum shared memory per block usable by special opt in */
  int pageableMemoryAccessUsesHostPageTables; /**< Device accesses pageable memory via the host's page tables */
  int directManagedMemAccessFromHost; /**< Host can directly access managed memory on the device without migration. */
  int maxBlocksPerMultiProcessor;  /**< Maximum number of resident blocks per multiprocessor */
  int accessPolicyMaxWindowSize;   /**< The maximum value of ::cudaAccessPolicyWindow::num_bytes. */
  size_t reservedSharedMemPerBlock; /**< Shared memory reserved by CUDA driver per block in bytes */
} cudaDeviceProp_t;
// Handle for a dlopen'd CUDA runtime library: the raw library handle
// plus the entry points resolved from it via dlsym.
typedef struct cudart_handle {
  void *handle;     // dlopen handle for the runtime shared library
  uint16_t verbose; // verbosity flag — presumably non-zero enables extra logging; confirm in the .c file
  cudartReturn_t (*cudaSetDevice)(int device);
  cudartReturn_t (*cudaDeviceSynchronize)(void);
  cudartReturn_t (*cudaDeviceReset)(void);
  cudartReturn_t (*cudaMemGetInfo)(size_t *, size_t *); // (free, total) in bytes per the CUDA runtime API
  cudartReturn_t (*cudaGetDeviceCount)(int *);
  cudartReturn_t (*cudaDeviceGetAttribute)(int* value, cudartDeviceAttr_t attr, int device);
  cudartReturn_t (*cudaDriverGetVersion) (int *driverVersion);
  cudartReturn_t (*cudaGetDeviceProperties) (cudaDeviceProp_t* prop, int device);
} cudart_handle_t;
// Result of cudart_init: either a populated handle and device count,
// or an error string.
typedef struct cudart_init_resp {
  char *err; // If err is non-null handle is invalid
  cudart_handle_t ch;
  int num_devices;
} cudart_init_resp_t;
// Loads the CUDA runtime library at cudart_lib_path and resolves the
// entry points in cudart_handle_t. On failure resp->err is set
// (ownership of the error string is not shown here — presumably the
// caller frees it; confirm in the implementation).
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);

// Queries memory information for device_id into resp.
void cudart_check_vram(cudart_handle_t ch, int device_id, mem_info_t *resp);

// Releases the dlopen'd library held by ch.
void cudart_release(cudart_handle_t ch);

#endif // __GPU_INFO_CUDART_H__
#endif // __APPLE__