/** * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file * * MIT License * * Copyright (c) 2023-2024 The ggml authors * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include #include #include #include #define CUBLAS_COMPUTE_16F CUDA_R_16F #define CUBLAS_COMPUTE_32F CUDA_R_32F #define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F #define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT #define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT #define CUBLAS_OP_N MUBLAS_OP_N #define CUBLAS_OP_T MUBLAS_OP_T #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS #define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT #define CUDA_R_16F MUSA_R_16F #define CUDA_R_32F MUSA_R_32F #define cublasComputeType_t cudaDataType_t #define cublasCreate mublasCreate #define cublasDestroy mublasDestroy #define cublasGemmEx mublasGemmEx #define cublasGemmBatchedEx mublasGemmBatchedEx #define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx #define cublasHandle_t mublasHandle_t #define cublasSetMathMode mublasSetMathMode #define cublasSetStream mublasSetStream #define cublasSgemm mublasSgemm #define cublasStatus_t mublasStatus_t #define cublasGetStatusString mublasStatus_to_string #define cudaDataType_t musaDataType_t #define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer #define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess #define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess #define cudaDeviceProp musaDeviceProp #define cudaDeviceSynchronize musaDeviceSynchronize #define cudaError_t musaError_t #define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled #define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled #define cudaEventCreateWithFlags musaEventCreateWithFlags #define cudaEventDisableTiming musaEventDisableTiming #define cudaEventRecord musaEventRecord #define cudaEventSynchronize musaEventSynchronize #define cudaEvent_t musaEvent_t #define cudaEventDestroy musaEventDestroy #define cudaFree musaFree #define cudaFreeHost musaFreeHost #define cudaGetDevice musaGetDevice #define cudaGetDeviceCount musaGetDeviceCount #define cudaGetDeviceProperties musaGetDeviceProperties #define cudaGetErrorString musaGetErrorString #define cudaGetLastError musaGetLastError #define cudaHostRegister musaHostRegister #define cudaHostRegisterPortable musaHostRegisterPortable #define cudaHostRegisterReadOnly musaHostRegisterReadOnly #define cudaHostUnregister musaHostUnregister #define cudaLaunchHostFunc musaLaunchHostFunc #define cudaMalloc musaMalloc #define cudaMallocHost musaMallocHost #define cudaMemcpy musaMemcpy #define cudaMemcpyAsync musaMemcpyAsync #define cudaMemcpyPeerAsync musaMemcpyPeerAsync #define cudaMemcpy2DAsync musaMemcpy2DAsync #define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice #define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost #define cudaMemcpyHostToDevice musaMemcpyHostToDevice #define cudaMemcpyKind musaMemcpyKind #define cudaMemset musaMemset #define cudaMemsetAsync musaMemsetAsync #define cudaMemGetInfo musaMemGetInfo #define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize #define cudaSetDevice musaSetDevice #define cudaStreamCreateWithFlags musaStreamCreateWithFlags #define cudaStreamDestroy musaStreamDestroy #define cudaStreamFireAndForget musaStreamFireAndForget #define cudaStreamNonBlocking musaStreamNonBlocking #define cudaStreamPerThread musaStreamPerThread #define cudaStreamSynchronize musaStreamSynchronize #define cudaStreamWaitEvent musaStreamWaitEvent #define cudaStream_t musaStream_t #define cudaSuccess musaSuccess // Additional mappings for MUSA virtual memory pool #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED #define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE #define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED #define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED #define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE #define CUdevice MUdevice #define CUdeviceptr MUdeviceptr #define CUmemAccessDesc MUmemAccessDesc #define CUmemAllocationProp MUmemAllocationProp #define CUmemGenericAllocationHandle MUmemGenericAllocationHandle #define cuDeviceGet muDeviceGet #define cuDeviceGetAttribute muDeviceGetAttribute #define cuMemAddressFree muMemAddressFree #define cuMemAddressReserve muMemAddressReserve #define cuMemCreate muMemCreate #define cuMemGetAllocationGranularity muMemGetAllocationGranularity #define cuMemMap muMemMap #define cuMemRelease muMemRelease #define cuMemSetAccess muMemSetAccess #define cuMemUnmap muMemUnmap #define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize #define cudaFuncSetAttribute musaFuncSetAttribute #define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms #define make_cudaExtent make_musaExtent #define make_cudaPitchedPtr make_musaPitchedPtr // Additional mappings for MUSA graphs #define CUDA_SUCCESS MUSA_SUCCESS #define CUresult MUresult #define cuGetErrorString muGetErrorString #define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure #define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction #define cudaGraphDestroy musaGraphDestroy #define cudaGraphExecDestroy musaGraphExecDestroy #define cudaGraphExec_t musaGraphExec_t #define cudaGraphExecUpdate musaGraphExecUpdate #define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult #define cudaGraphGetNodes musaGraphGetNodes #define cudaGraphInstantiate musaGraphInstantiate #define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams #define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams #define cudaGraphLaunch musaGraphLaunch #define cudaGraphNodeGetType musaGraphNodeGetType #define cudaGraphNode_t musaGraphNode_t #define cudaGraphNodeType musaGraphNodeType #define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel #define cudaGraph_t musaGraph_t #define cudaKernelNodeParams musaKernelNodeParams #define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed #define cudaStreamEndCapture musaStreamEndCapture // XXX: Clang builtins mapping #define __vsub4 __vsub4_musa #define __vcmpeq4 __vcmpeq4_musa #define __vcmpne4 __vcmpne4_musa #ifndef __has_builtin #define __has_builtin(x) 0 #endif typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) { return __vsubss4(a, b); } static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) { const uint8x4_t& va = reinterpret_cast(a); const uint8x4_t& vb = reinterpret_cast(b); unsigned int c; uint8x4_t& vc = reinterpret_cast(c); #pragma unroll for (int i = 0; i < 4; ++i) { vc[i] = va[i] == vb[i] ? 0xff : 0x00; } return c; } static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) { const uint8x4_t& va = reinterpret_cast(a); const uint8x4_t& vb = reinterpret_cast(b); unsigned int c; uint8x4_t& vc = reinterpret_cast(c); #pragma unroll for (int i = 0; i < 4; ++i) { vc[i] = va[i] == vb[i] ? 0x00 : 0xff; } return c; }