#include <grouped_gemm_kernel.hpp>
|
| CK_TILE_DEVICE void | Run (const UniversalGemmKernelArgs< 1, 1, NumDTensor_ > &kargs, const tuple< index_t, index_t > &block_idx_2d, const index_t block_idx_z) const |
| CK_TILE_DEVICE index_t | FindGroupId (const GemmTransKernelArg< NumDTensor_ > *gemm_desc_ptr, index_t block_id, index_t group_count) const |
| template<bool U = UsePersistentKernel, typename = std::enable_if_t<!U>> |
| CK_TILE_DEVICE void | operator() (const void CK_CONSTANT_ADDRESS_SPACE *gemm_descs_const, index_t group_count) const |
| template<bool U = UsePersistentKernel, typename = std::enable_if_t<U>, typename = void> |
| CK_TILE_DEVICE void | operator() (const void CK_CONSTANT_ADDRESS_SPACE *gemm_descs_const, const index_t group_count) const |
|
| static CK_TILE_HOST const std::string | GetName () |
| static CK_TILE_HOST auto | GetWorkSpaceSize (const std::vector< GroupedGemmHostArgs<> > &gemm_descs) -> std::size_t |
| static CK_TILE_HOST auto | GetWorkSpaceSize (index_t group_count) -> std::size_t |
| static CK_TILE_HOST auto | BlockSize () -> dim3 |
| static CK_TILE_HOST auto | MaxOccupancyGridSize (const stream_config &s) -> dim3 |
| | Get the maximum occupancy grid size for the persistent kernel on the current device.
|
| static CK_TILE_HOST auto | GridSize (const std::vector< GroupedGemmHostArgs< NumDTensor_ > > &gemm_descs) |
| static CK_TILE_HOST auto | MakeKargs (const std::vector< GroupedGemmHostArgs< NumDTensor_ > > &gemm_descs) -> std::vector< GemmTransKernelArg< NumDTensor_ > > |
| static CK_TILE_HOST bool | IsSupportedArgument (const std::vector< GemmTransKernelArg< NumDTensor_ > > &kargs) |
| static CK_TILE_HOST_DEVICE constexpr auto | GetSmemSize () -> index_t |
| static CK_TILE_DEVICE void | RunGemmWithPipelineSelection (const ADataType *a_ptr, const BDataType *b_ptr, const std::array< const void *, NumDTensor_ > &ds_ptr, CDataType *c_ptr, void *smem_ptr_0, const UniversalGemmKernelArgs< 1, 1, NumDTensor_ > &kargs, const typename Base::SplitKBatchOffset &splitk_batch_offset, const index_t block_idx_m, const index_t block_idx_n) |
| | Runs single GEMM problem cooperatively by whole workgroup.
|
| static CK_TILE_DEVICE void | RunGemmWithPipelineSelection2LDS (const ADataType *a_ptr, const BDataType *b_ptr, CDataType *c_ptr, const std::array< const void *, NumDTensor_ > &ds_ptr, void *__restrict__ smem_ptr_0, void *__restrict__ smem_ptr_1, const UniversalGemmKernelArgs< 1, 1, NumDTensor_ > &kargs, const typename Base::SplitKBatchOffset &splitk_batch_offset, const index_t block_idx_m, const index_t block_idx_n) |
| | Runs single GEMM problem cooperatively by whole workgroup.
|
◆ ADataType
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
Specify the data type configurations for A, B, C/E.
◆ ALayout
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ Base
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ BDataType
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ BLayout
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ CDataType
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ CLayout
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ DsDataType
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ EpiloguePipeline
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ GemmPipeline
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ Kernel
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ OffsetTile1DPartitioner
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
ALayout and ADataType are expected to be scalars, not a tuple.
BLayout and BDataType are expected to be scalars, not a tuple.
C/CLayout and C/EDataType are expected to be scalars, not a tuple.
◆ TilePartitioner
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ BlockSize()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ FindGroupId()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ GetName()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ GetSmemSize()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ GetWorkSpaceSize() [1/2]
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ GetWorkSpaceSize() [2/2]
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ GridSize()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ IsSupportedArgument()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ MakeKargs()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ MaxOccupancyGridSize()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
Get the maximum occupancy grid size for the persistent kernel on the current device.
- Returns
- The maximum occupancy grid size.
- Note
- This function queries the maximum occupancy of the kernel using hipOccupancyMaxActiveBlocksPerMultiprocessor.
◆ operator()() [1/2]
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
template<bool U = UsePersistentKernel, typename = std::enable_if_t<U>, typename = void>
◆ operator()() [2/2]
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
template<bool U = UsePersistentKernel, typename = std::enable_if_t<!U>>
◆ Run()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ RunGemmWithPipelineSelection()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| CK_TILE_DEVICE void ck_tile::GroupedGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::RunGemmWithPipelineSelection |
( |
const ADataType * | a_ptr, |
|
|
const BDataType * | b_ptr, |
|
|
const std::array< const void *, NumDTensor_ > & | ds_ptr, |
|
|
CDataType * | c_ptr, |
|
|
void * | smem_ptr_0, |
|
|
const UniversalGemmKernelArgs< 1, 1, NumDTensor_ > & | kargs, |
|
|
const typename Base::SplitKBatchOffset & | splitk_batch_offset, |
|
|
const index_t | block_idx_m, |
|
|
const index_t | block_idx_n ) |
|
inline static |
Runs single GEMM problem cooperatively by whole workgroup.
- Note
- The GEMM pipeline is selected in-kernel based on the number of K-loops and the tail-number. This is needed for the persistent tile-loop when we don't have access to the K dimension on the host.
- Parameters
-
| a_ptr | input A pointer |
| b_ptr | input B pointer |
| c_ptr | output C pointer |
| smem_ptr_0 | The start memory pointer of the shared memory block. |
| kargs | GEMM kernel arguments |
| splitk_batch_offset | Utility structure used to calculate k batch. |
| block_idx_m | The GEMM's output M dimension tile index processed by this workgroup. |
| block_idx_n | The GEMM's output N dimension tile index processed by this workgroup. |
◆ RunGemmWithPipelineSelection2LDS()
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| CK_TILE_DEVICE void ck_tile::GroupedGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::RunGemmWithPipelineSelection2LDS |
( |
const ADataType * | a_ptr, |
|
|
const BDataType * | b_ptr, |
|
|
CDataType * | c_ptr, |
|
|
const std::array< const void *, NumDTensor_ > & | ds_ptr, |
|
|
void *__restrict__ | smem_ptr_0, |
|
|
void *__restrict__ | smem_ptr_1, |
|
|
const UniversalGemmKernelArgs< 1, 1, NumDTensor_ > & | kargs, |
|
|
const typename Base::SplitKBatchOffset & | splitk_batch_offset, |
|
|
const index_t | block_idx_m, |
|
|
const index_t | block_idx_n ) |
|
inline static |
Runs single GEMM problem cooperatively by whole workgroup.
- Note
- The GEMM pipeline is selected in-kernel based on the number of K-loops and the tail-number. This is needed for the persistent tile-loop when we don't have access to the K dimension on the host.
- Parameters
-
| a_ptr | input A pointer |
| b_ptr | input B pointer |
| c_ptr | output C pointer |
| smem_ptr_0 | The start memory pointer of the shared memory block. |
| smem_ptr_1 | The second start memory pointer of the shared memory block. |
| kargs | GEMM kernel arguments |
| splitk_batch_offset | Utility structure used to calculate k batch. |
| block_idx_m | The GEMM's output M dimension tile index processed by this workgroup. |
| block_idx_n | The GEMM's output N dimension tile index processed by this workgroup. |
◆ kBlockSize
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ NumDTensor_
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
◆ UsePersistentKernel
template<typename TilePartitioner_, typename GemmPipeline_, typename EpiloguePipeline_>
| bool ck_tile::GroupedGemmKernel< TilePartitioner_, GemmPipeline_, EpiloguePipeline_ >::UsePersistentKernel = GemmPipeline::UsePersistentKernel |
|
static constexpr |
The documentation for this struct was generated from the following file: