24template <
typename GridwiseGemm,
25 typename ComputePtrOffsetOfStridedBatch,
26 bool HasMainKBlockLoop,
31#if CK_USE_LAUNCH_BOUNDS
35 typename GridwiseGemm::Argument karg,
38 const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch)
40#if(defined(__gfx11__) || defined(__gfx12__))
45 (std::is_same_v<c_data_type, ck::half_t> ||
46 std::is_same_v<c_data_type, ck::bhalf_t>)))
63 constexpr index_t LDS_size = GridwiseGemm::template GetSharedMemoryNumberOfByte<
64 typename GridwiseGemm::EpilogueCShuffle>();
65 __shared__
char p_shared[LDS_size];
67 auto splitk_batch_offset =
typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
70 typename GridwiseGemm::AsGridPointer p_as_grid_shift;
74 p_as_grid_shift(i) =
static_cast<const ADataType_*
>(karg.p_as_grid[i]) +
75 splitk_batch_offset.a_k_split_offset[i] + a_batch_offset;
79 typename GridwiseGemm::BsGridPointer p_bs_grid_shift;
83 p_bs_grid_shift(i) =
static_cast<const BDataType_*
>(karg.p_bs_grid[i]) +
84 splitk_batch_offset.b_k_split_offset[i] + b_batch_offset;
87 auto epilogue_args =
typename GridwiseGemm::EpilogueCShuffle{};
89 GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
93 karg.p_e_grid + splitk_batch_offset.c_reduce_offset + c_batch_offset,
100#if defined(__gfx11__)
105 ignore = compute_ptr_offset_of_batch;
205template <
typename ALayout,
211 typename AccDataType,
212 typename CShuffleDataType,
213 typename AElementwiseOperation,
214 typename BElementwiseOperation,
215 typename CElementwiseOperation,
227 typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
228 typename ABlockTransferThreadClusterArrangeOrder,
229 typename ABlockTransferSrcAccessOrder,
230 index_t ABlockTransferSrcVectorDim,
231 index_t ABlockTransferSrcScalarPerVector,
232 index_t ABlockTransferDstScalarPerVector_AK1,
233 bool ABlockLdsExtraM,
234 typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
235 typename BBlockTransferThreadClusterArrangeOrder,
236 typename BBlockTransferSrcAccessOrder,
237 index_t BBlockTransferSrcVectorDim,
238 index_t BBlockTransferSrcScalarPerVector,
239 index_t BBlockTransferDstScalarPerVector_BK1,
240 bool BBlockLdsExtraN,
241 index_t CShuffleMRepeatPerShuffle,
242 index_t CShuffleNRepeatPerShuffle,
243 typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
244 index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
247 typename ComputeTypeA = CDataType,
248 typename ComputeTypeB = ComputeTypeA,
249 bool PermuteA =
false,
250 bool PermuteB =
false>
257 AElementwiseOperation,
258 BElementwiseOperation,
259 CElementwiseOperation>
263 static_assert(PermuteA ==
false,
264 "Permute A functionality not supported by DeviceBatchedGemm operations.\n");
265 static_assert(PermuteB ==
false,
266 "Permute B functionality not supported by DeviceBatchedGemm operations.\n");
273 : BatchStrideA_(BatchStrideA), BatchStrideB_(BatchStrideB), BatchStrideC_(BatchStrideC)
279 return g_idx *
static_cast<long_index_t>(BatchStrideA_);
284 return g_idx *
static_cast<long_index_t>(BatchStrideB_);
289 return g_idx *
static_cast<long_index_t>(BatchStrideC_);
310 AElementwiseOperation,
311 BElementwiseOperation,
312 CElementwiseOperation,
324 ABlockTransferThreadClusterLengths_AK0_M_AK1,
325 ABlockTransferThreadClusterArrangeOrder,
326 ABlockTransferSrcAccessOrder,
327 ABlockTransferSrcVectorDim,
328 ABlockTransferSrcScalarPerVector,
329 ABlockTransferDstScalarPerVector_AK1,
332 BBlockTransferThreadClusterLengths_BK0_N_BK1,
333 BBlockTransferThreadClusterArrangeOrder,
334 BBlockTransferSrcAccessOrder,
335 BBlockTransferSrcVectorDim,
336 BBlockTransferSrcScalarPerVector,
337 BBlockTransferDstScalarPerVector_BK1,
340 CShuffleMRepeatPerShuffle,
341 CShuffleNRepeatPerShuffle,
342 CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
355 const BDataType* p_b_grid_,
356 CDataType* p_c_grid_,
368 AElementwiseOperation a_element_op_,
369 BElementwiseOperation b_element_op_,
370 CElementwiseOperation cde_element_op_,
371 bool is_reduce_ =
false)
373 std::array<const void*, 1>{p_b_grid_},
374 std::array<const void*, 0>{},
379 std::array<index_t, 1>{StrideA_},
380 std::array<index_t, 1>{StrideB_},
381 std::array<index_t, 0>{},
415 if(stream_config.log_level_ > 0)
418 GridwiseGemm::BlockwiseGemmPipe::HotLoopInstList::Print();
423 throw std::runtime_error(
"wrong! GridwiseGemm has invalid setting");
439 index_t k_grain = arg.KBatch * KPerBlock;
440 index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock;
444 const auto Run = [&](
const auto& kernel) {
445 if(stream_config.flush_cache)
450 arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideAs, arg_.AK0);
452 arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideBs, arg_.BK0);
459 std::array<std::size_t, 1> size_as_buffers;
460 size_as_buffers[0] = a_grid_desc_ak0_m_ak1[
Number<0>{}].GetElementSpaceSize() *
463 std::array<std::size_t, 1> size_bs_buffers;
464 size_bs_buffers[0] = b_grid_desc_bk0_n_bk1[
Number<0>{}].GetElementSpaceSize() *
472 stream_config.rotating_count,
475 std::array<std::size_t, 0>{});
476 rotating_mem.Print();
478 auto run_flush_cache = [&]() {
490 hipMemsetAsync(arg_.p_e_grid,
492 arg_.Batch * arg_.M * arg_.N *
sizeof(CDataType),
493 stream_config.stream_id_));
504 arg_.compute_ptr_offset_of_batch);
508 auto clear_workspace = [&]() {
516 hipMemsetAsync(arg.p_e_grid,
518 arg.
Batch * arg.M * arg.N *
sizeof(CDataType),
519 stream_config.stream_id_));
534 constexpr index_t minimum_occupancy = []() {
541 return (MPerBlock * NPerBlock / BlockSize <= 128) ? 2 : 1;
549 if(has_main_k_block_loop)
559 ComputePtrOffsetOfStridedBatch,
590 ComputePtrOffsetOfStridedBatch,
616 return Run(*
dynamic_cast<const Argument*
>(p_arg), stream_config);
633 if constexpr(std::is_same_v<CDataType, ck::half_t> ||
634 std::is_same_v<CDataType, ck::bhalf_t>)
643 if constexpr(std::is_same_v<ComputeTypeA, f8_t> || std::is_same_v<ComputeTypeA, bf8_t> ||
644 std::is_same_v<ComputeTypeB, f8_t> || std::is_same_v<ComputeTypeB, bf8_t>)
676 const BDataType* p_b,
688 AElementwiseOperation,
689 BElementwiseOperation,
690 CElementwiseOperation)
706 AElementwiseOperation{},
707 BElementwiseOperation{},
708 CElementwiseOperation{}};
727 AElementwiseOperation,
728 BElementwiseOperation,
729 CElementwiseOperation)
override
731 return std::make_unique<Argument>(
static_cast<const ADataType*
>(p_a),
732 static_cast<const BDataType*
>(p_b),
733 static_cast<CDataType*
>(p_c),
745 AElementwiseOperation{},
746 BElementwiseOperation{},
747 CElementwiseOperation{});
753 return std::make_unique<Invoker>(
Invoker{});
759 auto str = std::stringstream();
761 std::map<BlockGemmPipelineScheduler, std::string> BlkGemmPipelineSchedulerToString{
765 std::map<BlockGemmPipelineVersion, std::string> BlkGemmPipelineVersionToString{
773 str <<
"DeviceBatchedGemm_Wmma_CShuffleV3"
776 << std::string(ALayout::name)[0]
777 << std::string(BLayout::name)[0]
778 << std::string(CLayout::name)[0]
783 << MPerBlock <<
"x" << NPerBlock <<
"x" << KPerBlock <<
", "
785 << MPerWmma <<
"x"<<NPerWmma <<
", "
787 << MRepeat <<
"x" << NRepeat <<
", "
789 << ABlockTransferSrcScalarPerVector <<
"x" << BBlockTransferSrcScalarPerVector <<
", "
790 <<
"BlkGemmPipelineScheduler: "
791 << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] <<
", "
792 <<
"BlkGemmPipelineVersion: "
793 << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] <<
", "
794 <<
"BlkGemmPipelinePrefetchStages: "
795 << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages <<
", "
#define CK_MAX_THREAD_PER_BLOCK
Definition ck.hpp:30
#define REGISTER_EXTRA_PRINTING_METHODS
Definition device_base.hpp:47
#define HIP_CHECK_ERROR(retval_or_funcall)
Definition host_utility/hip_check_error.hpp:21
Definition convolution_backward_data_specialization.hpp:8
std::string getGemmSpecializationString(const GemmSpecialization &s)
Definition gemm_specialization.hpp:32
GemmSpecialization
Definition gemm_specialization.hpp:11
@ MKPadding
Definition gemm_specialization.hpp:18
@ KPadding
Definition gemm_specialization.hpp:16
@ MNKPadding
Definition gemm_specialization.hpp:20
@ NKPadding
Definition gemm_specialization.hpp:19
__global__ void kernel_batched_gemm_wmma_cshuffle_v3(typename GridwiseGemm::Argument karg, const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch)
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:34
Definition convolution_backward_data_specialization.hpp:7
void flush_icache()
Definition flush_cache.hpp:383
float launch_and_time_kernel_with_preprocess(const StreamConfig &stream_config, PreProcessFunc preprocess, F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, GemmArgs &gemm_args, Args... args)
Definition flush_cache.hpp:398
int32_t index_t
Definition ck.hpp:299
InMemoryDataOperationEnum
Definition ck.hpp:277
@ Set
Definition ck.hpp:278
@ AtomicAdd
Definition ck.hpp:279
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
BlockGemmPipelineVersion
Definition blkgemmpipe_scheduler.hpp:12
@ v2
Definition blkgemmpipe_scheduler.hpp:15
@ v3
Definition blkgemmpipe_scheduler.hpp:16
@ v5
Definition blkgemmpipe_scheduler.hpp:18
@ v4
Definition blkgemmpipe_scheduler.hpp:17
@ v1
Definition blkgemmpipe_scheduler.hpp:14
integral_constant< index_t, N > Number
Definition number.hpp:12
TailNumber
Definition blkgemmpipe_scheduler.hpp:31
@ Full
Definition blkgemmpipe_scheduler.hpp:49
typename tuple_element< I, TTuple >::type tuple_element_t
Definition utility/tuple.hpp:208
constexpr detail::ignore_t ignore
Definition utility/ignore.hpp:20
__device__ uint32_t amd_wave_read_first_lane(uint32_t value)
Definition amd_wave_read_first_lane.hpp:100
bool is_gfx12_supported()
Definition host_utility/device_prop.hpp:55
BlockGemmPipelineScheduler
Definition blkgemmpipe_scheduler.hpp:25
@ Intrawave
Definition blkgemmpipe_scheduler.hpp:26
@ Interwave
Definition blkgemmpipe_scheduler.hpp:27
typename remove_reference< T >::type remove_reference_t
Definition type.hpp:292
int64_t long_index_t
Definition ck.hpp:300
typename remove_pointer< T >::type remove_pointer_t
Definition type.hpp:300
bool is_gfx11_supported()
Definition host_utility/device_prop.hpp:60
Definition ck/stream_config.hpp:10
static __host__ constexpr bool CheckValidity(const Argument &karg)
Definition gridwise_gemm_wmma_cshuffle_v3_common.hpp:624
static __host__ auto CalculateGridSize(index_t M, index_t N, index_t KBatch)
Definition gridwise_gemm_wmma_cshuffle_v3_common.hpp:273
static constexpr index_t KPack
Definition gridwise_gemm_wmma_cshuffle_v3_common.hpp:154
static __host__ constexpr bool CalculateHasMainKBlockLoop(index_t K)
Definition gridwise_gemm_wmma_cshuffle_v3_common.hpp:837
"Universal" GEMM kernel with SplitK support.
Definition gridwise_gemm_wmma_cshuffle_v3.hpp:233
ck::GridwiseGemm_wmma_cshuffle_v3< ALayout, BLayout, Tuple<>, CLayout, Tuple< ADataType >, Tuple< BDataType >, AccDataType, CShuffleDataType, Tuple<>, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, Sequence< CShuffleBlockTransferScalarPerVector_NPerBlock >, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, false, false >::MakeBsGridDescriptor_BK0_N_BK1 __host__ static __device__ auto MakeBsGridDescriptor_BK0_N_BK1(const index_t K, const index_t KPad, const index_t N, const index_t NPad, const std::array< index_t, NumBTensor > &StrideBs, const index_t BK0)
ck::GridwiseGemm_wmma_cshuffle_v3< ALayout, BLayout, Tuple<>, CLayout, Tuple< ADataType >, Tuple< BDataType >, AccDataType, CShuffleDataType, Tuple<>, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, Sequence< CShuffleBlockTransferScalarPerVector_NPerBlock >, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, false, false >::MakeAsGridDescriptor_AK0_M_AK1 __host__ static __device__ auto MakeAsGridDescriptor_AK0_M_AK1(const index_t M, const index_t MPad, const index_t K, const index_t KPad, const std::array< index_t, NumATensor > &StrideAs, const index_t AK0)
ck::GridwiseGemm_wmma_cshuffle_v3< ALayout, BLayout, Tuple<>, CLayout, Tuple< ADataType >, Tuple< BDataType >, AccDataType, CShuffleDataType, Tuple<>, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, Sequence< CShuffleBlockTransferScalarPerVector_NPerBlock >, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, false, false >::APackedSize static constexpr index_t APackedSize
ck::GridwiseGemm_wmma_cshuffle_v3< ALayout, BLayout, Tuple<>, CLayout, Tuple< ADataType >, Tuple< BDataType >, AccDataType, CShuffleDataType, Tuple<>, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, Sequence< CShuffleBlockTransferScalarPerVector_NPerBlock >, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, false, false >::BPackedSize static constexpr index_t BPackedSize
Definition utility/sequence.hpp:43
Definition utility/tuple.hpp:186
Definition utility/tuple.hpp:117
Definition functional2.hpp:33
Definition device_base.hpp:197
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:353
ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:394
index_t Batch
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:393
__host__ Argument(const ADataType *p_a_grid_, const BDataType *p_b_grid_, CDataType *p_c_grid_, index_t M_, index_t N_, index_t K_, index_t StrideA_, index_t StrideB_, index_t StrideC_, index_t BatchStrideA_, index_t BatchStrideB_, index_t BatchStrideC_, index_t Batch_, index_t k_batch_, AElementwiseOperation a_element_op_, BElementwiseOperation b_element_op_, CElementwiseOperation cde_element_op_, bool is_reduce_=false)
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:354
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:269
ComputePtrOffsetOfStridedBatch(index_t BatchStrideA, index_t BatchStrideB, index_t BatchStrideC)
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:270
__host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:277
__host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:282
__host__ __device__ constexpr long_index_t GetCPtrOffset(index_t g_idx) const
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:287
Helper structure responsible for kernel invocation.
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:407
float Run(const BaseArgument *p_arg, const StreamConfig &stream_config=StreamConfig{}) override
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:613
float Run(const Argument &arg, const StreamConfig &stream_config=StreamConfig{})
This function issues GPU kernel execution.
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:413
"Universal" Batched GEMM operation without SplitK support.
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:260
std::unique_ptr< BaseInvoker > MakeInvokerPointer() override
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:751
static constexpr bool IsValidCompilationParameter()
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:620
static auto MakeInvoker()
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:711
bool IsSupportedArgument(const BaseArgument *p_arg) override
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:664
std::unique_ptr< BaseArgument > MakeArgumentPointer(const void *p_a, const void *p_b, void *p_c, index_t M, index_t N, index_t K, index_t StrideA, index_t StrideB, index_t StrideC, index_t BatchStrideA, index_t BatchStrideB, index_t BatchStrideC, index_t Batch, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation) override
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:714
std::string GetTypeString() const override
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:757
static auto MakeArgument(const ADataType *p_a, const BDataType *p_b, CDataType *p_c, index_t M, index_t N, index_t K, index_t StrideA, index_t StrideB, index_t StrideC, index_t BatchStrideA, index_t BatchStrideB, index_t BatchStrideC, index_t Batch, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation)
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:675
static bool IsSupportedArgument(const Argument &arg)
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:626
GridwiseGemm_wmma_cshuffle_v3< ALayout, BLayout, Tuple<>, CLayout, Tuple< ADataType >, Tuple< BDataType >, AccDataType, CShuffleDataType, Tuple<>, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerWmma, NPerWmma, MRepeat, NRepeat, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, Sequence< CShuffleBlockTransferScalarPerVector_NPerBlock >, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, false, false > GridwiseGemm
Definition device_batched_gemm_wmma_cshuffle_v3.hpp:299
Definition device_batched_gemm.hpp:25
Definition flush_cache.hpp:21