StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM > Struct Template Reference

StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM > Struct Template Reference

Composable Kernel: ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM > Struct Template Reference
ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM > Struct Template Reference

Stream-K tile partitioner that dynamically balances work across workgroups. More...

#include <gemm_tile_partitioner.hpp>

Public Types

using BlockGemmShape = BlockGemmShapeType

Public Member Functions

CK_TILE_HOST_DEVICE StreamKTilePartitioner () noexcept=delete
CK_TILE_HOST_DEVICE StreamKTilePartitioner (uint32_t M, uint32_t N, uint32_t K, uint32_t num_cu, uint32_t occupancy, uint32_t sk_blocks=0xffffffff) noexcept
 Construct Stream-K tile partitioner with problem dimensions.
CK_TILE_HOST auto GridSize () const noexcept -> dim3
 Calculate optimal grid size for Stream-K.
CK_TILE_DEVICE auto GetOutputTileIndex (uint32_t tile_idx) const noexcept -> tuple< uint32_t, uint32_t >
 Get output tile index for standard 2D mapping (compatibility).
CK_TILE_DEVICE void GetBlockItr (uint32_t block_idx, uint32_t &iter_start, uint32_t &iter_end) const noexcept
 Get work range for a given block ID.
CK_TILE_HOST_DEVICE uint32_t GetSkTotalIters () const noexcept
 Get total number of iterations for sk tiles.
CK_TILE_HOST_DEVICE uint32_t GetSkTiles () const noexcept
 Get total number of sk tiles.
CK_TILE_DEVICE uint32_t GetCurrentIterLength (uint32_t iter_start, uint32_t iter_end) const noexcept
 Get length of loop iterations for stream-k loop.
CK_TILE_DEVICE uint32_t GetTileIdx (uint32_t iter) const noexcept
 Get index of tile during a specified iteration.
CK_TILE_DEVICE void GetTileIdxWithOffset (uint32_t iter, uint32_t &tile_idx, uint32_t &iter_offset) const noexcept
 Get the tile index (tile_idx) and the iteration offset (iter_offset) for a specified iteration.
CK_TILE_HOST_DEVICE uint32_t GetWorkSpaceSizeForAcc (uint32_t acc_element_bytes) const noexcept
 Calculates the buffer space needed for accumulation.
CK_TILE_HOST_DEVICE uint32_t GetWorkSpaceSizeForSemaphore () const noexcept
 Calculates the buffer space needed for the semaphore.
CK_TILE_HOST_DEVICE uint32_t GetWorkSpaceSize (uint32_t acc_element_bytes) const noexcept
 Calculates the total buffer space needed for accumulation and the semaphore.
CK_TILE_HOST_DEVICE uint32_t GetTileIntersections (uint32_t tiles_, const mdiv &equiv_tiles_) const noexcept
 Get location of intersection of tiles for reduction.
CK_TILE_HOST_DEVICE uint32_t GetTilesCoverSkBlock (uint32_t num_sk_blocks_, uint32_t iters_per_sk_block_) const noexcept
 Calculate the number of tiles needed for the number of sk blocks.
CK_TILE_HOST_DEVICE uint32_t GetTotalAccBuffers () const noexcept
 Calculate the total number of accumulation buffers required for Stream-K.
CK_TILE_DEVICE uint32_t GetAccBufferOffsetFromTile (uint32_t tile_idx_) const noexcept
 Calculate offset based on tile index for big/little tiles.
CK_TILE_DEVICE uint32_t GetAccBufferOffsetFromBlock (uint32_t block_idx_) const noexcept
 Calculate offset based on block index for big/little Stream-K blocks.
CK_TILE_HOST_DEVICE uint32_t GetNumTileM () const noexcept
CK_TILE_HOST_DEVICE uint32_t GetNumTileN () const noexcept
CK_TILE_HOST_DEVICE uint32_t GetNumTileK () const noexcept

Static Public Member Functions

static CK_TILE_HOST_DEVICE auto GetLoopNum (uint32_t K) noexcept -> uint32_t
 Calculate number of loop iterations over K dimension for given work unit.

Public Attributes

uint32_t sk_num_blocks
uint32_t sk_num_big_blocks
uint32_t dp_start_block_idx
uint32_t reduction_start_block_idx
uint32_t k_iters_per_big_block
mdiv2 n_tiles
mdiv k_iters_per_tile
mdiv equiv_tiles_big
mdiv equiv_tiles_little

Static Public Attributes

static constexpr uint32_t MPerBlock = BlockGemmShape::kM
static constexpr uint32_t NPerBlock = BlockGemmShape::kN
static constexpr uint32_t KPerBlock = BlockGemmShape::kK

Detailed Description

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
struct ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >

Stream-K tile partitioner that dynamically balances work across workgroups.

This partitioner is responsible for mapping workgroups to tiles in the C tensor for the Stream-K algorithm. Stream-K decomposes the GEMM problem into smaller work units and distributes them more evenly across the available blocks, which improves load balancing, especially when the K dimension is large.

Template Parameters
BlockGemmShapeType: A class providing basic GEMM parameters.
ReductionStrategy: A StreamKReductionStrategy value that selects the reduction strategy for the results in the C tensor (defaults to StreamKReductionStrategy::Atomic).
TileSwizzleSubM: A value that defines the size of the swizzle group along the M dimension, where a swizzle group denotes consecutive tiles down a column. For instance, with a swizzle group of 8, tiles 0, 1, ..., 7 map to tiles [0,0], [1,0], ..., [7,0] in the C tensor.

Member Typedef Documentation

◆ BlockGemmShape

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
using ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::BlockGemmShape = BlockGemmShapeType

Constructor & Destructor Documentation

◆ StreamKTilePartitioner() [1/2]

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_HOST_DEVICE ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::StreamKTilePartitioner ( )
delete noexcept

◆ StreamKTilePartitioner() [2/2]

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_HOST_DEVICE ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::StreamKTilePartitioner ( uint32_t M,
uint32_t N,
uint32_t K,
uint32_t num_cu,
uint32_t occupancy,
uint32_t sk_blocks = 0xffffffff )
inline noexcept

Construct Stream-K tile partitioner with problem dimensions.

Member Function Documentation

◆ GetAccBufferOffsetFromBlock()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_DEVICE uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetAccBufferOffsetFromBlock ( uint32_t block_idx_) const
inline noexcept

Calculate offset based on block index for big/little Stream-K blocks.

◆ GetAccBufferOffsetFromTile()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_DEVICE uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetAccBufferOffsetFromTile ( uint32_t tile_idx_) const
inline noexcept

Calculate offset based on tile index for big/little tiles.

◆ GetBlockItr()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_DEVICE void ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetBlockItr ( uint32_t block_idx,
uint32_t & iter_start,
uint32_t & iter_end ) const
inline noexcept

Get work range for a given block ID.

◆ GetCurrentIterLength()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_DEVICE uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetCurrentIterLength ( uint32_t iter_start,
uint32_t iter_end ) const
inline noexcept

Get length of loop iterations for stream-k loop.

◆ GetLoopNum()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_HOST_DEVICE auto ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetLoopNum ( uint32_t K) -> uint32_t
inline static noexcept

Calculate number of loop iterations over K dimension for given work unit.

◆ GetNumTileK()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_HOST_DEVICE uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetNumTileK ( ) const
inline noexcept

◆ GetNumTileM()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_HOST_DEVICE uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetNumTileM ( ) const
inline noexcept

◆ GetNumTileN()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_HOST_DEVICE uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetNumTileN ( ) const
inline noexcept

◆ GetOutputTileIndex()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_DEVICE auto ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetOutputTileIndex ( uint32_t tile_idx) const -> tuple< uint32_t, uint32_t >
inline noexcept

Get output tile index for standard 2D mapping (compatibility).

◆ GetSkTiles()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_HOST_DEVICE uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetSkTiles ( ) const
inline noexcept

Get total number of sk tiles.

◆ GetSkTotalIters()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_HOST_DEVICE uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetSkTotalIters ( ) const
inline noexcept

Get total number of iterations for sk tiles.

◆ GetTileIdx()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_DEVICE uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetTileIdx ( uint32_t iter) const
inline noexcept

Get index of tile during a specified iteration.

◆ GetTileIdxWithOffset()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_DEVICE void ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetTileIdxWithOffset ( uint32_t iter,
uint32_t & tile_idx,
uint32_t & iter_offset ) const
inline noexcept

Get the tile index (tile_idx) and the iteration offset (iter_offset) for a specified iteration.

◆ GetTileIntersections()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_HOST_DEVICE uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetTileIntersections ( uint32_t tiles_,
const mdiv & equiv_tiles_ ) const
inline noexcept

Get location of intersection of tiles for reduction.

◆ GetTilesCoverSkBlock()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_HOST_DEVICE uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetTilesCoverSkBlock ( uint32_t num_sk_blocks_,
uint32_t iters_per_sk_block_ ) const
inline noexcept

Calculate the number of tiles needed for the number of sk blocks.

◆ GetTotalAccBuffers()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_HOST_DEVICE uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetTotalAccBuffers ( ) const
inline noexcept

Calculate the total number of accumulation buffers required for Stream-K.

◆ GetWorkSpaceSize()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_HOST_DEVICE uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetWorkSpaceSize ( uint32_t acc_element_bytes) const
inline noexcept

Calculates the total buffer space needed for accumulation and the semaphore.

◆ GetWorkSpaceSizeForAcc()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_HOST_DEVICE uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetWorkSpaceSizeForAcc ( uint32_t acc_element_bytes) const
inline noexcept

Calculates the buffer space needed for accumulation.

◆ GetWorkSpaceSizeForSemaphore()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_HOST_DEVICE uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GetWorkSpaceSizeForSemaphore ( ) const
inline noexcept

Calculates the buffer space needed for the semaphore.

◆ GridSize()

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
CK_TILE_HOST auto ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::GridSize ( ) const -> dim3
inline noexcept

Calculate optimal grid size for Stream-K.

Member Data Documentation

◆ dp_start_block_idx

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::dp_start_block_idx

◆ equiv_tiles_big

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
mdiv ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::equiv_tiles_big

◆ equiv_tiles_little

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
mdiv ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::equiv_tiles_little

◆ k_iters_per_big_block

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::k_iters_per_big_block

◆ k_iters_per_tile

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
mdiv ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::k_iters_per_tile

◆ KPerBlock

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::KPerBlock = BlockGemmShape::kK
static constexpr

◆ MPerBlock

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::MPerBlock = BlockGemmShape::kM
static constexpr

◆ n_tiles

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
mdiv2 ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::n_tiles

◆ NPerBlock

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::NPerBlock = BlockGemmShape::kN
static constexpr

◆ reduction_start_block_idx

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::reduction_start_block_idx

◆ sk_num_big_blocks

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::sk_num_big_blocks

◆ sk_num_blocks

template<typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategy = ck_tile::StreamKReductionStrategy::Atomic, uint32_t TileSwizzleSubM = 8>
uint32_t ck_tile::StreamKTilePartitioner< BlockGemmShapeType, ReductionStrategy, TileSwizzleSubM >::sk_num_blocks

The documentation for this struct was generated from the following file: