BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride > Struct Template Reference

BlockwiseGemmXdlops_v2&lt; BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride &gt; Struct Template Reference#

Composable Kernel: ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride > Struct Template Reference
ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride > Struct Template Reference

Blockwise gemm. More...

#include <blockwise_gemm_xdlops.hpp>

Public Types

using ThisThreadBlock = ThisThreadBlock<BlockSize>
using Tuple4 = decltype(CalculateAThreadOriginDataIndex())

Public Member Functions

__host__ __device__ constexpr auto & GetCThreadBuffer ()
__host__ __device__ BlockwiseGemmXdlops_v2 (Tuple4 a_origin=CalculateAThreadOriginDataIndex(), Tuple4 b_origin=CalculateBThreadOriginDataIndex())
template<typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
__device__ void Run (const ABlockBuffer &a_block_buf, const BBlockBuffer &b_block_buf, CThreadBuffer &c_thread_buf) const

Static Public Member Functions

static __device__ auto GetWaveIdx ()
static __device__ auto CalculateAThreadOriginDataIndex ()
static __device__ auto CalculateBThreadOriginDataIndex ()
template<index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
static __device__ auto CalculateCThreadOriginDataIndex (Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
template<index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
static __device__ auto CalculateCThreadOriginDataIndex8D (Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
__host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4 ()
__host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 ()
__host__ static __device__ constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 ()
__host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4 ()
__host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 ()
__host__ static __device__ constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 ()
template<typename CGridDesc_M_N>
__host__ static __device__ constexpr auto MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 (const CGridDesc_M_N &c_grid_desc_m_n)
template<typename CGridDesc_G_M_N>
__host__ static __device__ constexpr auto MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 (const CGridDesc_G_M_N &c_grid_desc_g_m_n)

Public Attributes

StaticBufferTupleOfVector< AddressSpaceEnum::Vgpr, FloatAcc, MRepeat *NRepeat, xdlops_gemm.GetRegSizePerXdlops(), true > c_thread_buf_

Static Public Attributes

static constexpr auto I0 = Number<0>{}
static constexpr auto I1 = Number<1>{}
static constexpr auto I2 = Number<2>{}
static constexpr auto I3 = Number<3>{}
static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL)
static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL)
static constexpr index_t WaveSize = BlockSize / MWaves / NWaves
static constexpr index_t A_K0 = ATileDesc{}.GetLength(I0)
static constexpr index_t B_K0 = BTileDesc{}.GetLength(I0)
static constexpr index_t A_K1 = ATileDesc{}.GetLength(I2)
static constexpr index_t B_K1 = BTileDesc{}.GetLength(I2)
static constexpr auto xdlops_gemm
static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops
static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k
static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_k

Protected Types

using AThreadCopy
using BThreadCopy

Protected Attributes

AThreadCopy a_thread_copy_
BThreadCopy b_thread_copy_

Static Protected Attributes

static constexpr auto a_thread_desc_
static constexpr auto b_thread_desc_
static constexpr auto c_thread_desc_

Detailed Description

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
struct ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >

Blockwise gemm.

Supports

  1. regular XDL output M2_M3_M4_M2 and transposed XDL output M2_N2_N3_N4
  2. decoupled input tile descriptor and mma tile descriptor in order to support both vgpr and LDS source buffer
  3. configurable k index starting position and step size after each FMA/XDL instruction

Member Typedef Documentation

◆ AThreadCopy

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
using ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::AThreadCopy
protected
Initial value:
FloatAB,
decltype(a_thread_desc_),
3,
static constexpr auto a_block_desc_m0_m1_m2_k
Definition blockwise_gemm_dpp.hpp:254
static constexpr auto a_thread_desc_
Definition blockwise_gemm_dpp.hpp:312
static constexpr index_t A_K1
Definition blockwise_gemm_dpp.hpp:52
Definition utility/sequence.hpp:43
Definition threadwise_tensor_slice_transfer.hpp:1260

◆ BThreadCopy

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
using ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::BThreadCopy
protected
Initial value:
FloatAB,
decltype(b_thread_desc_),
3,
static constexpr index_t B_K1
Definition blockwise_gemm_dpp.hpp:53
static constexpr auto b_thread_desc_
Definition blockwise_gemm_dpp.hpp:316
static constexpr auto b_block_desc_n0_n1_n2_k
Definition blockwise_gemm_dpp.hpp:255

◆ ThisThreadBlock

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
using ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::ThisThreadBlock = ThisThreadBlock<BlockSize>

◆ Tuple4

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
using ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::Tuple4 = decltype(CalculateAThreadOriginDataIndex())

Constructor & Destructor Documentation

◆ BlockwiseGemmXdlops_v2()

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
__host__ __device__ ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::BlockwiseGemmXdlops_v2 ( Tuple4 a_origin = CalculateAThreadOriginDataIndex(),
Tuple4 b_origin = CalculateBThreadOriginDataIndex() )
inline

Member Function Documentation

◆ CalculateAThreadOriginDataIndex()

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
__device__ auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::CalculateAThreadOriginDataIndex ( )
inlinestatic

◆ CalculateBThreadOriginDataIndex()

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
__device__ auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::CalculateBThreadOriginDataIndex ( )
inlinestatic

◆ CalculateCThreadOriginDataIndex()

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
template<index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
__device__ auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::CalculateCThreadOriginDataIndex ( Number< m0 > ,
Number< n0 > ,
Number< xdlops_i > ,
Number< blk_i >  )
inlinestatic

◆ CalculateCThreadOriginDataIndex8D()

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
template<index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
__device__ auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::CalculateCThreadOriginDataIndex8D ( Number< m0 > ,
Number< n0 > ,
Number< xdlops_i > ,
Number< blk_i >  )
inlinestatic

◆ GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
__host__ static __device__ constexpr auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 ( )
inlinestaticconstexpr

◆ GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
__host__ static __device__ constexpr auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 ( )
inlinestaticconstexpr

◆ GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
__host__ static __device__ constexpr auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4 ( )
inlinestaticconstexpr

◆ GetCThreadBuffer()

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
__host__ __device__ constexpr auto & ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::GetCThreadBuffer ( )
inlineconstexpr

◆ GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
__host__ static __device__ constexpr auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 ( )
inlinestaticconstexpr

◆ GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
__host__ static __device__ constexpr auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 ( )
inlinestaticconstexpr

◆ GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
__host__ static __device__ constexpr auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4 ( )
inlinestaticconstexpr

◆ GetWaveIdx()

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
__device__ auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::GetWaveIdx ( )
inlinestatic

◆ MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
template<typename CGridDesc_G_M_N>
__host__ static __device__ constexpr auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 ( const CGridDesc_G_M_N & c_grid_desc_g_m_n)
inlinestaticconstexpr

◆ MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
template<typename CGridDesc_M_N>
__host__ static __device__ constexpr auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 ( const CGridDesc_M_N & c_grid_desc_m_n)
inlinestaticconstexpr

◆ Run()

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
template<typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
__device__ void ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::Run ( const ABlockBuffer & a_block_buf,
const BBlockBuffer & b_block_buf,
CThreadBuffer & c_thread_buf ) const
inline

Member Data Documentation

◆ a_block_desc_m0_m1_m2_k

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
AMmaTileDesc ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::a_block_desc_m0_m1_m2_k
staticconstexpr

◆ A_K0

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
index_t ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::A_K0 = ATileDesc{}.GetLength(I0)
staticconstexpr

◆ A_K1

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
index_t ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::A_K1 = ATileDesc{}.GetLength(I2)
staticconstexpr

◆ a_thread_copy_

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
AThreadCopy ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::a_thread_copy_
protected

◆ a_thread_desc_

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::a_thread_desc_
staticconstexprprotected
Initial value:
=
integral_constant< index_t, N > Number
Definition number.hpp:12
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
static constexpr auto I1
Definition blockwise_gemm_dpp.hpp:35

◆ b_block_desc_n0_n1_n2_k

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
BMmaTileDesc ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::b_block_desc_n0_n1_n2_k
staticconstexpr

◆ B_K0

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
index_t ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::B_K0 = BTileDesc{}.GetLength(I0)
staticconstexpr

◆ B_K1

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
index_t ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::B_K1 = BTileDesc{}.GetLength(I2)
staticconstexpr

◆ b_thread_copy_

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
BThreadCopy ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::b_thread_copy_
protected

◆ b_thread_desc_

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::b_thread_desc_
staticconstexprprotected

◆ c_thread_buf_

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr, FloatAcc, MRepeat * NRepeat, xdlops_gemm.GetRegSizePerXdlops(), true> ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::c_thread_buf_

◆ c_thread_desc_

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::c_thread_desc_
staticconstexprprotected
Initial value:
make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, xdlops_gemm.GetRegSizePerXdlops()))
static constexpr auto xdlops_gemm
Definition blockwise_gemm_xdlops.hpp:707

◆ I0

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::I0 = Number<0>{}
staticconstexpr

◆ I1

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::I1 = Number<1>{}
staticconstexpr

◆ I2

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::I2 = Number<2>{}
staticconstexpr

◆ I3

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::I3 = Number<3>{}
staticconstexpr

◆ KPerThread

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
index_t ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops
staticconstexpr

◆ MWaves

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
index_t ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::MWaves = MPerBlock / (MRepeat * MPerXDL)
staticconstexpr

◆ NWaves

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
index_t ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::NWaves = NPerBlock / (NRepeat * NPerXDL)
staticconstexpr

◆ WaveSize

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
index_t ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::WaveSize = BlockSize / MWaves / NWaves
staticconstexpr

◆ xdlops_gemm

template<index_t BlockSize, typename FloatAB, typename FloatAcc, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false, index_t AMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops, index_t BMmaKStride = KPack * XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack, FloatAB, TransposeC>{}.K0PerXdlops>
auto ck::BlockwiseGemmXdlops_v2< BlockSize, FloatAB, FloatAcc, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, TransposeC, AMmaKStride, BMmaKStride >::xdlops_gemm
staticconstexpr

The documentation for this struct was generated from the following file: