DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched > Struct Template Reference

DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched > Struct Template Reference

Composable Kernel: ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched > Struct Template Reference
ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched > Struct Template Reference

#include <device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp>

Inheritance diagram for ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >:
ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, D0sDataType, B1DataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation > ck::tensor_operation::device::BaseOperator

Classes

struct  ComputeBasePtrOfStridedBatch
struct  Argument
struct  Invoker

Public Types

using DeviceOp = DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
using A0GridDesc_M_K = decltype(MakeA0GridDescriptor_M_K(1, 1, 1))
using B0GridDesc_N_K = decltype(MakeB0GridDescriptor_N_K(1, 1, 1))
using D0sGridDesc_M_N = remove_cvref_t<decltype(MakeD0sGridDescriptor_M_N({}, {}, {}))>
using B1GridDesc_N_K = decltype(MakeB1GridDescriptor_N_K(1, 1, 1))
using D1sGridDesc_M_N = remove_cvref_t<decltype(MakeD1sGridDescriptor_M_N({}, {}, {}))>
using E1GridDesc_M_N = decltype(MakeE1GridDescriptor_M_N<E1Layout>(1, 1, 1))
template<index_t Gemm0MXdlPerWave_>
using GridwiseGemmBase
using GridwiseGemm64 = GridwiseGemmBase<math::max(Gemm0MXdlPerWave64, 1)>
using GridwiseGemm32 = GridwiseGemmBase<Gemm0MXdlPerWave32>
using A0GridDesc_AK0_M_AK1
using B0GridDesc_BK0_N_BK1
using B1GridDesc_BK0_N_BK1

Public Member Functions

bool IsSupportedArgument (const BaseArgument *p_arg) override
std::unique_ptr< BaseArgument > MakeArgumentPointer (const void *p_a0, const void *p_b0, std::array< const void *, NumD0Tensor > p_d0s, const void *p_b1, std::array< const void *, NumD1Tensor > p_d1s, void *p_e1, index_t MRaw, index_t NRaw, index_t KRaw, index_t Gemm1NRaw, index_t Batch, index_t StrideA0, index_t StrideB0, std::array< ck::index_t, NumD0Tensor > StrideD0s, index_t StrideB1, std::array< ck::index_t, NumD1Tensor > StrideD1s, index_t StrideE1, index_t BatchStrideA0, index_t BatchStrideB0, std::array< ck::index_t, NumD0Tensor > BatchStrideD0s, index_t BatchStrideB1, std::array< ck::index_t, NumD1Tensor > BatchStrideD1s, index_t BatchStrideE1, A0ElementwiseOperation a0_element_op, B0ElementwiseOperation b0_element_op, CDE0ElementwiseOperation cde0_element_op, B1ElementwiseOperation b1_element_op, CDE1ElementwiseOperation cde1_element_op) override
std::unique_ptr< BaseInvoker > MakeInvokerPointer () override
std::string GetTypeString () const override
Public Member Functions inherited from ck::tensor_operation::device::BaseOperator
 BaseOperator ()=default
 BaseOperator (const BaseOperator &)=default
BaseOperator & operator= (const BaseOperator &)=default
virtual std::string GetInstanceString () const
virtual std::string GetTypeIdName () const
virtual std::optional< std::string > GetObjectName () const
virtual std::optional< std::string > GetTemplateInfo () const
virtual std::string GetTypeIdHashCode () const
virtual size_t GetWorkSpaceSize (const BaseArgument *) const
virtual void SetWorkSpacePointer (BaseArgument *p_arg, void *p_workspace, const StreamConfig &=StreamConfig{}) const
virtual ~BaseOperator ()

Static Public Member Functions

static auto MakeA0GridDescriptor_M_K (index_t MRaw, index_t KRaw, index_t StrideA0)
static auto MakeB0GridDescriptor_N_K (index_t KRaw, index_t NRaw, index_t StrideB)
template<typename DLay>
static auto MakeD0GridDescriptor_M_N (index_t MRaw, index_t NRaw, index_t StrideD0)
static auto MakeB1GridDescriptor_N_K (index_t KRaw, index_t NRaw, index_t StrideB)
template<typename ELay>
static auto MakeE1GridDescriptor_M_N (index_t MRaw, index_t NRaw, index_t StrideE1)
static auto MakeD0sGridDescriptor_M_N (const std::array< index_t, NumD1Tensor > &MRaws, const std::array< index_t, NumD1Tensor > &NRaws, const std::array< index_t, NumD1Tensor > &DsStride)
static auto MakeD1sGridDescriptor_M_N (const std::array< index_t, NumD1Tensor > &MRaws, const std::array< index_t, NumD1Tensor > &NRaws, const std::array< index_t, NumD1Tensor > &DsStride)
static constexpr bool IsValidCompilationParameter ()
template<typename RefLayout, typename DsLayout, const index_t NumDTensor>
static bool CheckDLayout ()
static bool IsSupportedArgument (const Argument &arg)
static auto MakeArgument (const A0DataType *p_a0, const B0DataType *p_b0, std::array< const void *, NumD0Tensor > p_d0s, const B1DataType *p_b1, std::array< const void *, NumD1Tensor > p_d1s, E1DataType *p_e1, index_t MRaw, index_t NRaw, index_t KRaw, index_t Gemm1NRaw, index_t Batch, index_t StrideA0, index_t StrideB0, std::array< index_t, NumD0Tensor > StrideD0s, index_t StrideB1, std::array< index_t, NumD1Tensor > StrideD1s, index_t StrideE1, index_t BatchStrideA0, index_t BatchStrideB0, std::array< index_t, NumD0Tensor > BatchStrideD0s, index_t BatchStrideB1, std::array< index_t, NumD1Tensor > BatchStrideD1s, index_t BatchStrideE1, A0ElementwiseOperation a0_element_op, B0ElementwiseOperation b0_element_op, CDE0ElementwiseOperation cde0_element_op, B1ElementwiseOperation b1_element_op, CDE1ElementwiseOperation cde1_element_op)
static auto MakeInvoker ()

Static Public Attributes

static constexpr auto Gemm0MXdlPerWave64
static constexpr auto Gemm0MXdlPerWave32
static constexpr index_t NumD0Tensor = D0sDataType::Size()
static constexpr index_t NumD1Tensor = D1sDataType::Size()
static constexpr auto I0 = Number<0>{}
static constexpr auto I1 = Number<1>{}
static constexpr auto I2 = Number<2>{}
static constexpr auto I3 = Number<3>{}
static constexpr auto I4 = Number<4>{}
static constexpr auto I5 = Number<5>{}
static constexpr auto I6 = Number<6>{}
static constexpr auto I7 = Number<7>{}
static constexpr auto I8 = Number<8>{}
static constexpr auto I9 = Number<9>{}
static constexpr auto gemm0_padder
static constexpr auto gemm1_padder
Static Public Attributes inherited from ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, D0sDataType, B1DataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation >
static constexpr index_t NumD0Tensor = D0sDataType::Size()
static constexpr index_t NumD1Tensor = D1sDataType::Size()

Member Typedef Documentation

◆ A0GridDesc_AK0_M_AK1

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::A0GridDesc_AK0_M_AK1
Initial value:
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeDefaultA0GridDescriptor_AK0_M_AK1
__host__ static __device__ constexpr auto MakeDefaultA0GridDescriptor_AK0_M_AK1(const A0GridDesc_M_K &a0_grid_desc_m_k)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:369
decltype(MakeA0GridDescriptor_M_K(1, 1, 1)) A0GridDesc_M_K
Definition device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp:457

◆ A0GridDesc_M_K

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::A0GridDesc_M_K = decltype(MakeA0GridDescriptor_M_K(1, 1, 1))

◆ B0GridDesc_BK0_N_BK1

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::B0GridDesc_BK0_N_BK1
Initial value:
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeDefaultB0GridDescriptor_BK0_N_BK1
__host__ static __device__ constexpr auto MakeDefaultB0GridDescriptor_BK0_N_BK1(const B0GridDesc_N_K &b0_grid_desc_n_k)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:386
decltype(MakeB0GridDescriptor_N_K(1, 1, 1)) B0GridDesc_N_K
Definition device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp:458

◆ B0GridDesc_N_K

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::B0GridDesc_N_K = decltype(MakeB0GridDescriptor_N_K(1, 1, 1))

◆ B1GridDesc_BK0_N_BK1

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::B1GridDesc_BK0_N_BK1
Initial value:
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeDefaultB1GridDescriptor_BK0_N_BK1
__host__ static __device__ constexpr auto MakeDefaultB1GridDescriptor_BK0_N_BK1(const B1GridDesc_N_K &b1_grid_desc_n_k)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:443
decltype(MakeB1GridDescriptor_N_K(1, 1, 1)) B1GridDesc_N_K
Definition device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp:460

◆ B1GridDesc_N_K

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::B1GridDesc_N_K = decltype(MakeB1GridDescriptor_N_K(1, 1, 1))

◆ D0sGridDesc_M_N

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::D0sGridDesc_M_N = remove_cvref_t<decltype(MakeD0sGridDescriptor_M_N({}, {}, {}))>

◆ D1sGridDesc_M_N

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::D1sGridDesc_M_N = remove_cvref_t<decltype(MakeD1sGridDescriptor_M_N({}, {}, {}))>

◆ DeviceOp

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::DeviceOp = DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle

◆ E1GridDesc_M_N

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::E1GridDesc_M_N = decltype(MakeE1GridDescriptor_M_N<E1Layout>(1, 1, 1))

◆ GridwiseGemm32

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::GridwiseGemm32 = GridwiseGemmBase<Gemm0MXdlPerWave32>

◆ GridwiseGemm64

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::GridwiseGemm64 = GridwiseGemmBase<math::max(Gemm0MXdlPerWave64, 1)>

◆ GridwiseGemmBase

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
template<index_t Gemm0MXdlPerWave_>
using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::GridwiseGemmBase

Member Function Documentation

◆ CheckDLayout()

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
template<typename RefLayout, typename DsLayout, const index_t NumDTensor>
bool ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::CheckDLayout ( )
inline static

◆ GetTypeString()

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
std::string ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::GetTypeString ( ) const
inline override virtual

◆ IsSupportedArgument() [1/2]

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
bool ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::IsSupportedArgument ( const Argument & arg)
inline static

◆ IsSupportedArgument() [2/2]

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
bool ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::IsSupportedArgument ( const BaseArgument * p_arg)
inline override virtual

◆ IsValidCompilationParameter()

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
constexpr bool ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::IsValidCompilationParameter ( )
inline static constexpr

◆ MakeA0GridDescriptor_M_K()

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeA0GridDescriptor_M_K ( index_t MRaw,
index_t KRaw,
index_t StrideA0 )
inline static

◆ MakeArgument()

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeArgument ( const A0DataType * p_a0,
const B0DataType * p_b0,
std::array< const void *, NumD0Tensor > p_d0s,
const B1DataType * p_b1,
std::array< const void *, NumD1Tensor > p_d1s,
E1DataType * p_e1,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t Gemm1NRaw,
index_t Batch,
index_t StrideA0,
index_t StrideB0,
std::array< index_t, NumD0Tensor > StrideD0s,
index_t StrideB1,
std::array< index_t, NumD1Tensor > StrideD1s,
index_t StrideE1,
index_t BatchStrideA0,
index_t BatchStrideB0,
std::array< index_t, NumD0Tensor > BatchStrideD0s,
index_t BatchStrideB1,
std::array< index_t, NumD1Tensor > BatchStrideD1s,
index_t BatchStrideE1,
A0ElementwiseOperation a0_element_op,
B0ElementwiseOperation b0_element_op,
CDE0ElementwiseOperation cde0_element_op,
B1ElementwiseOperation b1_element_op,
CDE1ElementwiseOperation cde1_element_op )
inline static

◆ MakeArgumentPointer()

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
std::unique_ptr< BaseArgument > ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeArgumentPointer ( const void * p_a0,
const void * p_b0,
std::array< const void *, NumD0Tensor > p_d0s,
const void * p_b1,
std::array< const void *, NumD1Tensor > p_d1s,
void * p_e1,
index_t MRaw,
index_t NRaw,
index_t KRaw,
index_t Gemm1NRaw,
index_t Batch,
index_t StrideA0,
index_t StrideB0,
std::array< ck::index_t, NumD0Tensor > StrideD0s,
index_t StrideB1,
std::array< ck::index_t, NumD1Tensor > StrideD1s,
index_t StrideE1,
index_t BatchStrideA0,
index_t BatchStrideB0,
std::array< ck::index_t, NumD0Tensor > BatchStrideD0s,
index_t BatchStrideB1,
std::array< ck::index_t, NumD1Tensor > BatchStrideD1s,
index_t BatchStrideE1,
A0ElementwiseOperation a0_element_op,
B0ElementwiseOperation b0_element_op,
CDE0ElementwiseOperation cde0_element_op,
B1ElementwiseOperation b1_element_op,
CDE1ElementwiseOperation cde1_element_op )
inline override virtual

◆ MakeB0GridDescriptor_N_K()

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeB0GridDescriptor_N_K ( index_t KRaw,
index_t NRaw,
index_t StrideB )
inline static

◆ MakeB1GridDescriptor_N_K()

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeB1GridDescriptor_N_K ( index_t KRaw,
index_t NRaw,
index_t StrideB )
inline static

◆ MakeD0GridDescriptor_M_N()

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
template<typename DLay>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeD0GridDescriptor_M_N ( index_t MRaw,
index_t NRaw,
index_t StrideD0 )
inline static

◆ MakeD0sGridDescriptor_M_N()

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeD0sGridDescriptor_M_N ( const std::array< index_t, NumD1Tensor > & MRaws,
const std::array< index_t, NumD1Tensor > & NRaws,
const std::array< index_t, NumD1Tensor > & DsStride )
inline static

◆ MakeD1sGridDescriptor_M_N()

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeD1sGridDescriptor_M_N ( const std::array< index_t, NumD1Tensor > & MRaws,
const std::array< index_t, NumD1Tensor > & NRaws,
const std::array< index_t, NumD1Tensor > & DsStride )
inline static

◆ MakeE1GridDescriptor_M_N()

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
template<typename ELay>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeE1GridDescriptor_M_N ( index_t MRaw,
index_t NRaw,
index_t StrideE1 )
inline static

◆ MakeInvoker()

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeInvoker ( )
inline static

◆ MakeInvokerPointer()

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
std::unique_ptr< BaseInvoker > ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeInvokerPointer ( )
inline override virtual

Member Data Documentation

◆ gemm0_padder

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::gemm0_padder
static constexpr
Initial value:
= GemmPadder_v2<PadGemm0M, PadGemm0N, PadGemm0K, index_t, index_t, index_t>{
Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock}
Definition matrix_padder.hpp:204

◆ Gemm0MXdlPerWave32

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::Gemm0MXdlPerWave32
static constexpr
Initial value:
= GetNXdlPerWave2<BlockSize,
Gemm0NPerBlock,
Gemm0MPerBlock,
Gemm0NPerXdl,
Gemm0MPerXdl,
Gemm0NXdlPerWave,
false>()

◆ Gemm0MXdlPerWave64

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::Gemm0MXdlPerWave64
static constexpr
Initial value:
= GetNXdlPerWave2<BlockSize,
Gemm0NPerBlock,
Gemm0MPerBlock,
Gemm0NPerXdl,
Gemm0MPerXdl,
Gemm0NXdlPerWave,
true>()

◆ gemm1_padder

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::gemm1_padder
static constexpr
Initial value:
= GemmPadder_v2<PadGemm0M, PadGemm1N, PadGemm1K, index_t, index_t, index_t>{
Gemm0MPerBlock, Gemm1NPerBlock, Gemm1KPerBlock}

◆ I0

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::I0 = Number<0>{}
static constexpr

◆ I1

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::I1 = Number<1>{}
static constexpr

◆ I2

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::I2 = Number<2>{}
static constexpr

◆ I3

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::I3 = Number<3>{}
static constexpr

◆ I4

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::I4 = Number<4>{}
static constexpr

◆ I5

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::I5 = Number<5>{}
static constexpr

◆ I6

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::I6 = Number<6>{}
static constexpr

◆ I7

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::I7 = Number<7>{}
static constexpr

◆ I8

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::I8 = Number<8>{}
static constexpr

◆ I9

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
auto ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::I9 = Number<9>{}
static constexpr

◆ NumD0Tensor

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
index_t ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::NumD0Tensor = D0sDataType::Size()
static constexpr

◆ NumD1Tensor

template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
index_t ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::NumD1Tensor = D1sDataType::Size()
static constexpr

The documentation for this struct was generated from the following file: