22template <
typename SliceLengths,
23 typename SrcElementwiseOperation,
24 typename DstElementwiseOperation,
30 typename SrcDimAccessOrder,
31 typename DstDimAccessOrder,
36 index_t SrcScalarStrideInVector,
37 index_t DstScalarStrideInVector,
38 bool SrcResetCoordinateAfterRun,
41 bool DstResetCoordinateAfterRun,
86 const SrcDesc& src_desc,
87 const Index& src_slice_origin,
88 const SrcElementwiseOperation& src_element_op,
89 const DstDesc& dst_desc,
90 const Index& dst_slice_origin,
91 const DstElementwiseOperation& dst_element_op,
95 src_element_op_(src_element_op),
96 dst_element_op_(dst_element_op),
97 gather_offsets_(gather_offsets)
102 "SrcData != DstData");
106 "SrcScalarPerVector_ and DstScalarPerVector_ cannot be 1 for packed data type");
108 static_assert(SrcVectorDim == DstVectorDim,
109 "Packed data type does not support transpose");
116 auto adjusted_origin_idx = [&]() {
119 idx(i) = i.value == GatherDim ? 0 : src_slice_origin_idx[
Number<i>{}];
131 template <
typename SrcBuffer, index_t ThreadScratchId = 0>
132 __device__
void RunRead(
const SrcDesc& src_desc,
133 const SrcBuffer& src_buf,
142 "wrong! SrcBuffer and SrcData data type are inconsistent");
149 constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
151 static_assert(SliceLengths::At(SrcVectorDim) % (SrcScalarPerVector_) == 0,
152 "SliceLengths[SrcVectorDim] must be divisible by SrcScalarPerVector");
154 constexpr auto src_dim_access_order = SrcDimAccessOrder{};
155 constexpr auto ordered_gather_dim = src_dim_access_order[GatherDim];
156 constexpr auto ordered_src_access_lengths =
162 Index forward_step_idx;
164 static_for<0, nDim, 1>{}([&](
auto j) {
165 forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
175 Index backward_step_idx;
177 static_for<0, nDim, 1>{}([&](
auto j) {
178 backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
186 static_ford<
decltype(ordered_src_access_lengths)>{}([&](
auto ordered_src_access_idx) {
188 constexpr auto forward_sweep = [&]() {
191 forward_sweep_(
I0) =
true;
193 static_for<1, nDim, 1>{}([&](
auto i) {
194 index_t tmp = ordered_src_access_idx[
I0];
196 static_for<1, i, 1>{}([&](
auto j) {
197 tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
200 forward_sweep_(i) = tmp % 2 == 0;
203 return forward_sweep_;
207 constexpr auto src_data_idx = [&]() {
210 static_for<0, nDim, 1>{}([&](
auto i) {
211 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
212 : ordered_src_access_lengths[i] - 1 -
213 ordered_src_access_idx[i];
217 src_scalar_per_access;
226 const IndexType ld_offset = src_coord_.GetOffset() /
PackedSize + gather_offset;
227 src_oob_thread_scratch_tuple_(thread_scratch_id)
228 .template SetAsType<bool>(src_data_idx_seq,
true);
231 using src_vector_t =
typename src_vector_type::type;
233 auto src_vector_container =
234 src_vector_type{src_buf.template Get<src_vector_t>(ld_offset,
true)};
237 using dst_vector_t =
typename dst_vector_type::type;
238 dst_vector_type op_r_v;
240 constexpr auto get_elem_op_vec_len = []() {
243 if constexpr(
decltype(src_element_op_)::is_pack8_invocable)
247 decltype(src_element_op_)>
::value)
249 if constexpr(
decltype(src_element_op_)::is_pack4_invocable)
253 decltype(src_element_op_)>
::value)
255 if constexpr(
decltype(src_element_op_)::is_pack2_invocable)
264 constexpr index_t elem_op_vec_len = get_elem_op_vec_len();
266 using src_elem_op_vec_t =
typename vector_type<SrcData, elem_op_vec_len>::type;
267 using dst_elem_op_vec_t =
typename vector_type<DstData, elem_op_vec_len>::type;
271 src_element_op_(op_r_v.template AsType<dst_elem_op_vec_t>()(idx),
272 src_vector_container.template AsType<src_elem_op_vec_t>()[idx]);
276 src_thread_scratch_tuple_(thread_scratch_id)
277 .template SetAsType<dst_vector_t>(src_data_idx_seq,
278 op_r_v.template AsType<dst_vector_t>()[
I0]);
280 auto move_on_dim = [&]()
constexpr {
283 static_for<0, nDim, 1>{}([&](
auto i) {
284 move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;
286 static_for<i + 1, nDim, 1>{}([&](
auto j) {
288 ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
290 move_on_dim_(i) &= i.value != ordered_gather_dim;
296 static_for<0, nDim, 1>{}([&](
auto i) {
299 if constexpr(forward_sweep[i])
302 src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]);
307 src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]);
314 if constexpr(SrcResetCoordinateAfterRun)
316 const auto src_reset_step =
323 template <
typename SeqIdx, index_t ThreadScratchId = 0>
324 __device__
constexpr auto
328 return src_thread_scratch_tuple_(thread_scratch_id).template GetAsType<vector_t>(SeqIdx{});
331 template <index_t ThreadScratchId>
335#if !CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE
337 dst_thread_scratch_(idx) = src_thread_scratch_tuple_[thread_scratch_id][idx];
345 constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
347 constexpr auto src_dim_access_order = SrcDimAccessOrder{};
349 constexpr auto ordered_src_access_lengths =
353 static_ford<
decltype(ordered_src_access_lengths)>{}([&](
auto ordered_src_access_idx) {
355 constexpr auto forward_sweep = [&]() {
358 forward_sweep_(
I0) =
true;
361 index_t tmp = ordered_src_access_idx[
I0];
364 tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
367 forward_sweep_(i) = tmp % 2 == 0;
370 return forward_sweep_;
374 constexpr auto src_data_idx = [&]() {
378 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
379 : ordered_src_access_lengths[i] - 1 -
380 ordered_src_access_idx[i];
384 src_scalar_per_access;
392 auto op_r = src_thread_scratch_tuple_(thread_scratch_id)
393 .template GetAsType<vector_t>(src_data_idx_seq);
397 src_thread_scratch_tuple_(thread_scratch_id)
398 .template SetAsType<vector_t>(src_data_idx_seq, op_r_v);
403 if constexpr(SrcVectorDim != DstVectorDim &&
412 "in-register transpose is not supported for pk_i4_t");
421 static_assert(SrcVectorDim != DstVectorDim,
"wrong");
436 constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
438 static_ford<
decltype(access_lengths)>{}([&](
auto access_idx) {
439 constexpr auto data_idx = access_idx * scalar_per_access;
450 [&](
auto i) ->
const src_vector_t& {
452 return src_thread_scratch_tuple_[thread_scratch_id].GetVectorTypeReference(
453 data_idx_seq + i * dst_scalar_step_in_vector);
459 [&](
auto i) -> dst_vector_t& {
461 return dst_thread_scratch_.GetVectorTypeReference(
462 data_idx_seq + i * src_scalar_step_in_vector);
468 src_vector_refs, dst_vector_refs);
476 constexpr auto packed_access_lengths = SliceLengths{} / packed_per_access;
478 static_ford<
decltype(packed_access_lengths)>{}([&](
auto idx) {
479 dst_thread_scratch_(idx) = src_thread_scratch_tuple_[thread_scratch_id][idx];
485 template <
typename DstBuffer, index_t ThreadScratchId = 0>
501 "wrong! SrcBuffer or DstBuffer data type is wrong");
508 constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
510 constexpr auto dst_dim_access_order = DstDimAccessOrder{};
512 constexpr auto ordered_dst_access_lengths =
518 Index forward_step_idx;
520 static_for<0, nDim, 1>{}([&](
auto j) {
521 forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
531 Index backward_step_idx;
533 static_for<0, nDim, 1>{}([&](
auto j) {
534 backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
542 static_ford<
decltype(ordered_dst_access_lengths)>{}([&](
auto ordered_dst_access_idx) {
544 constexpr auto forward_sweep = [&]() {
547 forward_sweep_(
I0) =
true;
549 static_for<1, nDim, 1>{}([&](
auto i) {
550 index_t tmp = ordered_dst_access_idx[
I0];
552 static_for<1, i, 1>{}([&](
auto j) {
553 tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
556 forward_sweep_(i) = tmp % 2 == 0;
559 return forward_sweep_;
563 constexpr auto dst_data_idx = [&]() {
566 static_for<0, nDim, 1>{}([&](
auto i) {
567 ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
568 : ordered_dst_access_lengths[i] - 1 -
569 ordered_dst_access_idx[i];
573 dst_scalar_per_access;
579 const bool is_dst_valid =
583 using dst_vector_t =
typename dst_vector_type::type;
586 auto dst_vector_container = dst_vector_type{
587 dst_thread_scratch_.template GetAsType<dst_vector_t>(dst_data_idx_seq)};
589 static_for<0, DstScalarPerVector, 1>{}([&](
auto i) {
593 dst_element_op_(dst_v, dst_vector_container.template AsType<DstData>()[i]);
595 dst_vector_container.template AsType<DstData>()(i) = dst_v;
602 dst_vector_container.template AsType<dst_vector_t>()[
I0]);
604 constexpr auto move_on_dim = [&]()
constexpr {
607 static_for<0, nDim, 1>{}([&](
auto i) {
608 move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;
610 static_for<i + 1, nDim, 1>{}([&](
auto j) {
612 ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
620 static_for<0, nDim, 1>{}([&](
auto i) {
621 if constexpr(move_on_dim[i])
623 if constexpr(forward_sweep[i])
626 dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]);
631 dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]);
638 if constexpr(DstResetCoordinateAfterRun)
640 const auto dst_reset_step =
654 constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
656 constexpr auto src_dim_access_order = SrcDimAccessOrder{};
658 constexpr auto ordered_src_access_lengths =
662 constexpr auto forward_sweep = [&]() {
665 forward_sweep_(
I0) =
true;
668 index_t tmp = ordered_src_access_lengths[
I0] - 1;
671 tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
674 forward_sweep_(i) = tmp % 2 == 0;
677 return forward_sweep_;
682 constexpr auto src_data_idx = [&]() {
686 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
690 src_scalar_per_access;
694 constexpr auto reset_src_data_step = [&]() {
695 Index reset_src_data_step_;
698 reset_src_data_step_(i) = i.value == GatherDim ? 0 : -src_data_idx[i];
701 return reset_src_data_step_;
703 return reset_src_data_step;
713 constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
715 constexpr auto dst_dim_access_order = DstDimAccessOrder{};
717 constexpr auto ordered_dst_access_lengths =
721 constexpr auto forward_sweep = [&]() {
724 forward_sweep_(
I0) =
true;
727 index_t tmp = ordered_dst_access_lengths[
I0] - 1;
730 tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
733 forward_sweep_(i) = tmp % 2 == 0;
736 return forward_sweep_;
741 constexpr auto dst_data_idx = [&]() {
745 ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
749 dst_scalar_per_access;
753 constexpr auto reset_dst_data_step = [&]() {
754 Index reset_dst_data_step_;
758 return reset_dst_data_step_;
761 return reset_dst_data_step;
766 const Index& src_slice_origin_step_idx)
769 const auto adjusted_step_idx =
770 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
780 const Index& dst_slice_origin_step_idx)
783 const auto adjusted_step_idx =
784 DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
798 constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
804 constexpr auto desc0 =
810 if constexpr(i == SrcVectorDim)
813 make_tuple(src_access_lengths_and_vector_length[i],
825 if constexpr(i == SrcVectorDim)
836 constexpr auto up_dim_idss =
847 constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
858 constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
863 constexpr auto desc0 =
869 if constexpr(i == DstVectorDim)
872 make_tuple(dst_access_lengths_and_vector_length[i],
884 if constexpr(i == DstVectorDim)
895 constexpr auto up_dim_idss =
903 static constexpr auto src_oob_thread_scratch_desc_ =
907 using SrcThreadScratch =
911 decltype(src_thread_scratch_desc_),
914 using SrcOOBThreadScratch =
918 decltype(src_oob_thread_scratch_desc_),
924 decltype(dst_thread_scratch_desc_),
930 DstThreadScratch dst_thread_scratch_;
934 const SrcElementwiseOperation src_element_op_;
935 const DstElementwiseOperation dst_element_op_;
__host__ __device__ constexpr T min(T x)
Definition utility/math.hpp:116
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
decltype(ck::declval< T & >().is_pack8_invocable) is_pack8_invocable_t
Definition is_detected.hpp:43
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_tensor_coordinate_step(const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
Definition tensor_description/tensor_descriptor.hpp:444
__host__ __device__ constexpr void move_tensor_coordinate(const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step)
Definition tensor_description/tensor_descriptor.hpp:508
__host__ __device__ constexpr auto container_push_back(const Array< TData, NSize > &a, const TData &x)
Definition utility/container_helper.hpp:18
InMemoryDataOperationEnum
Definition ck.hpp:277
@ Set
Definition ck.hpp:278
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
decltype(ck::declval< T & >().is_pack4_invocable) is_pack4_invocable_t
Definition is_detected.hpp:40
__host__ __device__ constexpr bool coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc &tensor_desc, const TensorCoord &coord)
Definition tensor_description/tensor_descriptor.hpp:560
typename detail::detector< nonesuch, void, Op, Args... >::value_t is_detected
Definition is_detected.hpp:34
integral_constant< index_t, N > Number
Definition number.hpp:12
@ Lds
Definition amd_address_space.hpp:18
@ Global
Definition amd_address_space.hpp:17
@ Vgpr
Definition amd_address_space.hpp:20
__host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence< Is... >)
Definition utility/container_helper.hpp:380
__host__ __device__ constexpr auto generate_sequence_v2(F &&f, Number< N >)
Definition sequence_helper.hpp:25
__host__ __device__ constexpr auto container_reorder_given_old2new(const Array< TData, NSize > &old_array, Sequence< IRs... > old2new)
Definition utility/container_helper.hpp:54
constexpr bool is_same_v
Definition type.hpp:283
__host__ __device__ constexpr auto make_merge_transform_v3_division_mod(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:84
__host__ __device__ constexpr auto generate_tuple(F &&f, Number< N >)
Definition tuple_helper.hpp:21
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
decltype(ck::declval< T & >().is_pack2_invocable) is_pack2_invocable_t
Definition is_detected.hpp:37
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__host__ __device__ constexpr auto generate_sequence(F, Number< N >)
Definition sequence_helper.hpp:18
__host__ __device__ constexpr auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition tensor_description/tensor_descriptor.hpp:319
constexpr index_t packed_size_v
Definition data_type.hpp:411
__host__ __device__ constexpr auto make_tensor_coordinate(const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
Definition tensor_description/tensor_descriptor.hpp:407
__host__ __device__ constexpr auto container_reorder_given_new2old(const Array< TData, NSize > &old_array, Sequence< IRs... >)
Definition utility/container_helper.hpp:43
__host__ __device__ constexpr auto generate_tie(F &&f, Number< N >)
Definition tuple_helper.hpp:34
Array< index_t, N > MultiIndex
Definition array_multi_index.hpp:12
typename vector_type_maker< T, N >::type vector_type_maker_t
Definition dtype_vector.hpp:54
const GenericPointer< typename T::ValueType > T2 value
Definition pointer.h:1697
Definition utility/sequence.hpp:43
__device__ void SetDstSliceOrigin(const DstDesc &dst_desc, const Index &dst_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:126
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::Index MultiIndex< nDim > Index
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:50
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::I0 static constexpr auto I0
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:58
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::I4 static constexpr auto I4
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:62
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::I5 static constexpr auto I5
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:63
static __device__ constexpr auto GetSrcCoordinateResetStep()
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:647
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::SrcCoordStep decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) SrcCoordStep
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:55
static __device__ constexpr auto GetDstThreadScratchDescriptor()
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:852
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::I10 static constexpr auto I10
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:67
static __device__ constexpr auto GetSrcOOBThreadScratchDescriptor()
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:842
__device__ void TransferDataFromSrcThreadScratchToDstThreadScratch(Number< ThreadScratchId > thread_scratch_id)
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:333
__device__ constexpr ThreadwiseTensorSliceTransfer_v3r1_gather(const SrcDesc &src_desc, const Index &src_slice_origin, const SrcElementwiseOperation &src_element_op, const DstDesc &dst_desc, const Index &dst_slice_origin, const DstElementwiseOperation &dst_element_op, const StaticallyIndexedArray< IndexType, gather_num > &gather_offsets)
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:85
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::gather_num static constexpr index_t gather_num
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:83
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:765
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::DstCoordStep decltype(make_tensor_coordinate_step(DstDesc{}, Index{})) DstCoordStep
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:56
__device__ void SetSrcSliceOrigin(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:113
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::PackedSize static constexpr index_t PackedSize
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:73
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::I12 static constexpr auto I12
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:68
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::SrcScalarPerVector static constexpr auto SrcScalarPerVector
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:80
__device__ constexpr auto GetSrcThreadScratchIdx(Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:325
__device__ void RunWrite(const DstDesc &dst_desc, DstBuffer &dst_buf, Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:486
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::I13 static constexpr auto I13
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:69
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::I8 static constexpr auto I8
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:66
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::I16 static constexpr auto I16
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:71
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::nDim static constexpr index_t nDim
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:49
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::I6 static constexpr auto I6
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:64
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::SrcCoord decltype(make_tensor_coordinate(SrcDesc{}, Index{})) SrcCoord
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:52
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::I1 static constexpr auto I1
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:59
__device__ void MoveDstSliceWindow(const DstDesc &dst_desc, const Index &dst_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:779
static __device__ constexpr auto GetSrcThreadScratchDescriptor()
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:793
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::I7 static constexpr auto I7
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:65
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::DstScalarPerVector static constexpr auto DstScalarPerVector
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:81
static __device__ constexpr auto GetDstCoordinateResetStep()
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:706
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::I2 static constexpr auto I2
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:60
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::DstCoord decltype(make_tensor_coordinate(DstDesc{}, Index{})) DstCoord
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:53
__device__ void RunRead(const SrcDesc &src_desc, const SrcBuffer &src_buf, Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:132
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::I14 static constexpr auto I14
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:70
ck::ThreadwiseTensorSliceTransfer_v3r1_gather< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::I3 static constexpr auto I3
Definition threadwise_tensor_slice_transfer_v3r1_gather.hpp:61
Definition threadwise_tensor_slice_transfer_util.hpp:43
Definition threadwise_tensor_slice_transfer_util.hpp:20
Definition threadwise_tensor_slice_transfer_util.hpp:29
Definition data_type.hpp:187
Definition functional2.hpp:33
Definition functional3.hpp:97
Definition utility/transpose_vectors.hpp:16
Definition dtype_vector.hpp:30