From d7e7fbdcff0346a99f29d9e6c671d2f5b6015409 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Thu, 15 Jan 2026 23:22:16 -0600 Subject: [PATCH] Add generate_identity_sequences helper for common pattern This adds an optimized helper for the common generate_tuple pattern: generate_tuple([](auto i) { return Sequence{}; }, N) The new generate_identity_sequences() function creates Tuple, Sequence<1>, ..., Sequence> without requiring lambda instantiation at each call site. Updated 21 call sites across threadwise_tensor_slice_transfer, wrapper utilities, and layout files to use the new helper. Build time improvement: ~1.1% wall-clock (18.3s -> 18.1s) --- .../threadwise_tensor_slice_transfer_v3r1.hpp | 6 ++--- ...ise_tensor_slice_transfer_v3r1_dequant.hpp | 9 +++----- ...wise_tensor_slice_transfer_v3r1_gather.hpp | 6 ++--- .../threadwise_tensor_slice_transfer_v3r2.hpp | 6 ++--- .../threadwise_tensor_slice_transfer_v7r2.hpp | 6 ++--- .../threadwise_tensor_slice_transfer_v7r3.hpp | 6 ++--- ...ise_tensor_slice_transfer_v7r3_scatter.hpp | 6 ++--- include/ck/utility/tuple_helper.hpp | 22 +++++++++++++++++++ include/ck/wrapper/layout.hpp | 3 +-- include/ck/wrapper/operations/gemm.hpp | 3 +-- include/ck/wrapper/tensor.hpp | 3 +-- include/ck/wrapper/utils/layout_utils.hpp | 6 ++--- include/ck/wrapper/utils/tensor_partition.hpp | 3 +-- 13 files changed, 43 insertions(+), 42 deletions(-) diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp index 610d03ca108..73e479c8079 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp @@ -866,8 +866,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 }, Number{}); - constexpr auto up_dim_idss = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + constexpr auto up_dim_idss = generate_identity_sequences(); return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } @@ -925,8 +924,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 }, Number{}); - constexpr auto up_dim_idss = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + constexpr auto up_dim_idss = generate_identity_sequences(); return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_dequant.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_dequant.hpp index 6eb4b21e216..2ddb34671a6 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_dequant.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_dequant.hpp @@ -894,8 +894,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_dequant }, Number{}); - constexpr auto up_dim_idss = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + constexpr auto up_dim_idss = generate_identity_sequences(); return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } @@ -944,8 +943,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_dequant }, Number{}); - constexpr auto up_dim_idss = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + constexpr auto up_dim_idss = generate_identity_sequences(); return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } @@ -993,8 +991,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_dequant }, Number{}); - constexpr auto up_dim_idss = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + constexpr auto up_dim_idss = generate_identity_sequences(); return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp index 2077eeebd79..e080d7eeac7 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp @@ -833,8 +833,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_gather }, Number{}); - constexpr auto up_dim_idss = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + constexpr auto up_dim_idss = generate_identity_sequences(); return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } @@ -892,8 +891,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_gather }, Number{}); - constexpr auto up_dim_idss = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + constexpr auto up_dim_idss = generate_identity_sequences(); return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r2.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r2.hpp index 56ae553f2f0..3c7291cca31 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r2.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r2.hpp @@ -692,8 +692,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2 }, Number{}); - constexpr auto up_dim_idss = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + constexpr auto up_dim_idss = generate_identity_sequences(); return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } @@ -744,8 +743,7 @@ struct ThreadwiseTensorSliceTransfer_v3r2 }, Number{}); - constexpr auto up_dim_idss = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + constexpr auto up_dim_idss = generate_identity_sequences(); return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp index 87cecc75740..6326f6cbda2 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp @@ -514,8 +514,7 @@ struct ThreadwiseTensorSliceTransfer_v7r2 }, Number{}); - constexpr auto up_dim_idss = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + constexpr auto up_dim_idss = generate_identity_sequences(); return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } @@ -563,8 +562,7 @@ struct ThreadwiseTensorSliceTransfer_v7r2 }, Number{}); - constexpr auto up_dim_idss = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + constexpr auto up_dim_idss = generate_identity_sequences(); return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3.hpp index 262702c693e..574da10852f 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3.hpp @@ -656,8 +656,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3 }, Number{}); - constexpr auto up_dim_idss = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + constexpr auto up_dim_idss = generate_identity_sequences(); return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } @@ -706,8 +705,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3 }, Number{}); - constexpr auto up_dim_idss = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + constexpr auto up_dim_idss = generate_identity_sequences(); return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp index fe975f4e36b..732922c1576 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp @@ -548,8 +548,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter }, Number{}); - constexpr auto up_dim_idss = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + constexpr auto up_dim_idss = generate_identity_sequences(); return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } @@ -598,8 +597,7 @@ struct ThreadwiseTensorSliceTransfer_v7r3_scatter }, Number{}); - constexpr auto up_dim_idss = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + constexpr auto up_dim_idss = generate_identity_sequences(); return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); } diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp index 294d7e7c7dc..22f1ded15fc 100644 --- a/include/ck/utility/tuple_helper.hpp +++ b/include/ck/utility/tuple_helper.hpp @@ -37,6 +37,28 @@ __host__ __device__ constexpr auto generate_tie(F&& f, Number) typename arithmetic_sequence_gen<0, N, 1>::type{}); } +// Optimized helper for common pattern: generate_tuple([](auto i) { return Sequence{}; }, +// N) Creates Tuple, Sequence<1>, ..., Sequence> without lambda instantiation +namespace detail { +template +__host__ __device__ constexpr auto make_identity_sequences_impl(Sequence) +{ + return make_tuple(Sequence{}...); +} +} // namespace detail + +template +__host__ __device__ constexpr auto generate_identity_sequences() +{ + return detail::make_identity_sequences_impl(make_index_sequence{}); +} + +template +__host__ __device__ constexpr auto generate_identity_sequences(Number) +{ + return generate_identity_sequences(); +} + // tx and ty are tuple of references, return type of will tuple of referennce (not rvalue) template __host__ __device__ constexpr auto concat_tuple_of_reference(const Tuple& tx, diff --git a/include/ck/wrapper/layout.hpp b/include/ck/wrapper/layout.hpp index 334d5851db0..2f81a44f399 100644 --- a/include/ck/wrapper/layout.hpp +++ b/include/ck/wrapper/layout.hpp @@ -242,8 +242,7 @@ struct Layout const auto lower_dims = generate_tuple([&](auto i) { return GenerateLowerDim>(shape); }, Number::Size()>{}); - const auto upper_dims = generate_tuple([&](auto i) { return Sequence{}; }, - Number::Size()>{}); + const auto upper_dims = generate_identity_sequences::Size()>(); return transform_tensor_descriptor(desc, transforms, lower_dims, upper_dims); } diff --git a/include/ck/wrapper/operations/gemm.hpp b/include/ck/wrapper/operations/gemm.hpp index d328ac7d42f..46142bd1de1 100644 --- a/include/ck/wrapper/operations/gemm.hpp +++ b/include/ck/wrapper/operations/gemm.hpp @@ -259,8 +259,7 @@ make_blockwise_gemm_xdl_c_local_partition(CTensorType& c_local_tile_tensor) const auto partition_desc = BlockwiseGemmXdlops::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2( layout(c_local_tile_tensor).GetUnrolledDescriptor()); - const auto lower_upper_dims = - generate_tuple([&](auto i) { return Sequence{}; }, Number<8>{}); + const auto lower_upper_dims = generate_identity_sequences<8>(); auto sliced_desc = transform_tensor_descriptor( partition_desc, diff --git a/include/ck/wrapper/tensor.hpp b/include/ck/wrapper/tensor.hpp index 9f8278a3578..120f0c694db 100644 --- a/include/ck/wrapper/tensor.hpp +++ b/include/ck/wrapper/tensor.hpp @@ -187,8 +187,7 @@ __host__ __device__ constexpr auto GenerateSlicedDescriptor(const Tuple& const auto transforms = GenerateSliceTransforms(idx, shape); using TransformsTupleType = decltype(transforms); - const auto lower_dims = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + const auto lower_dims = generate_identity_sequences(); const auto upper_dims = decltype(GenerateUpperDims<0>(TransformsTupleType{})){}; return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); } diff --git a/include/ck/wrapper/utils/layout_utils.hpp b/include/ck/wrapper/utils/layout_utils.hpp index 8dd111b8721..e9686de6e79 100644 --- a/include/ck/wrapper/utils/layout_utils.hpp +++ b/include/ck/wrapper/utils/layout_utils.hpp @@ -186,8 +186,7 @@ __host__ __device__ constexpr auto get(const Layout& layout }, Number{}); - const auto lower_dims = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + const auto lower_dims = generate_identity_sequences(); const auto upper_dims = generate_tuple( [&](auto i) { if constexpr(i < shape_offset || i >= shape_offset + new_shape_dims) @@ -492,8 +491,7 @@ __host__ __device__ constexpr auto unmerge(const Layout& la }, Number{}); - constexpr auto lower_dims = - generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + constexpr auto lower_dims = generate_identity_sequences(); constexpr auto upper_dims = generate_tuple( [&](auto i) { if constexpr(is_detected>::value) diff --git a/include/ck/wrapper/utils/tensor_partition.hpp b/include/ck/wrapper/utils/tensor_partition.hpp index 5099f35cdab..34986c270bc 100644 --- a/include/ck/wrapper/utils/tensor_partition.hpp +++ b/include/ck/wrapper/utils/tensor_partition.hpp @@ -293,8 +293,7 @@ make_local_partition(TensorType& tensor, }, Number::Size()>{}); const auto lower_upper_dims = - generate_tuple([&](auto i) { return Sequence{}; }, - Number::Size()>{}); + generate_identity_sequences::Size()>(); auto sliced_desc = transform_tensor_descriptor(unrolled_desc, transforms, lower_upper_dims, lower_upper_dims); // Create layout