From 22a409be002d10315fba2273404b2b519f971d7d Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Fri, 16 Jan 2026 00:42:19 -0600 Subject: [PATCH 1/3] Add make_uniform_tuple helper for repeated value patterns Add make_uniform_tuple(value) helper to replace common pattern: generate_tuple([&](auto) { return value; }, Number{}) This avoids unique lambda instantiations when creating tuples with repeated values. Applied to device_grouped_conv_fwd_multiple_abd. --- ...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp | 4 ++-- include/ck/utility/tuple_helper.hpp | 22 +++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp index cc343f6f69a..fb9550ca2a0 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp @@ -699,9 +699,9 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle if constexpr(isMultiA || isMultiB) { const auto as_grid_desc_ak0_m_ak1 = - generate_tuple([&](auto) { return a_grid_desc_m_k_; }, Number{}); + make_uniform_tuple(a_grid_desc_m_k_, Number{}); const auto bs_grid_desc_bk0_n_bk1 = - generate_tuple([&](auto) { return b_grid_desc_n_k_; }, Number{}); + make_uniform_tuple(b_grid_desc_n_k_, Number{}); if(GridwiseGemm::CheckValidity(as_grid_desc_ak0_m_ak1, bs_grid_desc_bk0_n_bk1, diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp index 22f1ded15fc..9ae01792cd6 100644 --- a/include/ck/utility/tuple_helper.hpp +++ b/include/ck/utility/tuple_helper.hpp @@ -59,6 +59,28 @@ __host__ __device__ constexpr auto generate_identity_sequences(Number) return generate_identity_sequences(); } +// Optimized helper for common pattern: generate_tuple([&](auto) { return value; }, Number{}) +// Creates Tuple (N copies) without lambda instantiation +namespace detail { +template +__host__ __device__ constexpr auto make_uniform_tuple_impl(T&& value, Sequence) +{ + return make_tuple(((void)Is, value)...); +} +} // namespace detail + +template +__host__ __device__ constexpr auto make_uniform_tuple(T&& value) +{ + return detail::make_uniform_tuple_impl(static_cast(value), make_index_sequence{}); +} + +template +__host__ __device__ constexpr auto make_uniform_tuple(T&& value, Number) +{ + return make_uniform_tuple(static_cast(value)); +} + // tx and ty are tuple of references, return type of will tuple of referennce (not rvalue) template __host__ __device__ constexpr auto concat_tuple_of_reference(const Tuple& tx, From 0a1e1cc66f5548789c119d8304ef2e70976d9040 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Fri, 16 Jan 2026 01:05:44 -0600 Subject: [PATCH 2/3] Add container_product helper for O(1) depth fold expression Replace O(N) recursive container_reduce with O(1) fold expression for computing products of container elements. This reduces template instantiation depth from 26 to 23 levels. - Add container_product() using unpack + fold expression - Migrate 10 call sites from container_reduce(x, multiplies{}, 1) --- .../multi_index_transform.hpp | 20 ++++++++----------- .../ck/tensor_description/tensor_adaptor.hpp | 3 +-- .../tensor_description/tensor_descriptor.hpp | 3 +-- include/ck/utility/container_helper.hpp | 8 ++++++++ 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/include/ck/tensor_description/multi_index_transform.hpp b/include/ck/tensor_description/multi_index_transform.hpp index 19a47487328..520ec1c012e 100644 --- a/include/ck/tensor_description/multi_index_transform.hpp +++ b/include/ck/tensor_description/multi_index_transform.hpp @@ -487,8 +487,7 @@ struct Merge_v1_carry_check using LowLengthsScan = decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{})); - using UpLengths = - decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{}))); + using UpLengths = decltype(make_tuple(container_product(LowLengths{}))); LowLengths low_lengths_; LowLengthsScan low_lengths_scan_; @@ -500,7 +499,7 @@ struct Merge_v1_carry_check : low_lengths_{low_lengths}, low_lengths_scan_{ container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})}, - up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))} + up_lengths_{make_tuple(container_product(low_lengths))} { static_assert(LowerIndex::Size() == NDimLow, "wrong!"); } @@ -1039,8 +1038,7 @@ struct Merge_v2_magic_division using LowerIndex = MultiIndex; using UpperIndex = MultiIndex<1>; - using UpLengths = - decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{}))); + using UpLengths = decltype(make_tuple(container_product(LowLengths{}))); using LowLengthsMagicDivisorMultipiler = decltype(generate_tuple( lambda_merge_generate_MagicDivision_calculate_magic_multiplier{}, @@ -1065,7 +1063,7 @@ struct Merge_v2_magic_division low_lengths_magic_divisor_shift_{generate_tuple( [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths[i]); }, Number{})}, - up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))} + up_lengths_{make_tuple(container_product(low_lengths))} { static_assert(LowerIndex::Size() == NDimLow, "wrong!"); } @@ -1194,8 +1192,7 @@ struct Merge_v2r2_magic_division using LowLengthsScan = decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{})); - using UpLengths = - decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{}))); + using UpLengths = decltype(make_tuple(container_product(LowLengths{}))); using LowLengthsScanMagicDivisorMultipiler = decltype(generate_tuple( lambda_merge_generate_MagicDivision_calculate_magic_multiplier{}, @@ -1223,7 +1220,7 @@ struct Merge_v2r2_magic_division low_lengths_scan_magic_divisor_shift_{generate_tuple( [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths_scan_[i]); }, Number{})}, - up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))} + up_lengths_{make_tuple(container_product(low_lengths))} { static_assert(LowerIndex::Size() == NDimLow, "wrong!"); } @@ -1344,8 +1341,7 @@ struct Merge_v3_division_mod using LowLengthsScan = decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{})); - using UpLengths = - decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{}))); + using UpLengths = decltype(make_tuple(container_product(LowLengths{}))); LowLengths low_lengths_; LowLengthsScan low_lengths_scan_; @@ -1357,7 +1353,7 @@ struct Merge_v3_division_mod : low_lengths_{low_lengths}, low_lengths_scan_{ container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})}, - up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))} + up_lengths_{make_tuple(container_product(low_lengths))} { static_assert(LowerIndex::Size() == NDimLow, "wrong!"); } diff --git a/include/ck/tensor_description/tensor_adaptor.hpp b/include/ck/tensor_description/tensor_adaptor.hpp index 79c5881d48a..55a44198b20 100644 --- a/include/ck/tensor_description/tensor_adaptor.hpp +++ b/include/ck/tensor_description/tensor_adaptor.hpp @@ -65,8 +65,7 @@ struct TensorAdaptor }, Number{}); - // TODO: make container_reduce support tuple of Number and index_t - return container_reduce(lengths, math::multiplies{}, Number<1>{}); + return container_product(lengths); } template diff --git a/include/ck/tensor_description/tensor_descriptor.hpp b/include/ck/tensor_description/tensor_descriptor.hpp index c153c1f894a..4f827e51ea1 100644 --- a/include/ck/tensor_description/tensor_descriptor.hpp +++ b/include/ck/tensor_description/tensor_descriptor.hpp @@ -69,8 +69,7 @@ struct TensorDescriptor }, Number{}); - // TODO: make container_reduce support tuple of Number and index_t - return container_reduce(lengths, math::multiplies{}, Number<1>{}); + return container_product(lengths); } template diff --git a/include/ck/utility/container_helper.hpp b/include/ck/utility/container_helper.hpp index 8f2fe45796e..79cb657ec6d 100644 --- a/include/ck/utility/container_helper.hpp +++ b/include/ck/utility/container_helper.hpp @@ -189,6 +189,14 @@ __host__ __device__ constexpr auto container_reduce(const Container& x, } #endif +// O(1) template depth alternative to container_reduce for computing products. +// Uses fold expression via unpack instead of O(N) linear recursion. +template +__host__ __device__ constexpr auto container_product(const Container& x) +{ + return unpack([](auto... xs) { return (xs * ...); }, x); +} + template __host__ __device__ constexpr auto container_reverse_inclusive_scan(const Array& x, Reduce f, TData init) From 02e42dcaa1a566a05994b950548b67bbb76cab67 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Fri, 16 Jan 2026 01:23:21 -0600 Subject: [PATCH 3/3] Replace lambdas with named functors in container_concat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lambdas create unique types per call site, causing duplicate template instantiations. Named functors are shared across call sites. Results: - container_concat: 186 → 93 instantiations (50% reduction) - Wall-clock: 518ms → 309ms (40% reduction) --- include/ck/utility/container_helper.hpp | 26 +++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/include/ck/utility/container_helper.hpp b/include/ck/utility/container_helper.hpp index 79cb657ec6d..e09c32d20fc 100644 --- a/include/ck/utility/container_helper.hpp +++ b/include/ck/utility/container_helper.hpp @@ -324,6 +324,26 @@ container_reverse_inclusive_scan(const Tuple& x, Reduce f, TData init) return y; } +// Named functors for container_concat to reduce template instantiations +// (lambdas create unique types per call site, functors are shared) +struct make_tuple_functor +{ + template + __host__ __device__ constexpr auto operator()(Ts&&... xs) const + { + return make_tuple(ck::forward(xs)...); + } +}; + +struct make_array_functor +{ + template + __host__ __device__ constexpr auto operator()(T&& x, Ts&&... xs) const + { + return make_array(ck::forward(x), ck::forward(xs)...); + } +}; + template __host__ __device__ constexpr auto container_concat(const X& x, const Ys&... ys) { @@ -333,15 +353,13 @@ __host__ __device__ constexpr auto container_concat(const X& x, const Ys&... ys) template __host__ __device__ constexpr auto container_concat(const Array& ax, const Array& ay) { - return unpack2( - [&](auto&&... zs) { return make_array(ck::forward(zs)...); }, ax, ay); + return unpack2(make_array_functor{}, ax, ay); } template __host__ __device__ constexpr auto container_concat(const Tuple& tx, const Tuple& ty) { - return unpack2( - [&](auto&&... zs) { return make_tuple(ck::forward(zs)...); }, tx, ty); + return unpack2(make_tuple_functor{}, tx, ty); } template