From 22a409be002d10315fba2273404b2b519f971d7d Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 16 Jan 2026 00:42:19 -0600
Subject: [PATCH 1/3] Add make_uniform_tuple helper for repeated value patterns

Add make_uniform_tuple<N>(value) helper to replace common pattern:
  generate_tuple([&](auto) { return value; }, Number<N>{})

This avoids unique lambda instantiations when creating tuples with
repeated values. Applied to device_grouped_conv_fwd_multiple_abd.
---
 ...ped_conv_fwd_multiple_abd_xdl_cshuffle.hpp |  4 ++--
 include/ck/utility/tuple_helper.hpp           | 22 +++++++++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
index cc343f6f69a..fb9550ca2a0 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
@@ -699,9 +699,9 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
             if constexpr(isMultiA || isMultiB)
             {
                 const auto as_grid_desc_ak0_m_ak1 =
-                    generate_tuple([&](auto) { return a_grid_desc_m_k_; }, Number<NumATensor>{});
+                    make_uniform_tuple(a_grid_desc_m_k_, Number<NumATensor>{});
                 const auto bs_grid_desc_bk0_n_bk1 =
-                    generate_tuple([&](auto) { return b_grid_desc_n_k_; }, Number<NumBTensor>{});
+                    make_uniform_tuple(b_grid_desc_n_k_, Number<NumBTensor>{});
 
                 if(GridwiseGemm::CheckValidity(as_grid_desc_ak0_m_ak1,
                                                bs_grid_desc_bk0_n_bk1,
diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp
index 22f1ded15fc..9ae01792cd6 100644
--- a/include/ck/utility/tuple_helper.hpp
+++ b/include/ck/utility/tuple_helper.hpp
@@ -59,6 +59,28 @@ __host__ __device__ constexpr auto generate_identity_sequences(Number<N>)
     return generate_identity_sequences<N>();
 }
 
+// Optimized helper for common pattern: generate_tuple([&](auto) { return value; }, Number<N>{})
+// Creates Tuple<T, T, ..., T> (N copies) without lambda instantiation
+namespace detail {
+template <typename T, index_t... Is>
+__host__ __device__ constexpr auto make_uniform_tuple_impl(T&& value, Sequence<Is...>)
+{
+    return make_tuple(((void)Is, value)...);
+}
+} // namespace detail
+
+template <index_t N, typename T>
+__host__ __device__ constexpr auto make_uniform_tuple(T&& value)
+{
+    return detail::make_uniform_tuple_impl(static_cast<T&&>(value), make_index_sequence<N>{});
+}
+
+template <typename T, index_t N>
+__host__ __device__ constexpr auto make_uniform_tuple(T&& value, Number<N>)
+{
+    return make_uniform_tuple<N>(static_cast<T&&>(value));
+}
+
 // tx and ty are tuple of references, return type of will tuple of referennce (not rvalue)
 template <typename... X, typename... Y>
 __host__ __device__ constexpr auto concat_tuple_of_reference(const Tuple<X&...>& tx,

From 0a1e1cc66f5548789c119d8304ef2e70976d9040 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 16 Jan 2026 01:05:44 -0600
Subject: [PATCH 2/3] Add container_product helper for O(1) depth fold
 expression

Replace O(N) recursive container_reduce with O(1) fold expression
for computing products of container elements. This reduces template
instantiation depth from 26 to 23 levels.

- Add container_product() using unpack + fold expression
- Migrate 10 call sites from container_reduce(x, multiplies{}, 1)
---
 .../multi_index_transform.hpp                 | 20 ++++++++-----------
 .../ck/tensor_description/tensor_adaptor.hpp  |  3 +--
 .../tensor_description/tensor_descriptor.hpp  |  3 +--
 include/ck/utility/container_helper.hpp       |  8 ++++++++
 4 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/include/ck/tensor_description/multi_index_transform.hpp b/include/ck/tensor_description/multi_index_transform.hpp
index 19a47487328..520ec1c012e 100644
--- a/include/ck/tensor_description/multi_index_transform.hpp
+++ b/include/ck/tensor_description/multi_index_transform.hpp
@@ -487,8 +487,7 @@ struct Merge_v1_carry_check
     using LowLengthsScan =
         decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{}));
 
-    using UpLengths =
-        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
+    using UpLengths = decltype(make_tuple(container_product(LowLengths{})));
 
     LowLengths low_lengths_;
     LowLengthsScan low_lengths_scan_;
@@ -500,7 +499,7 @@ struct Merge_v1_carry_check
         : low_lengths_{low_lengths},
           low_lengths_scan_{
               container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})},
-          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
+          up_lengths_{make_tuple(container_product(low_lengths))}
     {
         static_assert(LowerIndex::Size() == NDimLow, "wrong!");
     }
@@ -1039,8 +1038,7 @@ struct Merge_v2_magic_division
     using LowerIndex = MultiIndex<NDimLow>;
     using UpperIndex = MultiIndex<1>;
 
-    using UpLengths =
-        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
+    using UpLengths = decltype(make_tuple(container_product(LowLengths{})));
 
     using LowLengthsMagicDivisorMultipiler = decltype(generate_tuple(
         lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengths>{},
@@ -1065,7 +1063,7 @@ struct Merge_v2_magic_division
           low_lengths_magic_divisor_shift_{generate_tuple(
               [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths[i]); },
               Number<NDimLow>{})},
-          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
+          up_lengths_{make_tuple(container_product(low_lengths))}
     {
         static_assert(LowerIndex::Size() == NDimLow, "wrong!");
     }
@@ -1194,8 +1192,7 @@ struct Merge_v2r2_magic_division
     using LowLengthsScan =
         decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{}));
 
-    using UpLengths =
-        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
+    using UpLengths = decltype(make_tuple(container_product(LowLengths{})));
 
     using LowLengthsScanMagicDivisorMultipiler = decltype(generate_tuple(
         lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengthsScan>{},
@@ -1223,7 +1220,7 @@ struct Merge_v2r2_magic_division
           low_lengths_scan_magic_divisor_shift_{generate_tuple(
               [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths_scan_[i]); },
               Number<NDimLow>{})},
-          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
+          up_lengths_{make_tuple(container_product(low_lengths))}
     {
         static_assert(LowerIndex::Size() == NDimLow, "wrong!");
     }
@@ -1344,8 +1341,7 @@ struct Merge_v3_division_mod
     using LowLengthsScan =
         decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{}));
 
-    using UpLengths =
-        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
+    using UpLengths = decltype(make_tuple(container_product(LowLengths{})));
 
     LowLengths low_lengths_;
     LowLengthsScan low_lengths_scan_;
@@ -1357,7 +1353,7 @@ struct Merge_v3_division_mod
         : low_lengths_{low_lengths},
           low_lengths_scan_{
               container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})},
-          up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
+          up_lengths_{make_tuple(container_product(low_lengths))}
     {
         static_assert(LowerIndex::Size() == NDimLow, "wrong!");
     }
diff --git a/include/ck/tensor_description/tensor_adaptor.hpp b/include/ck/tensor_description/tensor_adaptor.hpp
index 79c5881d48a..55a44198b20 100644
--- a/include/ck/tensor_description/tensor_adaptor.hpp
+++ b/include/ck/tensor_description/tensor_adaptor.hpp
@@ -65,8 +65,7 @@ struct TensorAdaptor
             },
             Number<ndim_top_>{});
 
-        // TODO: make container_reduce support tuple of Number and index_t
-        return container_reduce(lengths, math::multiplies{}, Number<1>{});
+        return container_product(lengths);
     }
 
     template <index_t IDim>
diff --git a/include/ck/tensor_description/tensor_descriptor.hpp b/include/ck/tensor_description/tensor_descriptor.hpp
index c153c1f894a..4f827e51ea1 100644
--- a/include/ck/tensor_description/tensor_descriptor.hpp
+++ b/include/ck/tensor_description/tensor_descriptor.hpp
@@ -69,8 +69,7 @@ struct TensorDescriptor
             },
             Number<ndim_visible_>{});
 
-        // TODO: make container_reduce support tuple of Number and index_t
-        return container_reduce(lengths, math::multiplies{}, Number<1>{});
+        return container_product(lengths);
     }
 
     template <index_t IDim>
diff --git a/include/ck/utility/container_helper.hpp b/include/ck/utility/container_helper.hpp
index 8f2fe45796e..79cb657ec6d 100644
--- a/include/ck/utility/container_helper.hpp
+++ b/include/ck/utility/container_helper.hpp
@@ -189,6 +189,14 @@ __host__ __device__ constexpr auto container_reduce(const Container& x,
 }
 #endif
 
+// O(1) template depth alternative to container_reduce for computing products.
+// Uses fold expression via unpack instead of O(N) linear recursion.
+template <typename Container>
+__host__ __device__ constexpr auto container_product(const Container& x)
+{
+    return unpack([](auto... xs) { return (xs * ...); }, x);
+}
+
 template <typename TData, index_t NSize, typename Reduce>
 __host__ __device__ constexpr auto
 container_reverse_inclusive_scan(const Array<TData, NSize>& x, Reduce f, TData init)

From 02e42dcaa1a566a05994b950548b67bbb76cab67 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Date: Fri, 16 Jan 2026 01:23:21 -0600
Subject: [PATCH 3/3] Replace lambdas with named functors in container_concat
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lambdas create unique types per call site, causing duplicate template
instantiations. Named functors are shared across call sites.

Results:
- container_concat: 186 → 93 instantiations (50% reduction)
- Wall-clock: 518ms → 309ms (40% reduction)
---
 include/ck/utility/container_helper.hpp | 26 +++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/include/ck/utility/container_helper.hpp b/include/ck/utility/container_helper.hpp
index 79cb657ec6d..e09c32d20fc 100644
--- a/include/ck/utility/container_helper.hpp
+++ b/include/ck/utility/container_helper.hpp
@@ -324,6 +324,26 @@ container_reverse_inclusive_scan(const Tuple<Xs...>& x, Reduce f, TData init)
     return y;
 }
 
+// Named functors for container_concat to reduce template instantiations
+// (lambdas create unique types per call site, functors are shared)
+struct make_tuple_functor
+{
+    template <typename... Ts>
+    __host__ __device__ constexpr auto operator()(Ts&&... xs) const
+    {
+        return make_tuple(ck::forward<Ts>(xs)...);
+    }
+};
+
+struct make_array_functor
+{
+    template <typename T, typename... Ts>
+    __host__ __device__ constexpr auto operator()(T&& x, Ts&&... xs) const
+    {
+        return make_array(ck::forward<T>(x), ck::forward<Ts>(xs)...);
+    }
+};
+
 template <typename X, typename... Ys>
 __host__ __device__ constexpr auto container_concat(const X& x, const Ys&... ys)
 {
@@ -333,15 +353,13 @@ __host__ __device__ constexpr auto container_concat(const X& x, const Ys&... ys)
 template <typename T, index_t NX, index_t NY>
 __host__ __device__ constexpr auto container_concat(const Array<T, NX>& ax, const Array<T, NY>& ay)
 {
-    return unpack2(
-        [&](auto&&... zs) { return make_array(ck::forward<decltype(zs)>(zs)...); }, ax, ay);
+    return unpack2(make_array_functor{}, ax, ay);
 }
 
 template <typename... X, typename... Y>
 __host__ __device__ constexpr auto container_concat(const Tuple<X...>& tx, const Tuple<Y...>& ty)
 {
-    return unpack2(
-        [&](auto&&... zs) { return make_tuple(ck::forward<decltype(zs)>(zs)...); }, tx, ty);
+    return unpack2(make_tuple_functor{}, tx, ty);
 }
 
 template <typename Container>