From 94b9e4b6353e9c69c54b75ac78edcb30b6007381 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Thu, 15 Jan 2026 21:15:57 -0600 Subject: [PATCH 1/2] Optimize sequence_gen and uniform_sequence_gen using __make_integer_seq Replace recursive template instantiation with compiler intrinsic __make_integer_seq and pack expansion for O(1) instantiation depth. Before: Maximum nesting depth of 90 levels with recursive divide-and-conquer After: Maximum nesting depth of 26 levels using flat pack expansion Performance improvements measured on example_grouped_conv_fwd_xdl_fp16: - Template instantiation wall-clock time: 36.8s -> 18.7s (49% reduction) - Template instantiation cumulative time: 56.6s -> 25.8s (54% reduction) - Maximum nesting depth: 90 -> 26 (71% reduction) The key changes: - sequence_gen: Uses __make_integer_seq to generate indices 0..N-1, then applies functor F via pack expansion in a single step - uniform_sequence_gen: Uses __make_integer_seq with pack expansion to generate N copies of a constant value Co-Authored-By: Claude --- include/ck/utility/sequence.hpp | 74 +++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 32 deletions(-) diff --git a/include/ck/utility/sequence.hpp b/include/ck/utility/sequence.hpp index 6e68690048f..2bd4b19da9c 100644 --- a/include/ck/utility/sequence.hpp +++ b/include/ck/utility/sequence.hpp @@ -218,36 +218,32 @@ struct sequence_merge using type = Seq; }; -// generate sequence -template -struct sequence_gen -{ - template - struct sequence_gen_impl - { - static constexpr index_t NRemainLeft = NRemain / 2; - static constexpr index_t NRemainRight = NRemain - NRemainLeft; - static constexpr index_t IMiddle = IBegin + NRemainLeft; +// generate sequence - optimized using __make_integer_seq to avoid recursive instantiation +namespace detail { - using type = typename sequence_merge< - typename sequence_gen_impl::type, - typename sequence_gen_impl::type>::type; - }; +// Helper that 
applies functor F to indices and produces a Sequence +// __make_integer_seq produces sequence_gen_helper +template +struct sequence_gen_helper +{ + // Apply a functor F to all indices at once via pack expansion (O(1) depth) + template + using apply = Sequence{})...>; +}; - template - struct sequence_gen_impl - { - static constexpr index_t Is = G{}(Number{}); - using type = Sequence; - }; +} // namespace detail - template - struct sequence_gen_impl - { - using type = Sequence<>; - }; +template +struct sequence_gen +{ + using type = typename __make_integer_seq:: + template apply; +}; - using type = typename sequence_gen_impl<0, NSize, F>::type; +template +struct sequence_gen<0, F> +{ + using type = Sequence<>; }; // arithmetic sequence @@ -283,16 +279,30 @@ struct arithmetic_sequence_gen<0, IEnd, 1> using type = typename __make_integer_seq::type; }; -// uniform sequence +// uniform sequence - optimized using __make_integer_seq +namespace detail { + +template +struct uniform_sequence_helper +{ + // Apply a constant value to all indices via pack expansion + template + using apply = Sequence<((void)Is, Value)...>; +}; + +} // namespace detail + template struct uniform_sequence_gen { - struct F - { - __host__ __device__ constexpr index_t operator()(index_t) const { return I; } - }; + using type = typename __make_integer_seq:: + template apply; +}; - using type = typename sequence_gen::type; +template +struct uniform_sequence_gen<0, I> +{ + using type = Sequence<>; }; // reverse inclusive scan (with init) sequence From 3d46680be09aa99066a12f93c373b9d1efe5b49e Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Thu, 15 Jan 2026 21:32:18 -0600 Subject: [PATCH 2/2] Optimize sequence_merge using direct concatenation for small cases Replace linear recursive instantiation with direct pack expansion for 1-4 sequences, and binary tree reduction for larger cases. 
Before: O(N) depth for merging N sequences After: O(log N) depth with O(1) for up to 4 sequences This further reduces maximum nesting depth from 26 to 22 levels when combined with the previous sequence_gen optimization. Co-Authored-By: Claude --- include/ck/utility/sequence.hpp | 63 +++++++++++++++---- .../ck/utility/statically_indexed_array.hpp | 1 + 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/include/ck/utility/sequence.hpp b/include/ck/utility/sequence.hpp index 2bd4b19da9c..18bb36d112f 100644 --- a/include/ck/utility/sequence.hpp +++ b/include/ck/utility/sequence.hpp @@ -199,30 +199,71 @@ template using make_index_sequence = typename __make_integer_seq::seq_type; -// merge sequence -template -struct sequence_merge +// merge sequence - optimized to avoid recursive instantiation +namespace detail { + +// Helper to concatenate multiple sequences in one step using pack expansion +template +struct sequence_merge_impl; + +// Base case: single sequence +template +struct sequence_merge_impl> { - using type = typename sequence_merge::type>::type; + using type = Sequence; }; +// Two sequences: direct concatenation template -struct sequence_merge, Sequence> +struct sequence_merge_impl, Sequence> { using type = Sequence; }; -template -struct sequence_merge +// Three sequences: direct concatenation (avoids one level of recursion) +template +struct sequence_merge_impl, Sequence, Sequence> { - using type = Seq; + using type = Sequence; +}; + +// Four sequences: direct concatenation +template +struct sequence_merge_impl, Sequence, Sequence, Sequence> +{ + using type = Sequence; +}; + +// General case: binary tree reduction (O(log N) depth instead of O(N)) +template +struct sequence_merge_impl +{ + // Merge pairs first, then recurse + using left = typename sequence_merge_impl::type; + using right = typename sequence_merge_impl::type; + using type = typename sequence_merge_impl::type; +}; + +} // namespace detail + +template +struct sequence_merge +{ + using 
type = typename detail::sequence_merge_impl::type; +}; + +template <> +struct sequence_merge<> +{ + using type = Sequence<>; }; // generate sequence - optimized using __make_integer_seq to avoid recursive instantiation namespace detail { // Helper that applies functor F to indices and produces a Sequence -// __make_integer_seq produces sequence_gen_helper +// __make_integer_seq produces sequence_gen_helper template struct sequence_gen_helper { @@ -236,8 +277,8 @@ struct sequence_gen_helper template struct sequence_gen { - using type = typename __make_integer_seq:: - template apply; + using type = + typename __make_integer_seq::template apply; }; template diff --git a/include/ck/utility/statically_indexed_array.hpp b/include/ck/utility/statically_indexed_array.hpp index d0735a32f6d..f3d73e84a78 100644 --- a/include/ck/utility/statically_indexed_array.hpp +++ b/include/ck/utility/statically_indexed_array.hpp @@ -20,6 +20,7 @@ struct tuple_concat, Tuple> using type = Tuple; }; +// StaticallyIndexedArrayImpl uses binary split for O(log N) depth template struct StaticallyIndexedArrayImpl {