From 52fa8f6c2c67d9df7dff5b9dbfa1d3dd06cdd2a8 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Mon, 19 Jan 2026 13:17:42 -0600 Subject: [PATCH 1/3] Add build time optimization documentation --- BUILD_TIME_OPTIMIZATION.md | 247 +++++++++++++++++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 BUILD_TIME_OPTIMIZATION.md diff --git a/BUILD_TIME_OPTIMIZATION.md b/BUILD_TIME_OPTIMIZATION.md new file mode 100644 index 0000000000..77f94db5b1 --- /dev/null +++ b/BUILD_TIME_OPTIMIZATION.md @@ -0,0 +1,247 @@ +# Build Time Optimization + +This document describes techniques for reducing C++ template instantiation overhead in the Composable Kernel codebase. + +## Why Build Time Matters + +Composable Kernel relies heavily on C++ template metaprogramming to achieve GPU kernels with no runtime abstraction penalty. However, deep template instantiation can significantly impact build times. A single translation unit may trigger hundreds of thousands of template instantiations, with each instantiation adding to compile time. + +## Measuring Build Time + +Use Clang's `-ftime-trace` flag to generate JSON build traces: + +```bash +# Build with time trace enabled +cmake -DCMAKE_CXX_FLAGS="-ftime-trace -ftime-trace-granularity=1" .. +ninja example_gemm_xdl_fp16 + +# Find the trace file +find . -name "*.json" -path "*/CMakeFiles/*" +``` + +The trace file can be viewed in Chrome's `chrome://tracing` or analyzed with tools like [ClangBuildAnalyzer](https://github.com/aras-p/ClangBuildAnalyzer). 
+ +Key metrics to monitor: + +- **Template instantiation count**: Total number of unique template instantiations +- **Template instantiation depth**: Maximum recursion depth during instantiation +- **Wall-clock time**: Actual time spent instantiating templates + +The `script/tools/ck-build-analysis` script automates trace collection and analysis: + +```bash +script/tools/ck-build-analysis example_gemm_xdl_fp16 --granularity=1 +``` + +## Optimization Techniques + +### 1. Replace O(N) Recursion with O(1) Pack Expansion + +Recursive template patterns create O(N) instantiation depth. Use compiler intrinsics and fold expressions for O(1) depth. + +**Before** (O(N) recursive instantiation): + +```cpp +template +struct sequence_gen_impl +{ + using type = typename sequence_gen_impl{}), Is...>::type; +}; + +template +struct sequence_gen_impl<0, F, Is...> +{ + using type = Sequence; +}; +``` + +**After** (O(1) using compiler intrinsic): + +```cpp +template +struct sequence_gen +{ + template + static constexpr auto make(std::integer_sequence) + { + return Sequence{})...>{}; + } + using type = decltype(make(__make_integer_seq{})); +}; +``` + +The `__make_integer_seq` intrinsic (available in Clang and MSVC) generates integer sequences with O(1) template depth. + +### 2. Replace Lambdas with Named Functors + +Each lambda expression creates a unique closure type, causing separate template instantiations at every call site. 
+ +**Before** (lambda creates unique instantiations): + +```cpp +// Called in multiple places - each creates new instantiations +auto result = transform_tensor_descriptor( + desc, + make_tuple(make_pass_through_transform(Length)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + +// The lambda inside transform_tensor_descriptor: +generate_tuple([](auto i) { return Sequence{}; }, Number{}); +``` + +**After** (named functor shares instantiations): + +```cpp +// Define functor once +struct generate_identity_sequence +{ + template + __host__ __device__ constexpr auto operator()(Number) const + { + return Sequence{}; + } +}; + +// Use everywhere - shares instantiations +generate_tuple(generate_identity_sequence{}, Number{}); +``` + +This reduced `transform_tensor_descriptor` instantiations from 388 to 32 (92% reduction). + +#### container_concat optimization + +The same pattern applies to utility functions like `container_concat`: + +**Before**: + +```cpp +template +__host__ __device__ constexpr auto container_concat(const Tuple& tx, const Tuple& ty) +{ + return unpack2([](auto&&... zs) { return make_tuple(forward(zs)...); }, tx, ty); +} +``` + +**After**: + +```cpp +struct make_tuple_functor +{ + template + __host__ __device__ constexpr auto operator()(Ts&&... xs) const + { + return make_tuple(forward(xs)...); + } +}; + +template +__host__ __device__ constexpr auto container_concat(const Tuple& tx, const Tuple& ty) +{ + return unpack2(make_tuple_functor{}, tx, ty); +} +``` + +This reduced `container_concat` instantiations from 186 to 93 (50% reduction). 
+ +#### make_uniform_tuple helper + +For patterns that create tuples with repeated values, use dedicated helpers instead of lambdas: + +**Before**: + +```cpp +// Creates unique lambda type at each call site +generate_tuple([](auto) { return some_value; }, Number{}); +``` + +**After**: + +```cpp +// Defined once, shared across all call sites +template +__host__ __device__ constexpr auto make_uniform_tuple(T&& value) +{ + return detail::make_uniform_tuple_impl(static_cast(value), make_index_sequence{}); +} + +// Usage +make_uniform_tuple(some_value); +``` + +### 3. Use Constexpr Arrays Instead of Template Recursion + +Replace recursive template searches with constexpr functions using arrays. + +**Before** (O(N) recursive template search): + +```cpp +template +struct find_in_tuple_of_sequences_impl +{ + static constexpr index_t pos = sequence_find(FirstSeq{}); + static constexpr bool found_here = (pos >= 0); + + using next = find_in_tuple_of_sequences_impl; + + static constexpr index_t itran = found_here ? 0 : 1 + next::itran; + static constexpr index_t idim_up = found_here ? pos : next::idim_up; +}; +``` + +**After** (O(1) pack expansion with constexpr array): + +```cpp +template +struct FindInTupleOfSequencesCompute +{ + static constexpr auto compute() + { + if constexpr(sizeof...(Seqs) == 0) { + return ResultData{0, 0, false}; + } else { + // Pack expansion creates array - O(1) template depth + constexpr index_t indices[] = {sequence_find_value(Seqs{})...}; + for(index_t i = 0; i < sizeof...(Seqs); ++i) + if(indices[i] >= 0) return ResultData{i, indices[i], true}; + return ResultData{0, 0, false}; + } + } +}; +``` + +This reduced instantiations by 50% and wall-clock time by 69%. + +### 4. Avoid Unnecessary Template Parameter Variations + +Templates with many parameter combinations cause combinatorial explosion. 
+ +- Cache template results where possible +- Use type erasure for runtime-only variations +- Consider `if constexpr` to reduce branch template instantiations + +## Case Studies + +The following PRs demonstrate these techniques applied to Composable Kernel: + +- **sequence_gen optimization**: Replaced O(N) recursion with `__make_integer_seq` intrinsic +- **transform_tensor_descriptor**: Replaced lambdas with named functors (92% instantiation reduction) +- **container_concat**: Replaced lambdas with named functors (50% instantiation reduction) +- **find_in_tuple_of_sequences**: Replaced recursive search with pack expansion (50% reduction) +- **sequence_merge**: Replaced O(log N) recursion with O(1) fold expression + +See tracking issue [#3575](https://github.com/ROCm/composable_kernel/issues/3575) for the full list of PRs. + +## Tools and Commands + +Identify optimization targets: + +```bash +# Run analysis on a specific target +script/tools/ck-build-analysis example_convnd_fwd_xdl_fp16 --granularity=1 + +# View the generated report +cat build_time_analysis_report.md +``` + +The report shows template instantiation counts, wall-clock times, and identifies the most expensive templates. From 05d9befe90023b7f4cd270ddbfa1143f2c6e5b9b Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Mon, 19 Jan 2026 15:45:52 -0600 Subject: [PATCH 2/3] Document sequence_map_inverse and element_space_size optimizations Add documentation for: - sequence_map_inverse: O(N) to O(1) via pack expansion (95% time reduction) - calculate_element_space_size: fold expression (73% time reduction) Update case studies section with these optimizations. 
--- BUILD_TIME_OPTIMIZATION.md | 80 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/BUILD_TIME_OPTIMIZATION.md b/BUILD_TIME_OPTIMIZATION.md index 77f94db5b1..79517f2c4f 100644 --- a/BUILD_TIME_OPTIMIZATION.md +++ b/BUILD_TIME_OPTIMIZATION.md @@ -169,6 +169,84 @@ __host__ __device__ constexpr auto make_uniform_tuple(T&& value) make_uniform_tuple(some_value); ``` +#### sequence_map_inverse optimization + +The `sequence_map_inverse` template inverts a permutation sequence. The original implementation used O(N) recursive template instantiations. + +**Before** (O(N) recursive instantiation): + +```cpp +template +struct find_source_index_impl +{ + static constexpr index_t value = + (Seq::template At() == Target) ? Pos : find_source_index_impl::value; +}; +``` + +**After** (O(1) using constexpr array lookup): + +```cpp +namespace detail { +template +__host__ __device__ constexpr index_t find_source_index(Sequence) +{ + constexpr index_t values[] = {Is...}; + for(index_t i = 0; i < sizeof...(Is); ++i) + if(values[i] == Target) return i; + return 0; +} + +template +__host__ __device__ constexpr auto invert_permutation_impl(Sequence) +{ + return Sequence(SeqMap{})...>{}; +} +} // namespace detail + +template +struct sequence_map_inverse +{ + using type = decltype(detail::invert_permutation_impl( + typename arithmetic_sequence_gen<0, SeqMap::Size(), 1>::type{})); +}; +``` + +This reduced instantiations from 45 to 10 (78% reduction) and wall-clock time by 95%. + +#### calculate_element_space_size optimization + +Computing element space size for tensor descriptors can use a fold expression instead of recursive template instantiation. 
+ +**Before** (recursive or loop-based approach): + +```cpp +// Implicit recursion through generate_tuple and container_reduce +const auto element_space_size = container_reduce( + generate_tuple([&](auto i) { + return (lengths[i] - I1) * strides[i]; + }, Number{}), + math::plus{}, LongNumber<1>{}); +``` + +**After** (O(1) using fold expression): + +```cpp +namespace detail { +template +__host__ __device__ constexpr auto compute_element_space_size( + const Tuple& lengths, + const Tuple& strides, + Sequence) +{ + return (LongNumber<1>{} + ... + + ((lengths[Number{}] - Number<1>{}) * strides[Number{}])); +} +} // namespace detail +``` + +This reduced instantiations from 24 to 10 (58% reduction) and wall-clock time by 73%. + ### 3. Use Constexpr Arrays Instead of Template Recursion Replace recursive template searches with constexpr functions using arrays. @@ -227,6 +305,8 @@ The following PRs demonstrate these techniques applied to Composable Kernel: - **sequence_gen optimization**: Replaced O(N) recursion with `__make_integer_seq` intrinsic - **transform_tensor_descriptor**: Replaced lambdas with named functors (92% instantiation reduction) - **container_concat**: Replaced lambdas with named functors (50% instantiation reduction) +- **sequence_map_inverse**: Replaced O(N) recursion with pack expansion (78% instantiation, 95% time reduction) +- **calculate_element_space_size**: Replaced implicit recursion with fold expression (58% instantiation, 73% time reduction) - **find_in_tuple_of_sequences**: Replaced recursive search with pack expansion (50% reduction) - **sequence_merge**: Replaced O(log N) recursion with O(1) fold expression From 71413bddeb053514c8ffcf5314dd9bc812f62478 Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Mon, 19 Jan 2026 17:55:57 -0600 Subject: [PATCH 3/3] Address PR review comments for build time optimization doc Changes: - Move to include/ck/ (developer documentation, not customer-facing) - Add 
tracking issue link at top - Fix section structure (sequential numbering 1-5) - Remove mismatched transform_tensor_descriptor example - Clarify O(N) constexpr loop vs template recursion distinction - Remove "Case Studies" section (redundant with tracking issue) - Simplify examples for clarity --- BUILD_TIME_OPTIMIZATION.md | 327 -------------------------- include/ck/BUILD_TIME_OPTIMIZATION.md | 207 ++++++++++++++++ 2 files changed, 207 insertions(+), 327 deletions(-) delete mode 100644 BUILD_TIME_OPTIMIZATION.md create mode 100644 include/ck/BUILD_TIME_OPTIMIZATION.md diff --git a/BUILD_TIME_OPTIMIZATION.md b/BUILD_TIME_OPTIMIZATION.md deleted file mode 100644 index 79517f2c4f..0000000000 --- a/BUILD_TIME_OPTIMIZATION.md +++ /dev/null @@ -1,327 +0,0 @@ -# Build Time Optimization - -This document describes techniques for reducing C++ template instantiation overhead in the Composable Kernel codebase. - -## Why Build Time Matters - -Composable Kernel relies heavily on C++ template metaprogramming to achieve GPU kernels with no runtime abstraction penalty. However, deep template instantiation can significantly impact build times. A single translation unit may trigger hundreds of thousands of template instantiations, with each instantiation adding to compile time. - -## Measuring Build Time - -Use Clang's `-ftime-trace` flag to generate JSON build traces: - -```bash -# Build with time trace enabled -cmake -DCMAKE_CXX_FLAGS="-ftime-trace -ftime-trace-granularity=1" .. -ninja example_gemm_xdl_fp16 - -# Find the trace file -find . -name "*.json" -path "*/CMakeFiles/*" -``` - -The trace file can be viewed in Chrome's `chrome://tracing` or analyzed with tools like [ClangBuildAnalyzer](https://github.com/aras-p/ClangBuildAnalyzer). 
- -Key metrics to monitor: - -- **Template instantiation count**: Total number of unique template instantiations -- **Template instantiation depth**: Maximum recursion depth during instantiation -- **Wall-clock time**: Actual time spent instantiating templates - -The `script/tools/ck-build-analysis` script automates trace collection and analysis: - -```bash -script/tools/ck-build-analysis example_gemm_xdl_fp16 --granularity=1 -``` - -## Optimization Techniques - -### 1. Replace O(N) Recursion with O(1) Pack Expansion - -Recursive template patterns create O(N) instantiation depth. Use compiler intrinsics and fold expressions for O(1) depth. - -**Before** (O(N) recursive instantiation): - -```cpp -template -struct sequence_gen_impl -{ - using type = typename sequence_gen_impl{}), Is...>::type; -}; - -template -struct sequence_gen_impl<0, F, Is...> -{ - using type = Sequence; -}; -``` - -**After** (O(1) using compiler intrinsic): - -```cpp -template -struct sequence_gen -{ - template - static constexpr auto make(std::integer_sequence) - { - return Sequence{})...>{}; - } - using type = decltype(make(__make_integer_seq{})); -}; -``` - -The `__make_integer_seq` intrinsic (available in Clang and MSVC) generates integer sequences with O(1) template depth. - -### 2. Replace Lambdas with Named Functors - -Each lambda expression creates a unique closure type, causing separate template instantiations at every call site. 
- -**Before** (lambda creates unique instantiations): - -```cpp -// Called in multiple places - each creates new instantiations -auto result = transform_tensor_descriptor( - desc, - make_tuple(make_pass_through_transform(Length)), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0>{})); - -// The lambda inside transform_tensor_descriptor: -generate_tuple([](auto i) { return Sequence{}; }, Number{}); -``` - -**After** (named functor shares instantiations): - -```cpp -// Define functor once -struct generate_identity_sequence -{ - template - __host__ __device__ constexpr auto operator()(Number) const - { - return Sequence{}; - } -}; - -// Use everywhere - shares instantiations -generate_tuple(generate_identity_sequence{}, Number{}); -``` - -This reduced `transform_tensor_descriptor` instantiations from 388 to 32 (92% reduction). - -#### container_concat optimization - -The same pattern applies to utility functions like `container_concat`: - -**Before**: - -```cpp -template -__host__ __device__ constexpr auto container_concat(const Tuple& tx, const Tuple& ty) -{ - return unpack2([](auto&&... zs) { return make_tuple(forward(zs)...); }, tx, ty); -} -``` - -**After**: - -```cpp -struct make_tuple_functor -{ - template - __host__ __device__ constexpr auto operator()(Ts&&... xs) const - { - return make_tuple(forward(xs)...); - } -}; - -template -__host__ __device__ constexpr auto container_concat(const Tuple& tx, const Tuple& ty) -{ - return unpack2(make_tuple_functor{}, tx, ty); -} -``` - -This reduced `container_concat` instantiations from 186 to 93 (50% reduction). 
- -#### make_uniform_tuple helper - -For patterns that create tuples with repeated values, use dedicated helpers instead of lambdas: - -**Before**: - -```cpp -// Creates unique lambda type at each call site -generate_tuple([](auto) { return some_value; }, Number{}); -``` - -**After**: - -```cpp -// Defined once, shared across all call sites -template -__host__ __device__ constexpr auto make_uniform_tuple(T&& value) -{ - return detail::make_uniform_tuple_impl(static_cast(value), make_index_sequence{}); -} - -// Usage -make_uniform_tuple(some_value); -``` - -#### sequence_map_inverse optimization - -The `sequence_map_inverse` template inverts a permutation sequence. The original implementation used O(N) recursive template instantiations. - -**Before** (O(N) recursive instantiation): - -```cpp -template -struct find_source_index_impl -{ - static constexpr index_t value = - (Seq::template At() == Target) ? Pos : find_source_index_impl::value; -}; -``` - -**After** (O(1) using constexpr array lookup): - -```cpp -namespace detail { -template -__host__ __device__ constexpr index_t find_source_index(Sequence) -{ - constexpr index_t values[] = {Is...}; - for(index_t i = 0; i < sizeof...(Is); ++i) - if(values[i] == Target) return i; - return 0; -} - -template -__host__ __device__ constexpr auto invert_permutation_impl(Sequence) -{ - return Sequence(SeqMap{})...>{}; -} -} // namespace detail - -template -struct sequence_map_inverse -{ - using type = decltype(detail::invert_permutation_impl( - typename arithmetic_sequence_gen<0, SeqMap::Size(), 1>::type{})); -}; -``` - -This reduced instantiations from 45 to 10 (78% reduction) and wall-clock time by 95%. - -#### calculate_element_space_size optimization - -Computing element space size for tensor descriptors can use a fold expression instead of recursive template instantiation. 
- -**Before** (recursive or loop-based approach): - -```cpp -// Implicit recursion through generate_tuple and container_reduce -const auto element_space_size = container_reduce( - generate_tuple([&](auto i) { - return (lengths[i] - I1) * strides[i]; - }, Number{}), - math::plus{}, LongNumber<1>{}); -``` - -**After** (O(1) using fold expression): - -```cpp -namespace detail { -template -__host__ __device__ constexpr auto compute_element_space_size( - const Tuple& lengths, - const Tuple& strides, - Sequence) -{ - return (LongNumber<1>{} + ... + - ((lengths[Number{}] - Number<1>{}) * strides[Number{}])); -} -} // namespace detail -``` - -This reduced instantiations from 24 to 10 (58% reduction) and wall-clock time by 73%. - -### 3. Use Constexpr Arrays Instead of Template Recursion - -Replace recursive template searches with constexpr functions using arrays. - -**Before** (O(N) recursive template search): - -```cpp -template -struct find_in_tuple_of_sequences_impl -{ - static constexpr index_t pos = sequence_find(FirstSeq{}); - static constexpr bool found_here = (pos >= 0); - - using next = find_in_tuple_of_sequences_impl; - - static constexpr index_t itran = found_here ? 0 : 1 + next::itran; - static constexpr index_t idim_up = found_here ? pos : next::idim_up; -}; -``` - -**After** (O(1) pack expansion with constexpr array): - -```cpp -template -struct FindInTupleOfSequencesCompute -{ - static constexpr auto compute() - { - if constexpr(sizeof...(Seqs) == 0) { - return ResultData{0, 0, false}; - } else { - // Pack expansion creates array - O(1) template depth - constexpr index_t indices[] = {sequence_find_value(Seqs{})...}; - for(index_t i = 0; i < sizeof...(Seqs); ++i) - if(indices[i] >= 0) return ResultData{i, indices[i], true}; - return ResultData{0, 0, false}; - } - } -}; -``` - -This reduced instantiations by 50% and wall-clock time by 69%. - -### 4. 
Avoid Unnecessary Template Parameter Variations - -Templates with many parameter combinations cause combinatorial explosion. - -- Cache template results where possible -- Use type erasure for runtime-only variations -- Consider `if constexpr` to reduce branch template instantiations - -## Case Studies - -The following PRs demonstrate these techniques applied to Composable Kernel: - -- **sequence_gen optimization**: Replaced O(N) recursion with `__make_integer_seq` intrinsic -- **transform_tensor_descriptor**: Replaced lambdas with named functors (92% instantiation reduction) -- **container_concat**: Replaced lambdas with named functors (50% instantiation reduction) -- **sequence_map_inverse**: Replaced O(N) recursion with pack expansion (78% instantiation, 95% time reduction) -- **calculate_element_space_size**: Replaced implicit recursion with fold expression (58% instantiation, 73% time reduction) -- **find_in_tuple_of_sequences**: Replaced recursive search with pack expansion (50% reduction) -- **sequence_merge**: Replaced O(log N) recursion with O(1) fold expression - -See tracking issue [#3575](https://github.com/ROCm/composable_kernel/issues/3575) for the full list of PRs. - -## Tools and Commands - -Identify optimization targets: - -```bash -# Run analysis on a specific target -script/tools/ck-build-analysis example_convnd_fwd_xdl_fp16 --granularity=1 - -# View the generated report -cat build_time_analysis_report.md -``` - -The report shows template instantiation counts, wall-clock times, and identifies the most expensive templates. 
diff --git a/include/ck/BUILD_TIME_OPTIMIZATION.md b/include/ck/BUILD_TIME_OPTIMIZATION.md new file mode 100644 index 0000000000..c1b81141ea --- /dev/null +++ b/include/ck/BUILD_TIME_OPTIMIZATION.md @@ -0,0 +1,207 @@ +# Build Time Optimization + +Tracking issue: [#3575](https://github.com/ROCm/composable_kernel/issues/3575) + +This document describes techniques for reducing C++ template instantiation overhead in the Composable Kernel codebase. + +## Why Build Time Matters + +Composable Kernel relies heavily on C++ template metaprogramming to achieve GPU kernels with no runtime abstraction penalty. However, deep template instantiation can significantly impact build times. A single translation unit may trigger hundreds of thousands of template instantiations, with each instantiation adding to compile time. + +## Optimization Techniques + +### 1. Replace Recursive Templates with Pack Expansion + +Recursive template patterns create O(N) instantiation depth - the compiler must instantiate each level before proceeding to the next: + +``` +sequence_gen_impl<5, F> + → sequence_gen_impl<4, F> + → sequence_gen_impl<3, F> + → ... +``` + +Using `__make_integer_seq` (Clang/MSVC) combined with pack expansion reduces this to constant depth - the compiler generates the entire sequence in one step internally, without recursive template instantiation. 
+ +**Before** (O(N) recursive instantiation): + +```cpp +template +struct sequence_gen_impl +{ + using type = typename sequence_gen_impl{}), Is...>::type; +}; + +template +struct sequence_gen_impl<0, F, Is...> +{ + using type = Sequence; +}; +``` + +**After** (constant depth using compiler intrinsic + pack expansion): + +```cpp +namespace detail { + +template +struct sequence_gen_helper +{ + // Apply functor F to all indices via pack expansion + // F{}(Number<0>{}), F{}(Number<1>{}), ..., F{}(Number{}) + template + using apply = Sequence{})...>; +}; + +} // namespace detail + +template +struct sequence_gen +{ + // __make_integer_seq produces + // sequence_gen_helper with constant depth + using type = + typename __make_integer_seq::template apply; +}; +``` + +Note: While `std::make_integer_sequence` is the standard C++14 way to generate integer sequences, it only produces `std::integer_sequence`. We use `__make_integer_seq` directly because it accepts any template as its first argument, enabling this pattern where the helper class receives the index pack directly. + +### 2. Replace Lambdas with Named Functors + +Each lambda expression creates a unique closure type, causing separate template instantiations at every call site. Named functors share a single type across all uses. + +**Before** (lambda creates unique instantiations at each call site): + +```cpp +// The lambda inside transform_tensor_descriptor: +generate_tuple([](auto i) { return Sequence{}; }, Number{}); +``` + +**After** (named functor shares instantiations): + +```cpp +// Define functor once +struct generate_identity_sequence +{ + template + __host__ __device__ constexpr auto operator()(Number) const + { + return Sequence{}; + } +}; + +// Use everywhere - shares instantiations +generate_tuple(generate_identity_sequence{}, Number{}); +``` + +This reduced `transform_tensor_descriptor` instantiations from 388 to 32 (92% reduction). 
+ +**Example: container_concat** + +```cpp +// Before: lambda creates unique type per call site +// (unpack2 applies a functor to all elements from both tuples) +template +__host__ __device__ constexpr auto container_concat(const Tuple& tx, const Tuple& ty) +{ + return unpack2([](auto&&... zs) { return make_tuple(forward(zs)...); }, tx, ty); +} + +// After: named functor shares instantiations +struct make_tuple_functor +{ + template + __host__ __device__ constexpr auto operator()(Ts&&... xs) const + { + return make_tuple(forward(xs)...); + } +}; + +template +__host__ __device__ constexpr auto container_concat(const Tuple& tx, const Tuple& ty) +{ + return unpack2(make_tuple_functor{}, tx, ty); +} +``` + +This reduced `container_concat` instantiations from 186 to 93 (50% reduction). + +**Example: make_uniform_tuple** + +For patterns that create tuples with repeated values: + +```cpp +// Before: unique lambda type at each call site +generate_tuple([](auto) { return some_value; }, Number{}); + +// After: dedicated helper function +template +__host__ __device__ constexpr auto make_uniform_tuple(T&& value) +{ + return detail::make_uniform_tuple_impl(static_cast(value), make_index_sequence{}); +} + +// Usage +make_uniform_tuple(some_value); +``` + +### 3. Use Constexpr Loops Instead of Template Recursion + +Template recursion creates N template instantiations for N iterations. A constexpr loop executes at compile time but only requires a single template instantiation. While both are O(N) in complexity, constexpr loops are significantly faster because they avoid the overhead of template instantiation. + +**Before** (O(N) template instantiations): + +```cpp +template +struct find_source_index_impl +{ + static constexpr index_t value = + (Seq::template At() == Target) ? 
Pos : find_source_index_impl<Seq, Target, Pos + 1>::value;
+};
+```
+
+**After** (single instantiation with constexpr loop):
+
+```cpp
+template <index_t Target, index_t... Is>
+__host__ __device__ constexpr index_t find_source_index(Sequence<Is...>)
+{
+    constexpr index_t values[] = {Is...};
+    for(index_t i = 0; i < sizeof...(Is); ++i)
+        if(values[i] == Target) return i;
+    return 0;
+}
+```
+
+This reduced `sequence_map_inverse` instantiations from 45 to 10 (78% reduction) and wall-clock time by 95%.
+
+### 4. Use Fold Expressions for Accumulation
+
+Fold expressions (C++17) can replace recursive template patterns for accumulation operations.
+
+**Before** (implicit recursion through generate_tuple and container_reduce):
+
+```cpp
+const auto element_space_size = container_reduce(
+    generate_tuple([&](auto i) {
+        return (lengths[i] - I1) * strides[i];
+    }, Number<nDim>{}),
+    math::plus<long_index_t>{}, LongNumber<1>{});
+```
+
+**After** (single fold expression):
+
+```cpp
+template <typename... Lengths, typename... Strides, index_t... Is>
+__host__ __device__ constexpr auto compute_element_space_size(
+    const Tuple<Lengths...>& lengths,
+    const Tuple<Strides...>& strides,
+    Sequence<Is...>)
+{
+    return (LongNumber<1>{} + ... +
+            ((lengths[Number<Is>{}] - Number<1>{}) * strides[Number<Is>{}]));
+}
+```
+
+This reduced `calculate_element_space_size` instantiations from 24 to 10 (58% reduction) and wall-clock time by 73%.