diff --git a/include/ck/tensor_description/tensor_descriptor_helper.hpp b/include/ck/tensor_description/tensor_descriptor_helper.hpp
index 44ab0d90c3d..e03eb4cf21a 100644
--- a/include/ck/tensor_description/tensor_descriptor_helper.hpp
+++ b/include/ck/tensor_description/tensor_descriptor_helper.hpp
@@ -17,25 +17,18 @@ namespace ck {
  * functions on GPU without worrying about scratch memory usage.
  */
 
-#if CK_WORKAROUND_SWDEV_275126
-template <typename Lengths, typename Strides, index_t I, typename AccOld>
-__host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengths& lengths,
-                                                                     const Strides& strides,
-                                                                     Number<I> i,
-                                                                     AccOld acc_old)
+// O(1) template depth helper for element space size calculation using fold expression
+// Computes: 1 + sum((length[i] - 1) * stride[i]) for all i
+namespace detail {
+template <typename... Lengths, typename... Strides, index_t... Is>
+__host__ __device__ constexpr auto compute_element_space_size(const Tuple<Lengths...>& lengths,
+                                                              const Tuple<Strides...>& strides,
+                                                              Sequence<Is...>)
 {
-    auto acc_new = acc_old + (lengths[i] - Number<1>{}) * strides[i];
-
-    if constexpr(i.value < Lengths::Size() - 1)
-    {
-        return calculate_element_space_size_impl(lengths, strides, i + Number<1>{}, acc_new);
-    }
-    else
-    {
-        return acc_new;
-    }
+    return (LongNumber<1>{} + ... +
+            ((lengths[Number<Is>{}] - Number<1>{}) * strides[Number<Is>{}]));
 }
-#endif
+} // namespace detail
 
 // Lengths..., Strides...
could be:
 // 1) index_t, which is known at run-time, or
@@ -60,27 +53,8 @@ __host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple<Lengths...>& lengths,
 
     constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};
 
-#if !CK_WORKAROUND_SWDEV_275126
-    // rocm-4.1 compiler would crash for recursive labmda
-    // recursive function for reduction
-    auto f = [&](auto fs, auto i, auto acc_old) {
-        auto acc_new = acc_old + (lengths[i] - Number<1>{}) * strides[i];
-
-        if constexpr(i.value < N - 1)
-        {
-            return fs(fs, i + Number<1>{}, acc_new);
-        }
-        else
-        {
-            return acc_new;
-        }
-    };
-
-    const auto element_space_size = f(f, Number<0>{}, LongNumber<1>{});
-#else
-    const auto element_space_size =
-        calculate_element_space_size_impl(lengths, strides, Number<0>{}, LongNumber<1>{});
-#endif
+    const auto element_space_size = detail::compute_element_space_size(
+        lengths, strides, typename arithmetic_sequence_gen<0, N, 1>::type{});
 
     return TensorDescriptor<remove_cv_t<decltype(transforms)>,
                             remove_cv_t<decltype(low_dim_hidden_idss)>,
diff --git a/include/ck/utility/sequence.hpp b/include/ck/utility/sequence.hpp
index 6e68690048f..1bd84caf252 100644
--- a/include/ck/utility/sequence.hpp
+++ b/include/ck/utility/sequence.hpp
@@ -199,23 +199,69 @@ template <index_t N>
 using make_index_sequence = typename __make_integer_seq::seq_type;
 
-// merge sequence
-template <typename Seq, typename... Seqs>
+// Sequence merge using fold expressions - O(1) template instantiation depth
+//
+// Strategy: Use C++17 fold expressions with custom operator| to achieve true O(1) depth
+//
+// Why fold expressions achieve O(1) depth:
+// - Fold expression: (Seq1{} | Seq2{} | Seq3{} | ...) expands in a single step
+// - Compiler generates flat operation: operator|(operator|(Seq1, Seq2), Seq3), ...
+// - Single instantiation of operator| handles all pairs → O(1) template nesting depth
+//
+// Comparison with alternative approaches:
+//
+// 1. Recursive template (old approach):
+// - Depth: O(N) - each merge level requires template instantiation
+// - Example: merge<Seq, Seqs...> → merge<Seqs...>, merge<Seq, merge<Seqs...>> → multiple levels
+//
+// 2.
Binary tree reduction (alternative O(log N) approach):
+// - Depth: O(log N) - divides work in half each level
+// - Example: merge<A, B, C, D> → merge<AB, CD> → ABCD (2 levels for 4 sequences)
+// - Specializes small cases (1-4 sequences) for common scenarios
+//
+// 3. Fold expression (THIS approach):
+// - Depth: O(1) - single fold operation regardless of N
+// - Example: (A | B | C | D) → ABCD (1 level for any N)
+// - Requires C++17 and understanding of ADL for operator|
+//
+// Impact:
+// - Significantly reduces sequence_merge instantiation count vs recursive approach
+// - Maximum template nesting depth remains constant regardless of number of sequences
+// - Compilation time improvement proportional to instantiation reduction
+//
+// Trade-offs:
+// - Requires C++17 fold expressions
+// - Operator| must be found via ADL (Argument-Dependent Lookup) in ck namespace
+// - More abstract than explicit specializations (harder to debug)
+// - Uniform cost for all cases (no special-casing of common small N)
+//
+// Recommendation: Use fold expressions when:
+// - C++17 is available
+// - Maximum compilation performance is critical
+// - Code abstraction is acceptable
+//
+// Use binary tree (alternative approach) when:
+// - Need C++14 compatibility
+// - Want explicit control over small-case optimizations
+// - Debugging and code clarity are prioritized
+//
+template <index_t... Xs, index_t... Ys>
+constexpr Sequence<Xs..., Ys...> operator|(Sequence<Xs...>, Sequence<Ys...>)
+{
+    return {};
+}
+
+template <typename... Seqs>
 struct sequence_merge
 {
-    using type = typename sequence_merge<Seq, typename sequence_merge<Seqs...>::type>::type;
+    // Left fold: ((S1 | S2) | S3) | ...
- O(1) template depth
+    using type = decltype((Seqs{} | ...));
 };
 
-template <index_t... Xs, index_t... Ys>
-struct sequence_merge<Sequence<Xs...>, Sequence<Ys...>>
+template <>
+struct sequence_merge<>
 {
-    using type = Sequence<Xs..., Ys...>;
-};
-
-template <typename Seq>
-struct sequence_merge<Seq>
-{
-    using type = Seq;
+    using type = Sequence<>;
 };
 
 // generate sequence
@@ -525,31 +571,35 @@ struct is_valid_sequence_map : is_same
 
-template <typename SeqMap>
-struct sequence_map_inverse
+// O(1) template depth helper to find source index in permutation inversion
+// For a permutation X2Y, finds i such that X2Y[i] == Target
+namespace detail {
+template <index_t Target, index_t... Is>
+__host__ __device__ constexpr index_t find_source_index(Sequence<Is...>)
 {
-    template <typename WorkingY2X, index_t XBegin, index_t XRemain>
-    struct sequence_map_inverse_impl
+    constexpr index_t values[] = {Is...};
+    for(index_t i = 0; i < static_cast<index_t>(sizeof...(Is)); ++i)
     {
-        static constexpr auto new_y2x =
-            WorkingY2X::Modify(X2Y::At(Number<XBegin>{}), Number<XBegin>{});
-
-        using type =
-            typename sequence_map_inverse_impl<new_y2x, XBegin + 1, XRemain - 1>::
-                type;
-    };
+        if(values[i] == Target)
+            return i;
+    }
+    return 0; // should not reach for valid permutation
+}
 
-    template <typename WorkingY2X, index_t XBegin>
-    struct sequence_map_inverse_impl<WorkingY2X, XBegin, 0>
-    {
-        using type = WorkingY2X;
-    };
+template <typename SeqMap, index_t... Is>
+__host__ __device__ constexpr auto invert_permutation_impl(Sequence<Is...>)
+{
+    return Sequence<find_source_index<Is>(SeqMap{})...>{};
+}
+} // namespace detail
 
-    using type =
-        typename sequence_map_inverse_impl<typename arithmetic_sequence_gen<0, SeqMap::Size(), 1>::type,
-                                           0,
-                                           SeqMap::Size()>::type;
+// Invert a permutation sequence using O(1) template depth pack expansion
+// For X2Y = {a, b, c, ...}, computes Y2X where Y2X[X2Y[i]] = i
+template <typename SeqMap>
+struct sequence_map_inverse
+{
+    using type = decltype(detail::invert_permutation_impl<SeqMap>(
+        typename arithmetic_sequence_gen<0, SeqMap::Size(), 1>::type{}));
 };
 
 template
diff --git a/test/util/CMakeLists.txt b/test/util/CMakeLists.txt
index bf0a444f18b..ffa7c223027 100644
--- a/test/util/CMakeLists.txt
+++ b/test/util/CMakeLists.txt
@@ -5,3 +5,8 @@ add_gtest_executable(unit_sequence unit_sequence.cpp)
 if(result EQUAL 0)
   target_link_libraries(unit_sequence PRIVATE utility)
 endif()
+
+add_gtest_executable(unit_tensor_descriptor_helper unit_tensor_descriptor_helper.cpp)
+if(result EQUAL 0)
+  target_link_libraries(unit_tensor_descriptor_helper PRIVATE utility)
+endif()
diff --git a/test/util/unit_sequence.cpp b/test/util/unit_sequence.cpp
index f09fd86e063..b2705d3a88f 100644
--- a/test/util/unit_sequence.cpp
+++ b/test/util/unit_sequence.cpp
@@ -272,6 +272,34 @@ TEST(SequenceMerge, MergeSingleSequence)
     EXPECT_TRUE((is_same<Result, Expected>::value));
 }
 
+TEST(SequenceMerge, MergeEmptySequence)
+{
+    using Result = typename sequence_merge<>::type;
+    using Expected = Sequence<>;
+    EXPECT_TRUE((is_same<Result, Expected>::value));
+}
+
+TEST(SequenceMerge, MergeFourSequences)
+{
+    using Seq1 = Sequence<1>;
+    using Seq2 = Sequence<2, 3>;
+    using Seq3 = Sequence<4, 5, 6>;
+    using Seq4 = Sequence<7>;
+    using Result = typename sequence_merge<Seq1, Seq2, Seq3, Seq4>::type;
+    using Expected = Sequence<1, 2, 3, 4, 5, 6, 7>;
+    EXPECT_TRUE((is_same<Result, Expected>::value));
+}
+
+TEST(SequenceMerge, MergeWithEmptySequences)
+{
+    using Seq1 = Sequence<>;
+    using Seq2 = Sequence<1, 2>;
+    using Seq3 = Sequence<>;
+    using Result = typename sequence_merge<Seq1, Seq2, Seq3>::type;
+    using Expected = Sequence<1, 2>;
+    EXPECT_TRUE((is_same<Result, Expected>::value));
+}
+
 // Test sequence_split
 TEST(SequenceSplit, SplitInMiddle)
 {
@@ -437,6 +465,36 @@ TEST(SequenceMapInverse, InverseIdentityMap)
     EXPECT_EQ((Map::At(Number<Result::At(Number<3>{})>{}) == 3), true);
 }
 
+TEST(SequenceMapInverse, InverseReverseMap)
+{
+    // Map = <3, 2, 1, 0> means reverse: old[0]->new[3], old[1]->new[2], etc.
+    using Map = Sequence<3, 2, 1, 0>;
+    using Result = typename sequence_map_inverse<Map>::type;
+    // Result should also be <3, 2, 1, 0> since reversing twice gives original
+    EXPECT_TRUE((is_same<Result, Sequence<3, 2, 1, 0>>::value));
+}
+
+TEST(SequenceMapInverse, InverseSmallMap)
+{
+    using Map = Sequence<1, 0>;
+    using Result = typename sequence_map_inverse<Map>::type;
+    // Swapping 0 and 1, so inverse should also swap
+    EXPECT_TRUE((is_same<Result, Sequence<1, 0>>::value));
+}
+
+TEST(SequenceMapInverse, InverseLargerMap)
+{
+    // Map = <4, 2, 0, 3, 1> is a more complex permutation
+    using Map = Sequence<4, 2, 0, 3, 1>;
+    using Result = typename sequence_map_inverse<Map>::type;
+    // Verify property: Map[Result[i]] == i for all i
+    EXPECT_EQ((Map::At(Number<Result::At(Number<0>{})>{}) == 0), true);
+    EXPECT_EQ((Map::At(Number<Result::At(Number<1>{})>{}) == 1), true);
+    EXPECT_EQ((Map::At(Number<Result::At(Number<2>{})>{}) == 2), true);
+    EXPECT_EQ((Map::At(Number<Result::At(Number<3>{})>{}) == 3), true);
+    EXPECT_EQ((Map::At(Number<Result::At(Number<4>{})>{}) == 4), true);
+}
+
 // Test sequence operators
 TEST(SequenceOperators, Equality)
 {
diff --git a/test/util/unit_tensor_descriptor_helper.cpp b/test/util/unit_tensor_descriptor_helper.cpp
new file mode 100644
index 00000000000..5f4d9e08be0
--- /dev/null
+++ b/test/util/unit_tensor_descriptor_helper.cpp
@@ -0,0 +1,232 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <gtest/gtest.h>
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+
+using namespace ck;
+
+// Test make_naive_tensor_descriptor (public API)
+// Formula for element_space_size: 1 + sum((length[i] - 1) * stride[i])
+
+TEST(MakeNaiveTensorDescriptor, ElementSpaceSize2D)
+{
+    // 5x4 tensor with row-major strides [4, 1]
+    // element_space_size = 1 + (5-1)*4 + (4-1)*1 = 1 + 16 + 3 = 20
+    constexpr auto lengths = make_tuple(Number<5>{}, Number<4>{});
+    constexpr auto strides = make_tuple(Number<4>{}, Number<1>{});
+    constexpr auto desc = make_naive_tensor_descriptor(lengths, strides);
+    EXPECT_EQ(desc.GetElementSpaceSize(), 20);
+}
+
+TEST(MakeNaiveTensorDescriptor, ElementSpaceSize3D)
+{
+    // 3x8x11 tensor with strides [88, 11, 1]
+    // element_space_size = 1 + (3-1)*88 + (8-1)*11 + (11-1)*1 = 1 + 176 + 77 + 10 = 264
+    constexpr auto lengths = make_tuple(Number<3>{}, Number<8>{}, Number<11>{});
+    constexpr auto strides = make_tuple(Number<88>{}, Number<11>{}, Number<1>{});
+    constexpr auto desc = make_naive_tensor_descriptor(lengths, strides);
+    EXPECT_EQ(desc.GetElementSpaceSize(), 264);
+}
+
+TEST(MakeNaiveTensorDescriptor, BroadcastDimension)
+{
+    // 8x5 tensor with broadcast on first dimension (stride 0)
+    // element_space_size = 1 + (8-1)*0 + (5-1)*1 = 1 + 0 + 4 = 5
+    constexpr auto lengths = make_tuple(Number<8>{}, Number<5>{});
+    constexpr auto strides = make_tuple(Number<0>{}, Number<1>{});
+    constexpr auto desc = make_naive_tensor_descriptor(lengths, strides);
+    EXPECT_EQ(desc.GetElementSpaceSize(), 5);
+}
+
+TEST(MakeNaiveTensorDescriptor, BroadcastNonAdjacentDims7D)
+{
+    // 7D tensor: 2x3x5x7x11x13x17 with broadcast on dims 2 and 4 (non-adjacent)
+    // Underlying data is 2x3x7x13x17 = 9282 elements, broadcast over dims 2 and 4
+    // strides: [4641, 1547, 0, 221, 0, 17, 1]
+    // element_space_size = 1 + (2-1)*4641 + (3-1)*1547 + (5-1)*0 + (7-1)*221
+    //                        + (11-1)*0 + (13-1)*17 +
(17-1)*1 + // = 1 + 4641 + 3094 + 0 + 1326 + 0 + 204 + 16 = 9282 + constexpr auto lengths = make_tuple(Number<2>{}, + Number<3>{}, + Number<5>{}, + Number<7>{}, + Number<11>{}, + Number<13>{}, + Number<17>{}); + constexpr auto strides = make_tuple(Number<4641>{}, + Number<1547>{}, + Number<0>{}, + Number<221>{}, + Number<0>{}, + Number<17>{}, + Number<1>{}); + constexpr auto desc = make_naive_tensor_descriptor(lengths, strides); + EXPECT_EQ(desc.GetElementSpaceSize(), 9282); +} + +TEST(MakeNaiveTensorDescriptor, WithPaddingArbitrary) +{ + // 11x7x3 tensor with arbitrary strides [2, 97, 23] (prime numbers, no common factors) + // This tests padding where offsets don't correspond to any packed array + // element_space_size = 1 + (11-1)*2 + (7-1)*97 + (3-1)*23 = 1 + 20 + 582 + 46 = 649 + constexpr auto lengths = make_tuple(Number<11>{}, Number<7>{}, Number<3>{}); + constexpr auto strides = make_tuple(Number<2>{}, Number<97>{}, Number<23>{}); + constexpr auto desc = make_naive_tensor_descriptor(lengths, strides); + EXPECT_EQ(desc.GetElementSpaceSize(), 649); +} + +TEST(MakeNaiveTensorDescriptor, WithPaddingStrideSlice) +{ + // 2x3x5 tensor with strides [1, 7, 35] - like a slice from a 7x7xN column-major tensor + // This tests padding where there's space for extra elements + // element_space_size = 1 + (2-1)*1 + (3-1)*7 + (5-1)*35 = 1 + 1 + 14 + 140 = 156 + constexpr auto lengths = make_tuple(Number<2>{}, Number<3>{}, Number<5>{}); + constexpr auto strides = make_tuple(Number<1>{}, Number<7>{}, Number<35>{}); + constexpr auto desc = make_naive_tensor_descriptor(lengths, strides); + EXPECT_EQ(desc.GetElementSpaceSize(), 156); +} + +TEST(MakeNaiveTensorDescriptor, ColumnMajor) +{ + // 7x3 tensor with column-major strides [1, 7] + // element_space_size = 1 + (7-1)*1 + (3-1)*7 = 1 + 6 + 14 = 21 + constexpr auto lengths = make_tuple(Number<7>{}, Number<3>{}); + constexpr auto strides = make_tuple(Number<1>{}, Number<7>{}); + constexpr auto desc = 
make_naive_tensor_descriptor(lengths, strides); + EXPECT_EQ(desc.GetElementSpaceSize(), 21); +} + +// Test with runtime values (index_t instead of Number<>) +TEST(MakeNaiveTensorDescriptorRuntime, Simple2D) +{ + // 9x4 tensor with row-major strides + // element_space_size = 1 + (9-1)*4 + (4-1)*1 = 1 + 32 + 3 = 36 + const auto lengths = make_tuple(index_t{9}, index_t{4}); + const auto strides = make_tuple(index_t{4}, index_t{1}); + const auto desc = make_naive_tensor_descriptor(lengths, strides); + EXPECT_EQ(desc.GetElementSpaceSize(), 36); +} + +TEST(MakeNaiveTensorDescriptorRuntime, WithPadding) +{ + // 13x5x2 tensor with arbitrary strides (using prime numbers) + // element_space_size = 1 + (13-1)*2 + (5-1)*97 + (2-1)*23 = 1 + 24 + 388 + 23 = 436 + const auto lengths = make_tuple(index_t{13}, index_t{5}, index_t{2}); + const auto strides = make_tuple(index_t{2}, index_t{97}, index_t{23}); + const auto desc = make_naive_tensor_descriptor(lengths, strides); + EXPECT_EQ(desc.GetElementSpaceSize(), 436); +} + +// Test 1D tensors with explicit strides +TEST(MakeNaiveTensorDescriptor, ElementSpaceSize1D) +{ + // 13-element 1D tensor with stride 1 + constexpr auto lengths = make_tuple(Number<13>{}); + constexpr auto strides = make_tuple(Number<1>{}); + constexpr auto desc = make_naive_tensor_descriptor(lengths, strides); + EXPECT_EQ(desc.GetElementSpaceSize(), 13); +} + +TEST(MakeNaiveTensorDescriptor, ElementSpaceSize1DStrided) +{ + // 7-element 1D tensor with stride 3 (every 3rd element) + // element_space_size = 1 + (7-1)*3 = 19 + constexpr auto lengths = make_tuple(Number<7>{}); + constexpr auto strides = make_tuple(Number<3>{}); + constexpr auto desc = make_naive_tensor_descriptor(lengths, strides); + EXPECT_EQ(desc.GetElementSpaceSize(), 19); +} + +// Test make_naive_tensor_descriptor_packed (contiguous layout) +// element_space_size = product of all lengths +TEST(MakeNaiveTensorDescriptorPacked, Simple1D) +{ + // 17-element packed tensor - element_space_size = 17 
+ constexpr auto lengths = make_tuple(Number<17>{}); + constexpr auto desc = make_naive_tensor_descriptor_packed(lengths); + EXPECT_EQ(desc.GetElementSpaceSize(), 17); +} + +TEST(MakeNaiveTensorDescriptorPacked, Simple2D) +{ + // 6x5 packed tensor - element_space_size = 6*5 = 30 + constexpr auto lengths = make_tuple(Number<6>{}, Number<5>{}); + constexpr auto desc = make_naive_tensor_descriptor_packed(lengths); + EXPECT_EQ(desc.GetElementSpaceSize(), 30); +} + +TEST(MakeNaiveTensorDescriptorPacked, Simple3D) +{ + // 4x5x9 packed tensor - element_space_size = 4*5*9 = 180 + constexpr auto lengths = make_tuple(Number<4>{}, Number<5>{}, Number<9>{}); + constexpr auto desc = make_naive_tensor_descriptor_packed(lengths); + EXPECT_EQ(desc.GetElementSpaceSize(), 180); +} + +// Test make_naive_tensor_descriptor_aligned (stride alignment for memory access) +// Aligns the second-to-last stride to be a multiple of 'align' +TEST(MakeNaiveTensorDescriptorAligned, Align4) +{ + // 5x3 tensor aligned to 4 elements + // strides[1] = 1, strides[0] = integer_least_multiple(3, 4) = 4 + // element_space_size = 1 + (5-1)*4 + (3-1)*1 = 1 + 16 + 2 = 19 + constexpr auto lengths = make_tuple(Number<5>{}, Number<3>{}); + constexpr auto desc = make_naive_tensor_descriptor_aligned(lengths, Number<4>{}); + EXPECT_EQ(desc.GetElementSpaceSize(), 19); +} + +TEST(MakeNaiveTensorDescriptorAligned, Align8) +{ + // 3x5x7 tensor aligned to 8 elements + // strides[2] = 1, strides[1] = integer_least_multiple(7, 8) = 8, strides[0] = 5*8 = 40 + // element_space_size = 1 + (3-1)*40 + (5-1)*8 + (7-1)*1 = 1 + 80 + 32 + 6 = 119 + constexpr auto lengths = make_tuple(Number<3>{}, Number<5>{}, Number<7>{}); + constexpr auto desc = make_naive_tensor_descriptor_aligned(lengths, Number<8>{}); + EXPECT_EQ(desc.GetElementSpaceSize(), 119); +} + +// Test high-dimensional tensors (8D) to verify no integer overflow +TEST(MakeNaiveTensorDescriptorPacked, Simple8D) +{ + // 8D packed tensor with small prime dimensions: 
2x3x5x7x11x13x2x3 + // element_space_size = 2*3*5*7*11*13*2*3 = 180180 + constexpr auto lengths = make_tuple(Number<2>{}, + Number<3>{}, + Number<5>{}, + Number<7>{}, + Number<11>{}, + Number<13>{}, + Number<2>{}, + Number<3>{}); + constexpr auto desc = make_naive_tensor_descriptor_packed(lengths); + EXPECT_EQ(desc.GetElementSpaceSize(), 180180); +} + +TEST(MakeNaiveTensorDescriptor, ElementSpaceSize8D) +{ + // 8D tensor with permuted layout (non-monotonous strides): 2x3x5x7x11x13x17x19 + // Memory order: [dim7, dim3, dim1, dim5, dim2, dim0, dim6, dim4] + // This gives strides: [25935, 133, 5187, 19, 881790, 399, 51870, 1] + // Note: strides go up/down/up/down - not monotonously increasing or decreasing + // element_space_size = 2*3*5*7*11*13*17*19 = 9699690 + constexpr auto lengths = make_tuple(Number<2>{}, + Number<3>{}, + Number<5>{}, + Number<7>{}, + Number<11>{}, + Number<13>{}, + Number<17>{}, + Number<19>{}); + constexpr auto strides = make_tuple(Number<25935>{}, + Number<133>{}, + Number<5187>{}, + Number<19>{}, + Number<881790>{}, + Number<399>{}, + Number<51870>{}, + Number<1>{}); + constexpr auto desc = make_naive_tensor_descriptor(lengths, strides); + EXPECT_EQ(desc.GetElementSpaceSize(), 9699690); +}