From f53c7c636154047adc88303754050d9816229c31 Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Fri, 16 Jan 2026 17:00:53 +0100 Subject: [PATCH 1/4] ck-builder: remove SPATIAL_DIM parameter from ConvTensorLayouts This information is already in the SIGNATURE, so it's pointless to pass it separately. This streamlines the interface of those functions a bit. Also touches up the style of those files in general. --- .../factory/conv_bwd_weight_dl_factory.hpp | 2 +- ...onv_bwd_weight_multi_d_wmma_v3_factory.hpp | 2 +- .../conv_bwd_weight_multi_d_xdl_factory.hpp | 2 +- ...v_bwd_weight_two_stage_wmma_v3_factory.hpp | 2 +- .../conv_bwd_weight_two_stage_xdl_factory.hpp | 2 +- .../factory/conv_bwd_weight_wmma_factory.hpp | 2 +- .../conv_bwd_weight_wmma_v3_factory.hpp | 2 +- .../factory/conv_bwd_weight_xdl_factory.hpp | 2 +- .../conv_bwd_weight_xdl_v3_factory.hpp | 2 +- .../builder/factory/conv_fwd_dl_factory.hpp | 2 +- .../factory/conv_fwd_large_tensor_factory.hpp | 2 +- .../builder/factory/conv_fwd_v3_factory.hpp | 2 +- .../builder/factory/conv_fwd_wmma_factory.hpp | 2 +- .../builder/factory/conv_fwd_xdl_factory.hpp | 2 +- .../builder/factory/conv_tile_factory.hpp | 2 +- .../factory/helpers/ck/conv_tensor_layout.hpp | 47 +++++++++--------- .../ck_tile/conv_tile_tensor_layout.hpp | 49 ++++++++++--------- .../ck_tile/builder/testing/conv_fwd.hpp | 2 +- .../builder/test/unit_conv_tensor_layout.cpp | 32 ++++++------ 19 files changed, 83 insertions(+), 77 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_dl_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_dl_factory.hpp index fda1659c75f..e8aed8da511 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_dl_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_dl_factory.hpp @@ -23,7 +23,7 @@ template ; + using Layouts = internal::ConvTensorLayouts; using Types = internal::ConvTensorDataTypes; using Ops = internal::ConvElementwiseOps; using AlgorithmType = decltype(ALGORITHM); diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_multi_d_wmma_v3_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_multi_d_wmma_v3_factory.hpp index b02dea95589..24dcf05f3af 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_multi_d_wmma_v3_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_multi_d_wmma_v3_factory.hpp @@ -26,7 +26,7 @@ template ; + using Layouts = internal::ConvTensorLayouts; using Types = internal::ConvTensorDataTypes; using Ops = internal::ConvElementwiseOps; using AlgorithmType = decltype(ALGORITHM); diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_multi_d_xdl_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_multi_d_xdl_factory.hpp index 4f6812617aa..5cffdd87f09 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_multi_d_xdl_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_multi_d_xdl_factory.hpp @@ -26,7 +26,7 @@ template ; + using Layouts = internal::ConvTensorLayouts; using Types = internal::ConvTensorDataTypes; using Ops = internal::ConvElementwiseOps; using AlgorithmType = decltype(ALGORITHM); diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_two_stage_wmma_v3_factory.hpp
b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_two_stage_wmma_v3_factory.hpp index adf108bac48..7a391ab74fd 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_two_stage_wmma_v3_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_two_stage_wmma_v3_factory.hpp @@ -26,7 +26,7 @@ template ; + using Layouts = internal::ConvTensorLayouts; using Types = internal::ConvTensorDataTypes; using Ops = internal::ConvElementwiseOps; using AlgorithmType = decltype(ALGORITHM); diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_two_stage_xdl_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_two_stage_xdl_factory.hpp index d887c1c1ced..6a1daf6ef4d 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_two_stage_xdl_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_two_stage_xdl_factory.hpp @@ -26,7 +26,7 @@ template ; + using Layouts = internal::ConvTensorLayouts; using Types = internal::ConvTensorDataTypes; using Ops = internal::ConvElementwiseOps; using AlgorithmType = decltype(ALGORITHM); diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_wmma_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_wmma_factory.hpp index 4067845291f..3fa15856fae 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_wmma_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_wmma_factory.hpp @@ -26,7 +26,7 @@ template ; + using Layouts = internal::ConvTensorLayouts; using Types = internal::ConvTensorDataTypes; using Ops = internal::ConvElementwiseOps; using AlgorithmType = decltype(ALGORITHM); diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_wmma_v3_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_wmma_v3_factory.hpp index 027c8a1fba6..ab941eb9279 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_wmma_v3_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_wmma_v3_factory.hpp @@ -26,7 +26,7 @@ template ; + using Layouts = internal::ConvTensorLayouts; using Types = internal::ConvTensorDataTypes; using Ops = internal::ConvElementwiseOps; using AlgorithmType = decltype(ALGORITHM); diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_xdl_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_xdl_factory.hpp index fbb177f3337..46b1ab39651 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_xdl_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_xdl_factory.hpp @@ -26,7 +26,7 @@ template ; + using Layouts = internal::ConvTensorLayouts; using Types = internal::ConvTensorDataTypes; using Ops = internal::ConvElementwiseOps; using AlgorithmType = decltype(ALGORITHM); diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_xdl_v3_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_xdl_v3_factory.hpp index 66a47c54078..11f206483f9 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_xdl_v3_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_bwd_weight_xdl_v3_factory.hpp @@ -26,7 +26,7 @@ template ; + using Layouts = internal::ConvTensorLayouts; 
using Types = internal::ConvTensorDataTypes; using Ops = internal::ConvElementwiseOps; using AlgorithmType = decltype(ALGORITHM); diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_fwd_dl_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_fwd_dl_factory.hpp index 1d55772dd65..03989c9527e 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_fwd_dl_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_fwd_dl_factory.hpp @@ -24,7 +24,7 @@ template ; + using Layouts = internal::ConvTensorLayouts; using Types = internal::ConvTensorDataTypes; using Ops = internal::ConvElementwiseOps; using AlgorithmType = decltype(ALGORITHM); diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_fwd_large_tensor_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_fwd_large_tensor_factory.hpp index b80406c37e0..f7c98f244d1 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_fwd_large_tensor_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_fwd_large_tensor_factory.hpp @@ -26,7 +26,7 @@ template ; + using Layouts = internal::ConvTensorLayouts; using Types = internal::ConvTensorDataTypes; using Ops = internal::ConvElementwiseOps; using AlgorithmType = decltype(ALGORITHM); diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_fwd_v3_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_fwd_v3_factory.hpp index 74554df7e94..14266ad63f8 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_fwd_v3_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_fwd_v3_factory.hpp @@ -26,7 +26,7 @@ template ; + using Layouts = internal::ConvTensorLayouts; using Types = internal::ConvTensorDataTypes; using Ops = internal::ConvElementwiseOps; using AlgorithmType = decltype(ALGORITHM); diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_fwd_wmma_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_fwd_wmma_factory.hpp index cb36122f7cf..652b032a9b6 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_fwd_wmma_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_fwd_wmma_factory.hpp @@ -26,7 +26,7 @@ template ; + using Layouts = internal::ConvTensorLayouts; using Types = internal::ConvTensorDataTypes; using Ops = internal::ConvElementwiseOps; using AlgorithmType = decltype(ALGORITHM); diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_fwd_xdl_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_fwd_xdl_factory.hpp index b3be21f1f36..79bcd84981f 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_fwd_xdl_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_fwd_xdl_factory.hpp @@ -26,7 +26,7 @@ template ; + using Layouts = internal::ConvTensorLayouts; using Types = internal::ConvTensorDataTypes; using Ops = internal::ConvElementwiseOps; using AlgorithmType = decltype(ALGORITHM); diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_tile_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_tile_factory.hpp index 35c87b61cef..b1f9136eed6 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_tile_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_tile_factory.hpp @@ -29,7 +29,7 @@ template ; + using Layouts = internal::TileConvTensorLayouts; using Types = 
internal::TileConvTensorTypes; using Ops = internal::TileElementwiseOps; using AlgorithmType = decltype(ALGORITHM); diff --git a/experimental/builder/include/ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp b/experimental/builder/include/ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp index fd6de9ae21e..760106c1ae5 100644 --- a/experimental/builder/include/ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp @@ -172,10 +172,10 @@ struct LayoutToCK using type = ck::tensor_layout::convolution::GNDHWK; }; -template +template consteval auto TensorLayoutToCK() { - return typename LayoutToCK::type{}; + return typename LayoutToCK::type{}; } struct EmptyAuxiliaryTensorLayout @@ -183,49 +183,52 @@ struct EmptyAuxiliaryTensorLayout using type = ck::Tuple<>; }; -template +template consteval auto GetAuxiliaryTensorLayoutTuple(std::index_sequence) { return ck::Tuple< - decltype(TensorLayoutToCK())...>{}; + decltype(TensorLayoutToCK())...>{}; } -template +template requires(ConvSpatialDim) struct AuxiliaryTensorLayouts { - static constexpr auto Size = AuxiliaryTensorConfigsValue.size(); - using type = decltype(GetAuxiliaryTensorLayoutTuple( + static constexpr auto Size = AUXILIARY_TENSOR_CONFIGS_VALUE.size(); + using type = decltype(GetAuxiliaryTensorLayoutTuple( std::make_index_sequence{})); }; // TODO: Currently only the ouput tensor can have auxiliary tensors (e.g., bias). -template - requires(HasElementwiseOpWithAuxiliaryOperands) +template + requires HasElementwiseOpWithAuxiliaryOperands consteval auto GetAuxiliaryTensorLayouts() { - return AuxiliaryTensorLayouts{}; + return AuxiliaryTensorLayouts{}; } -template - requires(!HasElementwiseOpWithAuxiliaryOperands) +template + requires(!HasElementwiseOpWithAuxiliaryOperands) consteval auto GetAuxiliaryTensorLayouts() { return EmptyAuxiliaryTensorLayout{}; } -template - requires(ConvSpatialDim && - ValidConvInputLayoutForSpatialDim && - ValidConvWeightLayoutForSpatialDim && - ValidConvOutputLayoutForSpatialDim) +template + requires ConvSpatialDim && + ValidConvInputLayoutForSpatialDim && + ValidConvWeightLayoutForSpatialDim && + ValidConvOutputLayoutForSpatialDim struct ConvTensorLayouts { - using InLayout = decltype(TensorLayoutToCK()); - using WeiLayout = decltype(TensorLayoutToCK()); - using OutLayout = decltype(TensorLayoutToCK()); - using DsLayout = decltype(GetAuxiliaryTensorLayouts())::type; + using InLayout = decltype(TensorLayoutToCK()); + using WeiLayout = decltype(TensorLayoutToCK()); + using OutLayout = decltype(TensorLayoutToCK()); + using DsLayout = decltype(GetAuxiliaryTensorLayouts())::type; }; } // namespace ck_tile::builder::factory::internal diff --git a/experimental/builder/include/ck_tile/builder/factory/helpers/ck_tile/conv_tile_tensor_layout.hpp b/experimental/builder/include/ck_tile/builder/factory/helpers/ck_tile/conv_tile_tensor_layout.hpp index 2aaca985869..17615f84cc3 100644 --- a/experimental/builder/include/ck_tile/builder/factory/helpers/ck_tile/conv_tile_tensor_layout.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/helpers/ck_tile/conv_tile_tensor_layout.hpp @@ -9,10 +9,10 @@ namespace ck_tile::builder::factory::internal { using ALayout = ck_tile::tensor_layout::convolution::NWGC; -template +template struct LayoutToCKTile { - static_assert(sizeof(UnsupportedEnumValue) == 0, + static_assert(sizeof(UnsupportedEnumValue) == 0, "Unsupported layout conversion to CK."); }; @@ -152,49 +152,52 @@ 
struct EmptyAuxiliaryTileTensorLayout using type = ck_tile::tuple<>; }; -template +template consteval auto GetAuxiliaryTileTensorLayoutTuple(std::index_sequence) { return ck_tile::tuple< - decltype(TensorLayoutToCKTile())...>{}; + decltype(TensorLayoutToCKTile())...>{}; } -template - requires(ConvSpatialDim) +template + requires ConvSpatialDim struct AuxiliaryTileTensorLayouts { - static constexpr auto Size = AuxiliaryTileTensorConfigsValue.size(); - using type = decltype(GetAuxiliaryTileTensorLayoutTuple( + static constexpr auto Size = AUXILIARY_TILE_TENSOR_CONFIGS_VALUE.size(); + using type = decltype(GetAuxiliaryTileTensorLayoutTuple( std::make_index_sequence{})); }; // TODO: Currently only the ouput tensor can have auxiliary tensors (e.g., bias). -template - requires(HasElementwiseOpWithAuxiliaryOperands) +template + requires HasElementwiseOpWithAuxiliaryOperands consteval auto GetAuxiliaryTileTensorLayouts() { - return AuxiliaryTileTensorLayouts{}; + return AuxiliaryTileTensorLayouts{}; } -template - requires(!HasElementwiseOpWithAuxiliaryOperands) +template + requires(!HasElementwiseOpWithAuxiliaryOperands) consteval auto GetAuxiliaryTileTensorLayouts() { return EmptyAuxiliaryTileTensorLayout{}; } -template - requires(ConvSpatialDim && - ValidConvInputLayoutForSpatialDim && - ValidConvWeightLayoutForSpatialDim && - ValidConvOutputLayoutForSpatialDim) +template + requires ConvSpatialDim && + ValidConvInputLayoutForSpatialDim && + ValidConvWeightLayoutForSpatialDim && + ValidConvOutputLayoutForSpatialDim struct TileConvTensorLayouts { - using ALayout = decltype(TensorLayoutToCKTile()); - using BLayout = decltype(TensorLayoutToCKTile()); - using ELayout = decltype(TensorLayoutToCKTile()); - using DsLayout = decltype(GetAuxiliaryTileTensorLayouts())::type; + using ALayout = decltype(TensorLayoutToCKTile()); + using BLayout = decltype(TensorLayoutToCKTile()); + using ELayout = decltype(TensorLayoutToCKTile()); + using DsLayout = decltype(GetAuxiliaryTileTensorLayouts())::type; }; } // namespace ck_tile::builder::factory::internal diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp index dc2963edc2d..51edf41cba2 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp @@ -76,7 +76,7 @@ struct Args using Ops = factory::internal::ConvElementwiseOps; // TODO: We shouldn't need to call into an internal namespace here. 
- using Layouts = factory::internal::ConvTensorLayouts; + using Layouts = factory::internal::ConvTensorLayouts; ConvTensorLengths lengths; diff --git a/experimental/builder/test/unit_conv_tensor_layout.cpp b/experimental/builder/test/unit_conv_tensor_layout.cpp index 0df94d977e7..6d82248e082 100644 --- a/experimental/builder/test/unit_conv_tensor_layout.cpp +++ b/experimental/builder/test/unit_conv_tensor_layout.cpp @@ -38,7 +38,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor1D_NWGC_GKXC_NWGK) .weight = {.config = {.layout = GKXC}}, .output = {.config = {.layout = NWGK}}}; - using TensorLayouts = ConvTensorLayouts; + using TensorLayouts = ConvTensorLayouts; EXPECT_TRUE((std::is_same_v)); EXPECT_TRUE((std::is_same_v)); @@ -57,7 +57,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor1D_NGCW_GKXC_NGKW) .weight = {.config = {.layout = GKXC}}, .output = {.config = {.layout = NGKW}}}; - using TensorLayouts = ConvTensorLayouts; + using TensorLayouts = ConvTensorLayouts; EXPECT_TRUE((std::is_same_v)); EXPECT_TRUE((std::is_same_v)); @@ -76,7 +76,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor1D_GNWC_GKXC_GNWK) .weight = {.config = {.layout = GKXC}}, .output = {.config = {.layout = GNWK}}}; - using TensorLayouts = ConvTensorLayouts; + using TensorLayouts = ConvTensorLayouts; EXPECT_TRUE((std::is_same_v)); EXPECT_TRUE((std::is_same_v)); @@ -95,7 +95,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor1D_NGCW_GKCX_NGKW) .weight = {.config = {.layout = GKCX}}, .output = {.config = {.layout = NGKW}}}; - using TensorLayouts = ConvTensorLayouts; + using TensorLayouts = ConvTensorLayouts; EXPECT_TRUE((std::is_same_v)); EXPECT_TRUE((std::is_same_v)); @@ -114,7 +114,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor2D_NGCHW_GKYXC_NGKHW) .weight = {.config = {.layout = GKYXC}}, .output = {.config = {.layout = NGKHW}}}; - using TensorLayouts = ConvTensorLayouts; + using TensorLayouts = ConvTensorLayouts; EXPECT_TRUE((std::is_same_v)); EXPECT_TRUE((std::is_same_v)); @@ -133,7 +133,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor2D_NHWGC_GKYXC_NHWGK) .weight = {.config = {.layout = GKYXC}}, .output = {.config = {.layout = NHWGK}}}; - using TensorLayouts = ConvTensorLayouts; + using TensorLayouts = ConvTensorLayouts; EXPECT_TRUE((std::is_same_v)); EXPECT_TRUE((std::is_same_v)); @@ -152,7 +152,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor2D_GNHWC_GKYXC_GNHWK) .weight = {.config = {.layout = GKYXC}}, .output = {.config = {.layout = GNHWK}}}; - using TensorLayouts = ConvTensorLayouts; + using TensorLayouts = ConvTensorLayouts; EXPECT_TRUE((std::is_same_v)); EXPECT_TRUE((std::is_same_v)); @@ -171,7 +171,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor2D_NGCHW_GKCYX_NGKHW) .weight = {.config = {.layout = GKCYX}}, .output = {.config = {.layout = NGKHW}}}; - using TensorLayouts = ConvTensorLayouts; + using TensorLayouts = ConvTensorLayouts; EXPECT_TRUE((std::is_same_v)); EXPECT_TRUE((std::is_same_v)); @@ -190,7 +190,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor3D_NGCDHW_GKCZYX_NGKDHW) .weight = {.config = {.layout = GKCZYX}}, .output = {.config = {.layout = NGKDHW}}}; - using TensorLayouts = ConvTensorLayouts; + using TensorLayouts = ConvTensorLayouts; EXPECT_TRUE((std::is_same_v)); EXPECT_TRUE((std::is_same_v)); @@ -209,7 +209,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor3D_NDHWGC_GKZYXC_NDHWGK) .weight = {.config = {.layout = GKZYXC}}, .output = {.config = {.layout = NDHWGK}}}; - using TensorLayouts = ConvTensorLayouts; + using TensorLayouts = ConvTensorLayouts; EXPECT_TRUE((std::is_same_v)); EXPECT_TRUE((std::is_same_v)); @@ -228,7 +228,7 @@ 
TEST(ConvTensorLayout, AssignsLayoutsFor3D_GNDHWC_GKZYXC_GNDHWK) .weight = {.config = {.layout = GKZYXC}}, .output = {.config = {.layout = GNDHWK}}}; - using TensorLayouts = ConvTensorLayouts; + using TensorLayouts = ConvTensorLayouts; EXPECT_TRUE((std::is_same_v)); EXPECT_TRUE((std::is_same_v)); @@ -387,7 +387,7 @@ TEST(ConvTensorLayoutsWithAuxiliary, Conv2DWithSingleBiasG_K) .operation = OutputOp{.elementwise_operation = ElementwiseOperation::SCALE}}}; - using TensorLayouts = ConvTensorLayouts; + using TensorLayouts = ConvTensorLayouts; EXPECT_TRUE((std::is_same_v)); EXPECT_TRUE((std::is_same_v)); @@ -414,7 +414,7 @@ TEST(ConvTensorLayoutsWithAuxiliary, Conv2DWithSingleBiasGC) .operation = OutputOp{.elementwise_operation = ElementwiseOperation::SCALE}}}; - using TensorLayouts = ConvTensorLayouts; + using TensorLayouts = ConvTensorLayouts; EXPECT_TRUE((std::is_same_v)); EXPECT_TRUE((std::is_same_v)); @@ -442,7 +442,7 @@ TEST(ConvTensorLayoutsWithAuxiliary, Conv2DWithTwoAuxiliaryTensors) .operation = OutputOp{.elementwise_operation = ElementwiseOperation::SCALEADD_SCALEADD_RELU}}}; - using TensorLayouts = ConvTensorLayouts; + using TensorLayouts = ConvTensorLayouts; EXPECT_TRUE((std::is_same_v)); EXPECT_TRUE((std::is_same_v)); @@ -470,7 +470,7 @@ TEST(ConvTensorLayoutsWithAuxiliary, Conv1DWithBias) .operation = OutputOp{.elementwise_operation = ElementwiseOperation::SCALE}}}; - using TensorLayouts = ConvTensorLayouts; + using TensorLayouts = ConvTensorLayouts; EXPECT_TRUE((std::is_same_v)); EXPECT_TRUE((std::is_same_v)); @@ -497,7 +497,7 @@ TEST(ConvTensorLayoutsWithAuxiliary, Conv3DWithBias) .operation = OutputOp{.elementwise_operation = ElementwiseOperation::BIAS_BNORM_CLAMP}}}; - using TensorLayouts = ConvTensorLayouts; + using TensorLayouts = ConvTensorLayouts; EXPECT_TRUE((std::is_same_v)); EXPECT_TRUE((std::is_same_v)); From f0657f1c33debe6fd0e0b46bb1ea1d52f8ed9b04 Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Mon, 19 Jan 2026 12:14:42 +0100 Subject: [PATCH 2/4] ck-builder: implement reference conv using old ck The old ck implementation is more featureful and better tested. --- .../builder/factory/reference_common.hpp | 118 ---- .../builder/factory/reference_factory.hpp | 255 +++----- .../builder/testing/conv_fwd_reference.hpp | 47 +- .../validation/test_reference_execution.cpp | 550 +----------------- 4 files changed, 97 insertions(+), 873 deletions(-) delete mode 100644 experimental/builder/include/ck_tile/builder/factory/reference_common.hpp diff --git a/experimental/builder/include/ck_tile/builder/factory/reference_common.hpp b/experimental/builder/include/ck_tile/builder/factory/reference_common.hpp deleted file mode 100644 index 698ed43cb98..00000000000 --- a/experimental/builder/include/ck_tile/builder/factory/reference_common.hpp +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#pragma once - -#include "ck_tile/core.hpp" -#include "ck_tile/builder/conv_signature_concepts.hpp" -#include "ck_tile/builder/types.hpp" -#include - -namespace ck_tile::builder::factory::internal { - -// Validation helper: Ensure reference implementation only receives PassThrough elementwise ops -template -consteval void ValidateReferenceSignature() -{ - using namespace ck_tile::builder; - - // Check input elementwise operation - static_assert( - !HasTensorOp || - SIGNATURE.input.operation.elementwise_operation == ElementwiseOperation::PASS_THROUGH, - "Reference implementation does not support elementwise operations on input tensor. " - "Input operation must be PassThrough (or not specified)."); - - // Check weight elementwise operation - static_assert( - !HasTensorOp || - SIGNATURE.weight.operation.elementwise_operation == ElementwiseOperation::PASS_THROUGH, - "Reference implementation does not support elementwise operations on weight tensor. " - "Weight operation must be PassThrough (or not specified)."); - - // Check output elementwise operation - static_assert( - !HasTensorOp || - SIGNATURE.output.operation.elementwise_operation == ElementwiseOperation::PASS_THROUGH, - "Reference implementation does not support elementwise operations on output tensor. " - "Output operation must be PassThrough (or not specified)."); -} - -// Common argument structure for reference convolution implementations -// Template parameters allow different const qualifiers for each direction -template -struct ReferenceConvArgument -{ - InPtrType input_; - WeiPtrType weight_; - OutPtrType output_; - int G_, N_, K_, C_; - std::vector input_spatial_; - std::vector filter_spatial_; - std::vector output_spatial_; - std::vector strides_; - std::vector dilations_; - std::vector left_pads_; - - ReferenceConvArgument(InPtrType input, - WeiPtrType weight, - OutPtrType output, - int G, - int N, - int K, - int C, - const std::vector& input_spatial, - const std::vector& filter_spatial, - const std::vector& output_spatial, - const std::vector& strides, - const std::vector& dilations, - const std::vector& left_pads) - : input_(input), - weight_(weight), - output_(output), - G_(G), - N_(N), - K_(K), - C_(C), - input_spatial_(input_spatial), - filter_spatial_(filter_spatial), - output_spatial_(output_spatial), - strides_(strides), - dilations_(dilations), - left_pads_(left_pads) - { - } -}; - -// Common invoker structure for reference convolution implementations -// Takes a callable (lambda or function pointer) to execute the actual convolution -template -struct ReferenceConvInvoker -{ - ConvFunc conv_func_; - - explicit ReferenceConvInvoker(ConvFunc func) : conv_func_(func) {} - - float Run(const ArgumentType* arg, const StreamConfig& stream_config = StreamConfig{}) - { - (void)stream_config; // Unused for reference implementation - - conv_func_(arg->input_, - arg->weight_, - arg->output_, - arg->G_, - arg->N_, - arg->K_, - arg->C_, - arg->input_spatial_, - arg->filter_spatial_, - arg->output_spatial_, - arg->strides_, - arg->dilations_, - arg->left_pads_); - - return 0.0f; // Reference implementation doesn't track timing - } -}; - -} // namespace ck_tile::builder::factory::internal diff --git a/experimental/builder/include/ck_tile/builder/factory/reference_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/reference_factory.hpp index f6fc2dbda85..32f3ff7e6e5 100644 --- a/experimental/builder/include/ck_tile/builder/factory/reference_factory.hpp +++ 
b/experimental/builder/include/ck_tile/builder/factory/reference_factory.hpp @@ -3,15 +3,15 @@ #pragma once -#include "ck_tile/ref/naive_grouped_conv_fwd_gpu.hpp" -#include "ck_tile/ref/naive_grouped_conv_bwd_data_gpu.hpp" -#include "ck_tile/ref/naive_grouped_conv_bwd_weight_gpu.hpp" #include "ck_tile/builder/conv_signature_concepts.hpp" #include "ck_tile/builder/conv_algorithm_concepts.hpp" #include "ck_tile/builder/types.hpp" #include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp" -#include "ck_tile/builder/factory/reference_common.hpp" -#include "ck_tile/core.hpp" +#include "ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp" +#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp" +#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp" +#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp" +#include "ck/library/utility/convolution_parameter.hpp" #include namespace ck_tile::builder::factory { @@ -22,16 +22,23 @@ template struct ReferenceFactory { - // Validate that only PassThrough elementwise operations are specified - static constexpr auto kValidation = (internal::ValidateReferenceSignature(), 0); - static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim; - using Types = internal::ConvTensorDataTypes; + using Types = internal::ConvTensorDataTypes; using InDataType = typename Types::InDataType; using WeiDataType = typename Types::WeiDataType; using OutDataType = typename Types::OutDataType; + using Layouts = factory::internal::ConvTensorLayouts; + using InLayout = typename Layouts::InLayout; + using WeiLayout = typename Layouts::WeiLayout; + using OutLayout = typename Layouts::OutLayout; + + using Ops = factory::internal::ConvElementwiseOps; + using InElementwiseOp = typename Ops::InElementwiseOp; + using WeiElementwiseOp = typename Ops::WeiElementwiseOp; + using OutElementwiseOp = typename Ops::OutElementwiseOp; + struct Instance { // Store template parameters for InstanceTraits reflection @@ -39,163 +46,88 @@ struct ReferenceFactory static constexpr auto kAlgorithm = ALGORITHM; static constexpr auto kVersion = VERSION; - // Argument and Invoker types depend on direction - // Forward: const input, const weight, mutable output - // Backward Data: mutable input, const weight, const output_grad - // Backward Weight: const input, mutable weight_grad, const output_grad - - // Use appropriate Argument type based on direction - using Argument = std::conditional_t< - ConvDirectionIsForward, - internal::ReferenceConvArgument, - std::conditional_t< - ConvDirectionIsBackwardData, - internal:: - ReferenceConvArgument, - internal:: - ReferenceConvArgument>>; - - // Invoker calls the appropriate reference implementation based on direction - struct Invoker - { - float Run(const Argument* arg, const StreamConfig& stream_config = StreamConfig{}) - { - (void)stream_config; // Unused for reference implementation - - if constexpr(ConvDirectionIsForward) - { - ck_tile:: - naive_grouped_conv_fwd( - arg->input_, - arg->weight_, - arg->output_, - arg->G_, - arg->N_, - arg->K_, - arg->C_, - arg->input_spatial_, - arg->filter_spatial_, - arg->output_spatial_, - arg->strides_, - arg->dilations_, - arg->left_pads_); - } - else if constexpr(ConvDirectionIsBackwardData) - { - ck_tile::naive_grouped_conv_bwd_data(arg->input_, - arg->weight_, - arg->output_, - arg->G_, - arg->N_, - arg->K_, - arg->C_, - arg->input_spatial_, - arg->filter_spatial_, - arg->output_spatial_, - arg->strides_, - arg->dilations_, - 
arg->left_pads_); - } - else if constexpr(ConvDirectionIsBackwardWeight) - { - ck_tile::naive_grouped_conv_bwd_weight(arg->input_, - arg->weight_, - arg->output_, - arg->G_, - arg->N_, - arg->K_, - arg->C_, - arg->input_spatial_, - arg->filter_spatial_, - arg->output_spatial_, - arg->strides_, - arg->dilations_, - arg->left_pads_); - } - - return 0.0f; // Reference implementation doesn't track timing - } - }; - - // Direct Run method (simpler interface, direction-agnostic) + /// @brief Invoke reference convolution + /// + /// This is the primary overload to invoke reference convolution. As the underlying + /// function requires it, this function accepts ConvParam directly. template static void Run(InPtrType* input, WeiPtrType* weight, OutPtrType* output, - int G, - int N, - int K, - int C, - const std::vector& input_spatial, - const std::vector& filter_spatial, - const std::vector& output_spatial, - const std::vector& strides, - const std::vector& dilations, - const std::vector& left_pads) + const ck::utils::conv::ConvParam& param, + InElementwiseOp in_op = InElementwiseOp{}, + WeiElementwiseOp wei_op = WeiElementwiseOp{}, + OutElementwiseOp out_op = OutElementwiseOp{}) { if constexpr(ConvDirectionIsForward) { - ck_tile::naive_grouped_conv_fwd( + ck::ref::naive_conv_fwd( static_cast(input), static_cast(weight), static_cast(output), - G, - N, - K, - C, - input_spatial, - filter_spatial, - output_spatial, - strides, - dilations, - left_pads); + param, + in_op, + wei_op, + out_op); } else if constexpr(ConvDirectionIsBackwardData) { - ck_tile:: - naive_grouped_conv_bwd_data( - static_cast(input), - static_cast(weight), - static_cast(output), - G, - N, - K, - C, - input_spatial, - filter_spatial, - output_spatial, - strides, - dilations, - left_pads); + ck::ref::naive_conv_bwd_data( + static_cast(input), + static_cast(weight), + static_cast(output), + param, + in_op, + wei_op, + out_op); } else if constexpr(ConvDirectionIsBackwardWeight) { - ck_tile::naive_grouped_conv_bwd_weight( + ck::ref::naive_conv_bwd_weight( static_cast(input), static_cast(weight), static_cast(output), - G, - N, - K, - C, - input_spatial, - filter_spatial, - output_spatial, - strides, - dilations, - left_pads); + param, + in_op, + wei_op, + out_op); } } + /// @brief Invoke reference convolution + /// + /// Convenience overload to avoid having to construct ConvParam manually. 
+ template + static void Run(InPtrType* input, + WeiPtrType* weight, + OutPtrType* output, + int G, + int N, + int K, + int C, + const std::vector& input_spatial, + const std::vector& filter_spatial, + const std::vector& strides, + const std::vector& dilations, + const std::vector& left_pads, + const std::vector& right_pads) + { + Run(input, + weight, + output, + ck::utils::conv::ConvParam(SPATIAL_DIM, + G, + N, + K, + C, + filter_spatial, + input_spatial, + strides, + dilations, + left_pads, + right_pads)); + } + std::string GetTypeString() const { std::string dir_str; @@ -209,41 +141,6 @@ struct ReferenceFactory return std::string("GPU_Reference_") + dir_str + "_" + std::to_string(SPATIAL_DIM) + "D"; } - - // Old CK interface: Create argument pointer - template - std::unique_ptr - MakeArgumentPointer(InPtrType input, - WeiPtrType weight, - OutPtrType output, - int G, - int N, - int K, - int C, - const std::vector& input_spatial, - const std::vector& filter_spatial, - const std::vector& output_spatial, - const std::vector& strides, - const std::vector& dilations, - const std::vector& left_pads) const - { - return std::make_unique(input, - weight, - output, - G, - N, - K, - C, - input_spatial, - filter_spatial, - output_spatial, - strides, - dilations, - left_pads); - } - - // Old CK interface: Create invoker pointer - std::unique_ptr MakeInvokerPointer() const { return std::make_unique(); } }; }; diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp index 6401c6a5d57..ff276f7c9c5 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp @@ -32,27 +32,8 @@ concept RefConvInstance = requires(Conv& conv, const void* input, const void* weight, void* output, - int G, - int N, - int K, - int C, - std::vector dims) { - { - conv.Run(input, - weight, - output, - G, - N, - K, - C, - dims, // input_spatial - dims, // filter_spatial - dims, // output_spatial - dims, // strides - dims, // dilations - dims // left_pads - ) - }; + ck::utils::conv::ConvParam param) { + { conv.Run(input, weight, output, param) }; }; /// @brief `run()` specialization for forward convolution and the reference @@ -84,16 +65,6 @@ std::tuple run(RefConvInstance auto& conv, // Just throw for now, but regard these as TODO items that should be resolved // eventually. - // Right pads are not supported right now for some reason. 
- for(auto right_pad : param.input_right_pads_) - { - if(right_pad != 0) - { - std::cout << "TODO: Support right pad in reference conv" << std::endl; - return std::make_tuple(false, 0.0f); - } - } - if(!args.make_input_descriptor().is_packed()) { std::cout << "TODO: Support non-packed input tensor in reference conv" << std::endl; @@ -110,19 +81,7 @@ std::tuple run(RefConvInstance auto& conv, return std::make_tuple(false, 0.0f); } - conv.Run(inputs.input, - inputs.weight, - outputs.output, - param.G_, - param.N_, - param.K_, - param.C_, - param.input_spatial_lengths_, - param.filter_spatial_lengths_, - param.output_spatial_lengths_, - param.conv_filter_strides_, - param.conv_filter_dilations_, - param.input_left_pads_); + conv.Run(inputs.input, inputs.weight, outputs.output, param); return std::make_tuple(true, 0.0f); } diff --git a/experimental/builder/test/validation/test_reference_execution.cpp b/experimental/builder/test/validation/test_reference_execution.cpp index 29f9acacd3b..f31825a9767 100644 --- a/experimental/builder/test/validation/test_reference_execution.cpp +++ b/experimental/builder/test/validation/test_reference_execution.cpp @@ -53,10 +53,10 @@ TEST(ReferenceExecution, Forward_2D_FP16) // Prepare parameters for Run() std::vector input_spatial{H, W}; std::vector filter_spatial{3, 3}; - std::vector output_spatial{H, W}; std::vector strides{1, 1}; std::vector dilations{1, 1}; std::vector left_pads{1, 1}; + std::vector right_pads{1, 1}; RefKernel ref_kernel; ref_kernel.Run(reinterpret_cast(in_dev.GetDeviceBuffer()), @@ -68,10 +68,10 @@ TEST(ReferenceExecution, Forward_2D_FP16) C, input_spatial, filter_spatial, - output_spatial, strides, dilations, - left_pads); + left_pads, + right_pads); // If we get here, Run() worked! std::cout << "✓ Reference Forward kernel executed!" << std::endl; @@ -109,10 +109,10 @@ TEST(ReferenceExecution, BackwardData_2D_FP16) std::vector input_spatial{H, W}; std::vector filter_spatial{3, 3}; - std::vector output_spatial{H, W}; std::vector strides{1, 1}; std::vector dilations{1, 1}; std::vector left_pads{1, 1}; + std::vector right_pads{1, 1}; RefKernel ref_kernel; ref_kernel.Run(reinterpret_cast(in_grad_dev.GetDeviceBuffer()), @@ -124,10 +124,10 @@ TEST(ReferenceExecution, BackwardData_2D_FP16) C, input_spatial, filter_spatial, - output_spatial, strides, dilations, - left_pads); + left_pads, + right_pads); std::cout << "✓ Reference Backward Data kernel executed!" << std::endl; EXPECT_TRUE(true); @@ -164,10 +164,10 @@ TEST(ReferenceExecution, BackwardWeight_2D_FP16) std::vector input_spatial{H, W}; std::vector filter_spatial{3, 3}; - std::vector output_spatial{H, W}; std::vector strides{1, 1}; std::vector dilations{1, 1}; std::vector left_pads{1, 1}; + std::vector right_pads{1, 1}; RefKernel ref_kernel; ref_kernel.Run(reinterpret_cast(in_dev.GetDeviceBuffer()), @@ -179,204 +179,15 @@ TEST(ReferenceExecution, BackwardWeight_2D_FP16) C, input_spatial, filter_spatial, - output_spatial, strides, dilations, - left_pads); + left_pads, + right_pads); std::cout << "✓ Reference Backward Weight kernel executed!" 
<< std::endl; EXPECT_TRUE(true); } -// Test the old CK interface: MakeArgumentPointer + MakeInvokerPointer -TEST(ReferenceExecution, BackwardData_2D_FP16_InvokerInterface) -{ - constexpr ConvSignature sig{.spatial_dim = 2, - .direction = ConvDirection::BACKWARD_DATA, - .data_type = DataType::FP16, - .accumulation_data_type = DataType::FP32, - .input = {.config = {.layout = TensorLayout::NHWGC}}, - .weight = {.config = {.layout = TensorLayout::GKYXC}}, - .output = {.config = {.layout = TensorLayout::NHWGK}}}; - - constexpr auto ref_alg = ConvAlgorithm_Reference{}; - using RefKernel = ConvBuilder::Instance; - - const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3; - - const size_t in_grad_size = G * N * C * H * W * sizeof(ck::half_t); - const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t); - const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t); - - ck::DeviceMem in_grad_dev(in_grad_size); - ck::DeviceMem wei_dev(wei_size); - ck::DeviceMem out_grad_dev(out_grad_size); - - in_grad_dev.SetZero(); - wei_dev.SetZero(); - out_grad_dev.SetZero(); - - std::vector input_spatial{H, W}; - std::vector filter_spatial{3, 3}; - std::vector output_spatial{H, W}; - std::vector strides{1, 1}; - std::vector dilations{1, 1}; - std::vector left_pads{1, 1}; - - RefKernel ref_kernel; - - // TEST: Use the old CK interface (MakeArgumentPointer + MakeInvokerPointer) - auto argument_ptr = ref_kernel.MakeArgumentPointer( - reinterpret_cast(in_grad_dev.GetDeviceBuffer()), - reinterpret_cast(wei_dev.GetDeviceBuffer()), - reinterpret_cast(out_grad_dev.GetDeviceBuffer()), - G, - N, - K, - C, - input_spatial, - filter_spatial, - output_spatial, - strides, - dilations, - left_pads); - - auto invoker_ptr = ref_kernel.MakeInvokerPointer(); - - // Run using invoker - float time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); - - std::cout << "✓ Reference Backward Data kernel executed via Invoker interface!" 
<< std::endl; - std::cout << " (time = " << time << " ms)" << std::endl; - EXPECT_TRUE(true); -} - -// Test the old CK interface for Forward convolution -TEST(ReferenceExecution, Forward_2D_FP16_InvokerInterface) -{ - constexpr ConvSignature sig{.spatial_dim = 2, - .direction = ConvDirection::FORWARD, - .data_type = DataType::FP16, - .accumulation_data_type = DataType::FP32, - .input = {.config = {.layout = TensorLayout::GNHWC}}, - .weight = {.config = {.layout = TensorLayout::GKYXC}}, - .output = {.config = {.layout = TensorLayout::GNHWK}}}; - - constexpr auto ref_alg = ConvAlgorithm_Reference{}; - using RefKernel = ConvBuilder::Instance; - - const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3; - - const size_t in_size = G * N * C * H * W * sizeof(ck::half_t); - const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t); - const size_t out_size = G * N * K * H * W * sizeof(ck::half_t); - - ck::DeviceMem in_dev(in_size); - ck::DeviceMem wei_dev(wei_size); - ck::DeviceMem out_dev(out_size); - - in_dev.SetZero(); - wei_dev.SetZero(); - out_dev.SetZero(); - - std::vector input_spatial{H, W}; - std::vector filter_spatial{3, 3}; - std::vector output_spatial{H, W}; - std::vector strides{1, 1}; - std::vector dilations{1, 1}; - std::vector left_pads{1, 1}; - - RefKernel ref_kernel; - - // TEST: Use the old CK interface (MakeArgumentPointer + MakeInvokerPointer) - auto argument_ptr = ref_kernel.MakeArgumentPointer( - reinterpret_cast(in_dev.GetDeviceBuffer()), - reinterpret_cast(wei_dev.GetDeviceBuffer()), - reinterpret_cast(out_dev.GetDeviceBuffer()), - G, - N, - K, - C, - input_spatial, - filter_spatial, - output_spatial, - strides, - dilations, - left_pads); - - auto invoker_ptr = ref_kernel.MakeInvokerPointer(); - - // Run using invoker - float time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); - - std::cout << "✓ Reference Forward kernel executed via Invoker interface!" 
<< std::endl; - std::cout << " (time = " << time << " ms)" << std::endl; - EXPECT_TRUE(true); -} - -// Test the old CK interface for Backward Weight convolution -TEST(ReferenceExecution, BackwardWeight_2D_FP16_InvokerInterface) -{ - constexpr ConvSignature sig{.spatial_dim = 2, - .direction = ConvDirection::BACKWARD_WEIGHT, - .data_type = DataType::FP16, - .accumulation_data_type = DataType::FP32, - .input = {.config = {.layout = TensorLayout::GNHWC}}, - .weight = {.config = {.layout = TensorLayout::GKYXC}}, - .output = {.config = {.layout = TensorLayout::GNHWK}}}; - - constexpr auto ref_alg = ConvAlgorithm_Reference{}; - using RefKernel = ConvBuilder::Instance; - - const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3; - - const size_t in_size = G * N * C * H * W * sizeof(ck::half_t); - const size_t wei_grad_size = G * K * C * 3 * 3 * sizeof(ck::half_t); - const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t); - - ck::DeviceMem in_dev(in_size); - ck::DeviceMem wei_grad_dev(wei_grad_size); - ck::DeviceMem out_grad_dev(out_grad_size); - - in_dev.SetZero(); - wei_grad_dev.SetZero(); - out_grad_dev.SetZero(); - - std::vector input_spatial{H, W}; - std::vector filter_spatial{3, 3}; - std::vector output_spatial{H, W}; - std::vector strides{1, 1}; - std::vector dilations{1, 1}; - std::vector left_pads{1, 1}; - - RefKernel ref_kernel; - - // TEST: Use the old CK interface (MakeArgumentPointer + MakeInvokerPointer) - auto argument_ptr = ref_kernel.MakeArgumentPointer( - reinterpret_cast(in_dev.GetDeviceBuffer()), - reinterpret_cast(wei_grad_dev.GetDeviceBuffer()), - reinterpret_cast(out_grad_dev.GetDeviceBuffer()), - G, - N, - K, - C, - input_spatial, - filter_spatial, - output_spatial, - strides, - dilations, - left_pads); - - auto invoker_ptr = ref_kernel.MakeInvokerPointer(); - - // Run using invoker - float time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); - - std::cout << "✓ Reference Backward Weight kernel executed via Invoker interface!" << std::endl; - std::cout << " (time = " << time << " ms)" << std::endl; - EXPECT_TRUE(true); -} - // Test Builder Reference vs Direct GPU Reference with RANDOM INPUT TEST(ReferenceExecution, Forward_2D_FP16_Builder_vs_DirectGPUReference_Random) { @@ -434,6 +245,7 @@ TEST(ReferenceExecution, Forward_2D_FP16_Builder_vs_DirectGPUReference_Random) std::vector strides{1, 1}; std::vector dilations{1, 1}; std::vector left_pads{1, 1}; + std::vector right_pads{1, 1}; RefKernel builder_kernel; @@ -447,10 +259,10 @@ TEST(ReferenceExecution, Forward_2D_FP16_Builder_vs_DirectGPUReference_Random) C, input_spatial, filter_spatial, - output_spatial, strides, dilations, - left_pads); + left_pads, + right_pads); // Run 2: Direct GPU Reference (same kernel the Builder calls internally!) 
ck_tile::naive_grouped_conv_fwd<2, ck::half_t, ck::half_t, ck::half_t>( @@ -542,6 +354,7 @@ TEST(ReferenceExecution, BackwardData_2D_FP16_Builder_vs_DirectGPUReference_Rand std::vector strides{1, 1}; std::vector dilations{1, 1}; std::vector left_pads{1, 1}; + std::vector right_pads{1, 1}; RefKernel builder_kernel; @@ -555,10 +368,10 @@ TEST(ReferenceExecution, BackwardData_2D_FP16_Builder_vs_DirectGPUReference_Rand C, input_spatial, filter_spatial, - output_spatial, strides, dilations, - left_pads); + left_pads, + right_pads); // Run 2: Direct GPU Reference ck_tile::naive_grouped_conv_bwd_data<2, ck::half_t, ck::half_t, ck::half_t>( @@ -648,6 +461,7 @@ TEST(ReferenceExecution, BackwardWeight_2D_FP16_Builder_vs_DirectGPUReference_Ra std::vector strides{1, 1}; std::vector dilations{1, 1}; std::vector left_pads{1, 1}; + std::vector right_pads{1, 1}; RefKernel builder_kernel; @@ -661,10 +475,10 @@ TEST(ReferenceExecution, BackwardWeight_2D_FP16_Builder_vs_DirectGPUReference_Ra C, input_spatial, filter_spatial, - output_spatial, strides, dilations, - left_pads); + left_pads, + right_pads); // Run 2: Direct GPU Reference ck_tile::naive_grouped_conv_bwd_weight<2, ck::half_t, ck::half_t, ck::half_t>( @@ -700,332 +514,4 @@ TEST(ReferenceExecution, BackwardWeight_2D_FP16_Builder_vs_DirectGPUReference_Ra EXPECT_TRUE(pass); } -// Test Invoker Interface vs Direct GPU Reference with RANDOM INPUT - Forward -TEST(ReferenceExecution, Forward_2D_FP16_InvokerInterface_vs_DirectGPUReference_Random) -{ - constexpr ConvSignature sig{.spatial_dim = 2, - .direction = ConvDirection::FORWARD, - .data_type = DataType::FP16, - .accumulation_data_type = DataType::FP32, - .input = {.config = {.layout = TensorLayout::NHWGC}}, - .weight = {.config = {.layout = TensorLayout::GKYXC}}, - .output = {.config = {.layout = TensorLayout::NHWGK}}}; - - constexpr auto ref_alg = ConvAlgorithm_Reference{}; - using RefKernel = ConvBuilder::Instance; - - const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14; - - const size_t in_size = G * N * C * H * W * sizeof(ck::half_t); - const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t); - const size_t out_size = G * N * K * H * W * sizeof(ck::half_t); - - const size_t in_elements = G * N * C * H * W; - const size_t wei_elements = G * K * C * 3 * 3; - const size_t out_elements = G * N * K * H * W; - - std::vector in_host(in_elements); - std::vector wei_host(wei_elements); - - std::srand(12348); - for(size_t i = 0; i < in_elements; i++) - { - in_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); - } - for(size_t i = 0; i < wei_elements; i++) - { - wei_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); - } - - ck::DeviceMem in_dev(in_size); - ck::DeviceMem wei_dev(wei_size); - ck::DeviceMem out_invoker_dev(out_size); - ck::DeviceMem out_naive_dev(out_size); - - in_dev.ToDevice(in_host.data()); - wei_dev.ToDevice(wei_host.data()); - out_invoker_dev.SetZero(); - out_naive_dev.SetZero(); - - std::vector input_spatial{H, W}; - std::vector filter_spatial{3, 3}; - std::vector output_spatial{H, W}; - std::vector strides{1, 1}; - std::vector dilations{1, 1}; - std::vector left_pads{1, 1}; - - RefKernel builder_kernel; - - // Run 1: Builder Invoker Interface - auto argument_ptr = builder_kernel.MakeArgumentPointer( - reinterpret_cast(in_dev.GetDeviceBuffer()), - reinterpret_cast(wei_dev.GetDeviceBuffer()), - reinterpret_cast(out_invoker_dev.GetDeviceBuffer()), - G, - N, - K, - C, - input_spatial, - filter_spatial, - output_spatial, - strides, - 
dilations, - left_pads); - - auto invoker_ptr = builder_kernel.MakeInvokerPointer(); - invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); - - // Run 2: Direct GPU Reference - ck_tile::naive_grouped_conv_fwd<2, ck::half_t, ck::half_t, ck::half_t>( - reinterpret_cast(in_dev.GetDeviceBuffer()), - reinterpret_cast(wei_dev.GetDeviceBuffer()), - reinterpret_cast(out_naive_dev.GetDeviceBuffer()), - G, - N, - K, - C, - input_spatial, - filter_spatial, - output_spatial, - strides, - dilations, - left_pads); - - // Compare - std::vector out_invoker_result(out_elements); - std::vector out_naive_result(out_elements); - out_invoker_dev.FromDevice(out_invoker_result.data()); - out_naive_dev.FromDevice(out_naive_result.data()); - - bool pass = ck::utils::check_err(out_invoker_result, - out_naive_result, - "Error: Invoker Interface != Direct GPU Reference", - 1e-6, - 1e-6); - - std::cout << "✓ Invoker Interface vs Direct GPU Reference (RANDOM - Forward)!" << std::endl; - std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl; - EXPECT_TRUE(pass); -} - -// Test Invoker Interface vs Direct GPU Reference with RANDOM INPUT - Backward Data -TEST(ReferenceExecution, BackwardData_2D_FP16_InvokerInterface_vs_DirectGPUReference_Random) -{ - constexpr ConvSignature sig{.spatial_dim = 2, - .direction = ConvDirection::BACKWARD_DATA, - .data_type = DataType::FP16, - .accumulation_data_type = DataType::FP32, - .input = {.config = {.layout = TensorLayout::NHWGC}}, - .weight = {.config = {.layout = TensorLayout::GKYXC}}, - .output = {.config = {.layout = TensorLayout::NHWGK}}}; - - constexpr auto ref_alg = ConvAlgorithm_Reference{}; - using RefKernel = ConvBuilder::Instance; - - const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14; - - const size_t in_grad_size = G * N * C * H * W * sizeof(ck::half_t); - const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t); - const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t); - - const size_t in_grad_elements = G * N * C * H * W; - const size_t wei_elements = G * K * C * 3 * 3; - const size_t out_grad_elements = G * N * K * H * W; - - std::vector wei_host(wei_elements); - std::vector out_grad_host(out_grad_elements); - - std::srand(12349); - for(size_t i = 0; i < wei_elements; i++) - { - wei_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); - } - for(size_t i = 0; i < out_grad_elements; i++) - { - out_grad_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); - } - - ck::DeviceMem in_grad_invoker_dev(in_grad_size); - ck::DeviceMem in_grad_naive_dev(in_grad_size); - ck::DeviceMem wei_dev(wei_size); - ck::DeviceMem out_grad_dev(out_grad_size); - - wei_dev.ToDevice(wei_host.data()); - out_grad_dev.ToDevice(out_grad_host.data()); - in_grad_invoker_dev.SetZero(); - in_grad_naive_dev.SetZero(); - - std::vector input_spatial{H, W}; - std::vector filter_spatial{3, 3}; - std::vector output_spatial{H, W}; - std::vector strides{1, 1}; - std::vector dilations{1, 1}; - std::vector left_pads{1, 1}; - - RefKernel builder_kernel; - - // Run 1: Builder Invoker Interface - auto argument_ptr = builder_kernel.MakeArgumentPointer( - reinterpret_cast(in_grad_invoker_dev.GetDeviceBuffer()), - reinterpret_cast(wei_dev.GetDeviceBuffer()), - reinterpret_cast(out_grad_dev.GetDeviceBuffer()), - G, - N, - K, - C, - input_spatial, - filter_spatial, - output_spatial, - strides, - dilations, - left_pads); - - auto invoker_ptr = builder_kernel.MakeInvokerPointer(); - invoker_ptr->Run(argument_ptr.get(), 
StreamConfig{nullptr, false}); - - // Run 2: Direct GPU Reference - ck_tile::naive_grouped_conv_bwd_data<2, ck::half_t, ck::half_t, ck::half_t>( - reinterpret_cast(in_grad_naive_dev.GetDeviceBuffer()), - reinterpret_cast(wei_dev.GetDeviceBuffer()), - reinterpret_cast(out_grad_dev.GetDeviceBuffer()), - G, - N, - K, - C, - input_spatial, - filter_spatial, - output_spatial, - strides, - dilations, - left_pads); - - // Compare - std::vector in_grad_invoker_result(in_grad_elements); - std::vector in_grad_naive_result(in_grad_elements); - in_grad_invoker_dev.FromDevice(in_grad_invoker_result.data()); - in_grad_naive_dev.FromDevice(in_grad_naive_result.data()); - - bool pass = - ck::utils::check_err(in_grad_invoker_result, - in_grad_naive_result, - "Error: Invoker Interface != Direct GPU Reference (Backward Data)", - 1e-6, - 1e-6); - - std::cout << "✓ Invoker Interface vs Direct GPU Reference (RANDOM - Backward Data)!" - << std::endl; - std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl; - EXPECT_TRUE(pass); -} - -// Test Invoker Interface vs Direct GPU Reference with RANDOM INPUT - Backward Weight -TEST(ReferenceExecution, BackwardWeight_2D_FP16_InvokerInterface_vs_DirectGPUReference_Random) -{ - constexpr ConvSignature sig{.spatial_dim = 2, - .direction = ConvDirection::BACKWARD_WEIGHT, - .data_type = DataType::FP16, - .accumulation_data_type = DataType::FP32, - .input = {.config = {.layout = TensorLayout::NHWGC}}, - .weight = {.config = {.layout = TensorLayout::GKYXC}}, - .output = {.config = {.layout = TensorLayout::NHWGK}}}; - - constexpr auto ref_alg = ConvAlgorithm_Reference{}; - using RefKernel = ConvBuilder::Instance; - - const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14; - - const size_t in_size = G * N * C * H * W * sizeof(ck::half_t); - const size_t wei_grad_size = G * K * C * 3 * 3 * sizeof(ck::half_t); - const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t); - - const size_t in_elements = G * N * C * H * W; - const size_t wei_grad_elements = G * K * C * 3 * 3; - const size_t out_grad_elements = G * N * K * H * W; - - std::vector in_host(in_elements); - std::vector out_grad_host(out_grad_elements); - - std::srand(12350); - for(size_t i = 0; i < in_elements; i++) - { - in_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); - } - for(size_t i = 0; i < out_grad_elements; i++) - { - out_grad_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); - } - - ck::DeviceMem in_dev(in_size); - ck::DeviceMem wei_grad_invoker_dev(wei_grad_size); - ck::DeviceMem wei_grad_naive_dev(wei_grad_size); - ck::DeviceMem out_grad_dev(out_grad_size); - - in_dev.ToDevice(in_host.data()); - out_grad_dev.ToDevice(out_grad_host.data()); - wei_grad_invoker_dev.SetZero(); - wei_grad_naive_dev.SetZero(); - - std::vector input_spatial{H, W}; - std::vector filter_spatial{3, 3}; - std::vector output_spatial{H, W}; - std::vector strides{1, 1}; - std::vector dilations{1, 1}; - std::vector left_pads{1, 1}; - - RefKernel builder_kernel; - - // Run 1: Builder Invoker Interface - auto argument_ptr = builder_kernel.MakeArgumentPointer( - reinterpret_cast(in_dev.GetDeviceBuffer()), - reinterpret_cast(wei_grad_invoker_dev.GetDeviceBuffer()), - reinterpret_cast(out_grad_dev.GetDeviceBuffer()), - G, - N, - K, - C, - input_spatial, - filter_spatial, - output_spatial, - strides, - dilations, - left_pads); - - auto invoker_ptr = builder_kernel.MakeInvokerPointer(); - invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); - - // Run 2: 
Direct GPU Reference - ck_tile::naive_grouped_conv_bwd_weight<2, ck::half_t, ck::half_t, ck::half_t>( - reinterpret_cast(in_dev.GetDeviceBuffer()), - reinterpret_cast(wei_grad_naive_dev.GetDeviceBuffer()), - reinterpret_cast(out_grad_dev.GetDeviceBuffer()), - G, - N, - K, - C, - input_spatial, - filter_spatial, - output_spatial, - strides, - dilations, - left_pads); - - // Compare - std::vector wei_grad_invoker_result(wei_grad_elements); - std::vector wei_grad_naive_result(wei_grad_elements); - wei_grad_invoker_dev.FromDevice(wei_grad_invoker_result.data()); - wei_grad_naive_dev.FromDevice(wei_grad_naive_result.data()); - - bool pass = - ck::utils::check_err(wei_grad_invoker_result, - wei_grad_naive_result, - "Error: Invoker Interface != Direct GPU Reference (Backward Weight)", - 1e-6, - 1e-6); - - std::cout << "✓ Invoker Interface vs Direct GPU Reference (RANDOM - Backward Weight)!" - << std::endl; - std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl; - EXPECT_TRUE(pass); -} - } // namespace From 34abf6f87cb08e4f7d9a85934109239748f94f18 Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Mon, 19 Jan 2026 13:04:22 +0100 Subject: [PATCH 3/4] ck-builder: replace test_reference_execution reference with old ck This strips out the ck-tile gpu reference implementation completely. --- .../validation/test_reference_execution.cpp | 99 ++++++++++++------- 1 file changed, 63 insertions(+), 36 deletions(-) diff --git a/experimental/builder/test/validation/test_reference_execution.cpp b/experimental/builder/test/validation/test_reference_execution.cpp index f31825a9767..856df494e00 100644 --- a/experimental/builder/test/validation/test_reference_execution.cpp +++ b/experimental/builder/test/validation/test_reference_execution.cpp @@ -4,10 +4,10 @@ #include "ck_tile/builder/conv_builder.hpp" #include "ck_tile/builder/types.hpp" #include "impl/conv_algorithm_types.hpp" -#include "ck_tile/ref/naive_grouped_conv_fwd_gpu.hpp" -#include "ck_tile/ref/naive_grouped_conv_bwd_data_gpu.hpp" -#include "ck_tile/ref/naive_grouped_conv_bwd_weight_gpu.hpp" #include "utils/ckb_conv_test_configs.hpp" +#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp" +#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp" +#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/check_err.hpp" #include @@ -265,20 +265,29 @@ TEST(ReferenceExecution, Forward_2D_FP16_Builder_vs_DirectGPUReference_Random) right_pads); // Run 2: Direct GPU Reference (same kernel the Builder calls internally!) 
From ae4243b3e124cfd37e2a5c6c9af16180edbfdcb3 Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Mon, 19 Jan 2026 13:13:44 +0100
Subject: [PATCH 4/4] ck-builder: clean up test_reference_execution

- Remove unnecessary messages
- Replace EXPECT_TRUE(true) with EXPECT_NO_THROW()

---
 .../validation/test_reference_execution.cpp   | 121 +++++++-----------
 .../test_reference_instance_traits.cpp        |   6 -
 2 files changed, 47 insertions(+), 80 deletions(-)

diff --git a/experimental/builder/test/validation/test_reference_execution.cpp b/experimental/builder/test/validation/test_reference_execution.cpp
index 856df494e00..0aa656ae55d 100644
--- a/experimental/builder/test/validation/test_reference_execution.cpp
+++ b/experimental/builder/test/validation/test_reference_execution.cpp
@@ -59,23 +59,19 @@ TEST(ReferenceExecution, Forward_2D_FP16)
     std::vector<ck::index_t> right_pads{1, 1};
 
     RefKernel ref_kernel;
-    ref_kernel.Run(reinterpret_cast<ck::half_t*>(in_dev.GetDeviceBuffer()),
-                   reinterpret_cast<ck::half_t*>(wei_dev.GetDeviceBuffer()),
-                   reinterpret_cast<ck::half_t*>(out_dev.GetDeviceBuffer()),
-                   G,
-                   N,
-                   K,
-                   C,
-                   input_spatial,
-                   filter_spatial,
-                   strides,
-                   dilations,
-                   left_pads,
-                   right_pads);
-
-    // If we get here, Run() worked!
-    std::cout << "✓ Reference Forward kernel executed!"
-              << std::endl;
-    EXPECT_TRUE(true);
+    EXPECT_NO_THROW(ref_kernel.Run(reinterpret_cast<ck::half_t*>(in_dev.GetDeviceBuffer()),
+                                   reinterpret_cast<ck::half_t*>(wei_dev.GetDeviceBuffer()),
+                                   reinterpret_cast<ck::half_t*>(out_dev.GetDeviceBuffer()),
+                                   G,
+                                   N,
+                                   K,
+                                   C,
+                                   input_spatial,
+                                   filter_spatial,
+                                   strides,
+                                   dilations,
+                                   left_pads,
+                                   right_pads));
 }
 
 TEST(ReferenceExecution, BackwardData_2D_FP16)
@@ -115,22 +111,20 @@
     std::vector<ck::index_t> right_pads{1, 1};
 
     RefKernel ref_kernel;
-    ref_kernel.Run(reinterpret_cast<ck::half_t*>(in_grad_dev.GetDeviceBuffer()),
-                   reinterpret_cast<ck::half_t*>(wei_dev.GetDeviceBuffer()),
-                   reinterpret_cast<ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
-                   G,
-                   N,
-                   K,
-                   C,
-                   input_spatial,
-                   filter_spatial,
-                   strides,
-                   dilations,
-                   left_pads,
-                   right_pads);
-
-    std::cout << "✓ Reference Backward Data kernel executed!" << std::endl;
-    EXPECT_TRUE(true);
+    EXPECT_NO_THROW(
+        ref_kernel.Run(reinterpret_cast<ck::half_t*>(in_grad_dev.GetDeviceBuffer()),
+                       reinterpret_cast<ck::half_t*>(wei_dev.GetDeviceBuffer()),
+                       reinterpret_cast<ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
+                       G,
+                       N,
+                       K,
+                       C,
+                       input_spatial,
+                       filter_spatial,
+                       strides,
+                       dilations,
+                       left_pads,
+                       right_pads));
 }
 
 TEST(ReferenceExecution, BackwardWeight_2D_FP16)
@@ -170,22 +164,20 @@
     std::vector<ck::index_t> right_pads{1, 1};
 
     RefKernel ref_kernel;
-    ref_kernel.Run(reinterpret_cast<ck::half_t*>(in_dev.GetDeviceBuffer()),
-                   reinterpret_cast<ck::half_t*>(wei_grad_dev.GetDeviceBuffer()),
-                   reinterpret_cast<ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
-                   G,
-                   N,
-                   K,
-                   C,
-                   input_spatial,
-                   filter_spatial,
-                   strides,
-                   dilations,
-                   left_pads,
-                   right_pads);
-
-    std::cout << "✓ Reference Backward Weight kernel executed!" << std::endl;
-    EXPECT_TRUE(true);
+    EXPECT_NO_THROW(
+        ref_kernel.Run(reinterpret_cast<ck::half_t*>(in_dev.GetDeviceBuffer()),
+                       reinterpret_cast<ck::half_t*>(wei_grad_dev.GetDeviceBuffer()),
+                       reinterpret_cast<ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
+                       G,
+                       N,
+                       K,
+                       C,
+                       input_spatial,
+                       filter_spatial,
+                       strides,
+                       dilations,
+                       left_pads,
+                       right_pads));
 }
 
 // Test Builder Reference vs Direct GPU Reference with RANDOM INPUT
@@ -241,7 +233,6 @@ TEST(ReferenceExecution, Forward_2D_FP16_Builder_vs_DirectGPUReference_Random)
     std::vector<ck::index_t> input_spatial{H, W};
     std::vector<ck::index_t> filter_spatial{3, 3};
-    std::vector<ck::index_t> output_spatial{H, W};
     std::vector<ck::index_t> strides{1, 1};
     std::vector<ck::index_t> dilations{1, 1};
     std::vector<ck::index_t> left_pads{1, 1};
@@ -296,17 +287,11 @@
     out_naive_dev.FromDevice(out_naive_result.data());
 
     // Compare - should be IDENTICAL (both call same kernel)
-    bool pass = ck::utils::check_err(out_builder_result,
+    EXPECT_TRUE(ck::utils::check_err(out_builder_result,
                                      out_naive_result,
                                      "Error: Builder Reference != Direct GPU Reference",
                                      1e-6,
-                                     1e-6); // Very tight tolerance!
-
-    std::cout << "✓ Builder Reference vs Direct GPU Reference (RANDOM INPUT)!" << std::endl;
-    std::cout << "  Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
-    std::cout << "  This validates Builder Reference Factory is correct!" << std::endl;
-
-    EXPECT_TRUE(pass);
+                                     1e-6)); // Very tight tolerance!
 }
 
 // Test Builder Reference vs Direct GPU Reference with RANDOM INPUT - Backward Data
@@ -359,7 +344,6 @@
     std::vector<ck::index_t> input_spatial{H, W};
     std::vector<ck::index_t> filter_spatial{3, 3};
-    std::vector<ck::index_t> output_spatial{H, W};
     std::vector<ck::index_t> strides{1, 1};
     std::vector<ck::index_t> dilations{1, 1};
     std::vector<ck::index_t> left_pads{1, 1};
@@ -413,16 +397,11 @@
     in_grad_builder_dev.FromDevice(in_grad_builder_result.data());
     in_grad_naive_dev.FromDevice(in_grad_naive_result.data());
 
-    bool pass = ck::utils::check_err(in_grad_builder_result,
+    EXPECT_TRUE(ck::utils::check_err(in_grad_builder_result,
                                      in_grad_naive_result,
                                      "Error: Builder Backward Data != Direct GPU Reference",
                                      1e-6,
-                                     1e-6);
-
-    std::cout << "✓ Builder Reference vs Direct GPU Reference (RANDOM INPUT - Backward Data)!"
-              << std::endl;
-    std::cout << "  Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
-    EXPECT_TRUE(pass);
+                                     1e-6));
 }
 
 // Test Builder Reference vs Direct GPU Reference with RANDOM INPUT - Backward Weight
@@ -475,7 +454,6 @@
     std::vector<ck::index_t> input_spatial{H, W};
     std::vector<ck::index_t> filter_spatial{3, 3};
-    std::vector<ck::index_t> output_spatial{H, W};
     std::vector<ck::index_t> strides{1, 1};
     std::vector<ck::index_t> dilations{1, 1};
     std::vector<ck::index_t> left_pads{1, 1};
@@ -529,16 +507,11 @@
     wei_grad_builder_dev.FromDevice(wei_grad_builder_result.data());
     wei_grad_naive_dev.FromDevice(wei_grad_naive_result.data());
 
-    bool pass = ck::utils::check_err(wei_grad_builder_result,
+    EXPECT_TRUE(ck::utils::check_err(wei_grad_builder_result,
                                      wei_grad_naive_result,
                                      "Error: Builder Backward Weight != Direct GPU Reference",
                                      1e-6,
-                                     1e-6);
-
-    std::cout << "✓ Builder Reference vs Direct GPU Reference (RANDOM INPUT - Backward Weight)!"
-              << std::endl;
-    std::cout << "  Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
-    EXPECT_TRUE(pass);
+                                     1e-6));
 }
 
 } // namespace
diff --git a/experimental/builder/test/validation/test_reference_instance_traits.cpp b/experimental/builder/test/validation/test_reference_instance_traits.cpp
index 3e79d51ac71..154a0693e47 100644
--- a/experimental/builder/test/validation/test_reference_instance_traits.cpp
+++ b/experimental/builder/test/validation/test_reference_instance_traits.cpp
@@ -62,8 +62,6 @@ TEST(ReferenceInstanceTraits, Forward_2D_FP16)
     // Verify instance_string() - now includes data type and layouts!
     std::string instance_str = Traits::instance_string();
     EXPECT_EQ(instance_str, "GPU_Reference_Forward_2D_fp16_NHWGC_GKYXC_NHWGK");
-
-    std::cout << "✓ Forward InstanceTraits validated: " << instance_str << std::endl;
 }
 
 TEST(ReferenceInstanceTraits, BackwardData_2D_FP16)
@@ -86,8 +84,6 @@
 
     std::string instance_str = Traits::instance_string();
     EXPECT_EQ(instance_str, "GPU_Reference_BackwardData_2D_fp16_NHWGC_GKYXC_NHWGK");
-
-    std::cout << "✓ Backward Data InstanceTraits validated: " << instance_str << std::endl;
 }
 
 TEST(ReferenceInstanceTraits, BackwardWeight_2D_FP16)
@@ -110,8 +106,6 @@
 
     std::string instance_str = Traits::instance_string();
     EXPECT_EQ(instance_str, "GPU_Reference_BackwardWeight_2D_fp16_NHWGC_GKYXC_NHWGK");
-
-    std::cout << "✓ Backward Weight InstanceTraits validated: " << instance_str << std::endl;
 }
 
 } // namespace
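Why the assertion swap in this last patch matters: EXPECT_TRUE(true) can never fail on its own, so the old smoke tests only failed if Run() happened to throw or crash before reaching it; EXPECT_NO_THROW(statement) states that success criterion explicitly. A self-contained gtest sketch of the two patterns, with run_kernel as a hypothetical stand-in for the kernel launch (not a CK API):

    #include <gtest/gtest.h>
    #include <stdexcept>

    // Hypothetical stand-in for a kernel launch that reports failure by throwing.
    void run_kernel(bool ok)
    {
        if(!ok)
            throw std::runtime_error("launch failed");
    }

    TEST(Example, OldPattern)
    {
        run_kernel(true); // an uncaught throw here would still abort the test...
        EXPECT_TRUE(true); // ...but this assertion itself is vacuous.
    }

    TEST(Example, NewPattern)
    {
        // The success criterion is explicit in the assertion itself.
        EXPECT_NO_THROW(run_kernel(true));
    }

The same reasoning applies to the check_err changes: folding the comparison into EXPECT_TRUE(ck::utils::check_err(...)) removes the manual `bool pass` bookkeeping and the hand-rolled pass/fail printing, while the 1e-6 tolerances remain justified because both runs execute the same reference kernel on identical inputs.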