diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt index 06512de6d66..27bf8d03d8c 100644 --- a/backends/iluvatar_gpu/CMakeLists.txt +++ b/backends/iluvatar_gpu/CMakeLists.txt @@ -814,9 +814,20 @@ list( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/moe_ops_partial_nosoftmaxtopk_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/sparse_attention_kernel.cu) +file( + GLOB + AP_CC_SRCS + ${PADDLE_SOURCE_DIR}/paddle/ap/src/axpr/*.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/fs/*.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/code_module/*.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/code_gen/*.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/kernel_dispatch/*.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/*.cc) + file( GLOB_RECURSE CC_SRCS RELATIVE ${CMAKE_SOURCE_DIR} + ${AP_CC_SRCS} runtime/runtime.cc runtime/iluvatar_context.cc common/*.cc diff --git a/backends/iluvatar_gpu/apy/device/compile_command_util.py b/backends/iluvatar_gpu/apy/device/compile_command_util.py new file mode 100644 index 00000000000..e0037f84ea1 --- /dev/null +++ b/backends/iluvatar_gpu/apy/device/compile_command_util.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ap + + +class CompileCommandGenerator: + def __init__(self): + self.file_ext = "cu" + self.op_type2generate_func = ap.OrderedDict( + [ + ["matmul", self.generate_matmul_compile_command], + ] + ) + + def __call__(self, op_type, tpl_dirname, library_name): + return self.op_type2generate_func[op_type](tpl_dirname, library_name) + + def generate_matmul_compile_command(self, tpl_dirname, library_name): + parent_dir = ap.dirname(ap.dirname(__file__)) + cutlass_dir = f"{parent_dir}/matmul/cutlass" + matmul_source_dir = f"{parent_dir}/matmul" + + compile_cmd = "clang++ -x ivcore -L/usr/local/corex/lib -lcudart --cuda-path=/usr/local/corex -std=c++17 -O3 -fPIC --cuda-gpu-arch=ivcore11 -Xclang=-fcuda-allow-variadic-functions" + compile_cmd = compile_cmd + " -I " + cutlass_dir + "/include" + compile_cmd = compile_cmd + " -I " + cutlass_dir + "/tools/util/include" + compile_cmd = compile_cmd + " -I " + matmul_source_dir + compile_cmd = ( + compile_cmd + + " -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -DCUTLASS_DEBUG_TRACE_LEVEL=0 -DCUTLASS_ILUVATAR" + ) + compile_cmd = compile_cmd + " -DAP_ENABLE_AUTOTUNE=0 -DAP_ENABLE_DEBUG=0" + compile_cmd = ( + compile_cmd + + f" --shared {library_name}.{self.file_ext} -o lib{library_name}.so" + ) + return compile_cmd diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_matmul.cuh b/backends/iluvatar_gpu/apy/matmul/cutlass_matmul.cuh new file mode 100644 index 00000000000..f48822fce28 --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_matmul.cuh @@ -0,0 +1,290 @@ +// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#include <cstdlib>
+#include <iostream>
+
+#include "cute/atom/mma_traits.hpp"
+#include "cute/numeric/integral_constant.hpp"
+#include "cute/tensor.hpp"
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/device_memory.h"
+
+#include "cutlass/detail/layout.hpp"
+
+#include "cute/atom/copy_traits_ix11_sme.hpp"
+#include "cute/atom/copy_traits_ix11.hpp"
+#include "cute/atom/copy_atom.hpp"
+
+#include "cutlass/arch/config.h"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+
+#include "cutlass_patch/batched_matrix_coord.h"
+#include "cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp"
+#include "cutlass_patch/epilogue/thread/linear_combination_variadic.h"
+
+#include "default_config_id.h"
+#include "params.h"
+
+#define CHECK_CUTLASS(status)                                             \
+  {                                                                       \
+    cutlass::Status error = status;                                       \
+    if (error != cutlass::Status::kSuccess) {                             \
+      std::cerr << "Got cutlass error: " << cutlassGetStatusString(error) \
+                << " at: " << __LINE__ << std::endl;                      \
+      exit(EXIT_FAILURE);                                                 \
+    }                                                                     \
+  }
+
+namespace ap {
+using bfloat16 = nv_bfloat16;
+
+template <typename T, int N>
+using Array = cutlass::Array<T, N>;
+
+using MatrixCoord = cutlass::BatchedMatrixCoord;
+
+// Convert a CUDA data type to the corresponding cutlass data type
+template <typename T>
+struct CutlassDataType {
+  using Type = T;
+};
+
+template <>
+struct CutlassDataType<half> {
+  using Type = cutlass::half_t;
+};
+
+template <>
+struct CutlassDataType<__nv_bfloat16> {
+  using Type = cutlass::bfloat16_t;
+};
+
+// Convert a transpose flag to the corresponding cutlass layout
+template <bool Transposed>
+struct MatrixLayout {
+  using Type = cutlass::layout::RowMajor;
+};
+
+template <>
+struct MatrixLayout<true> {
+  using Type = cutlass::layout::ColumnMajor;
+};
+
+static cutlass::gemm::GemmUniversalMode GetGemmMode(int batch_count) {
+  return batch_count > 1 ?
cutlass::gemm::GemmUniversalMode::kBatched + : cutlass::gemm::GemmUniversalMode::kGemm; +} + +template + class VariadicFunctor, + int AlignA = 64 / sizeof(ElementT), + int AlignB = 64 / sizeof(ElementT), + int ConfigId = 0> +void MatmulAddVariadic( + const GemmEpilogueParams ¶ms, + const typename VariadicFunctor::Arguments &variadic_args) { + + using namespace cute; + + using ElementAccumulator = + typename CutlassDataType::Type; // <- data type of + // accumulator + using ElementComputeEpilogue = + ElementAccumulator; // <- data type of epilogue operations + using ElementA = + typename CutlassDataType::Type; // <- data type of elements in + // input matrix A + using ElementB = + typename CutlassDataType::Type; // <- data type of elements in + // input matrix B + using ElementC = ElementA; + using ElementD = ElementA; + using ElementOutput = + typename CutlassDataType::Type; // <- data type of elements in + // output matrix D + + constexpr int AlignC = AlignB; + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = cutlass::layout::RowMajor; + + using EpilogueThreadOp = cutlass::epilogue::thread::LinearCombinationVariadic< + VariadicFunctor, ElementD, 1, ElementAccumulator, ElementAccumulator>; + + // Epilogue -------------------------------- + // Params + // Tile of C/D + using EpilogueTile = Tile< + Layout,Stride<_1,_64>>, + Layout,Stride<_1,_64>> + >; + + // Layout of smem + using SmemLayout = + ComposedLayout< + Swizzle<1,1,6>, + _0, + Layout< + Shape ,Shape <_16, _2, _4>>, + Stride,Stride< _2,_128,_1024>> + > + >; + + // G2R + using EpiG2RAtom = Copy_Atom, ElementC>; + + // R2S + using EpiR2SAtom = Copy_Atom, ElementD>; + + // S2R + using EpiS2RAtom = EpiR2SAtom; + using TiledCopyS2R = + TiledCopy< + EpiS2RAtom, + Layout< + Shape ,Shape <_2, _2,_2>>, + Stride,Stride<_4,_64,_8>>>, + decltype(product_each(shape(EpilogueTile{}))) + >; + + // R2R + using CopyAtomR2R = Copy_Atom; + + // R2G + using EpiR2GAtom = Copy_Atom, ElementD>; + + // Epilogue Collective + using CollectiveEpilogue = cutlass::epilogue::collective::EpilogueVariadic< + // cutlass::detail::TagToStrideC_t, + cutlass::detail::TagToStrideC_t, + EpilogueThreadOp, + EpilogueTile, + SmemLayout, + EpiG2RAtom, + EpiR2SAtom, + TiledCopyS2R, + CopyAtomR2R, + EpiR2GAtom, + cutlass::epilogue::EpilogueSimtVectorized>; + + using CtaTiler = Shape<_256,_256,_32>; + using TiledMma = + TiledMMA< + MMA_Atom, + Layout>, + Tile< + Layout, Stride<_1,_64,_16>>, + Layout, Stride<_1,_64,_16>>, + _16 + > + >; + + using CopyA_Op = IX11_SME_I_16x512b; + using CopyA_Atom = Copy_Atom, ElementA>; + using CopyA = decltype(make_tiled_copy(CopyA_Atom{0}, Layout>{}, Layout,Stride<_1,_16>>{})); + using SmemLayoutAtomA = IX11::Layout_SME_I_16x512b_Atom; + using SmemCopyA = Copy_Atom, ElementA>; + + using CopyB_Op = IX11_SME_I_16x512b; + using CopyB_Atom = Copy_Atom, ElementB>; + using CopyB = decltype(make_tiled_copy(CopyB_Atom{0}, Layout>{}, Layout,Stride<_1,_16>>{})); + using SmemLayoutAtomB = IX11::Layout_SME_I_16x512b_Atom; + using SmemCopyB = Copy_Atom, ElementB>; + + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + cutlass::gemm::MainloopIx11SmeUnpredicated<2>, + CtaTiler, + ElementA, cutlass::detail::TagToStrideA_t, + ElementB, cutlass::detail::TagToStrideB_t, + TiledMma, + CopyA, SmemLayoutAtomA, SmemCopyA, cute::identity, + CopyB, SmemLayoutAtomB, SmemCopyB, cute::identity + >; + + using GemmKernel = 
cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>,
+      CollectiveMainloop,
+      CollectiveEpilogue>;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+
+  using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
+
+  ProblemShapeType problem_shape{
+      params.m, params.n, params.k, params.batch_count};
+
+  const ElementA *input = reinterpret_cast<const ElementA *>(params.input);
+  const ElementB *weight = reinterpret_cast<const ElementB *>(params.weight);
+  ElementOutput *output = reinterpret_cast<ElementOutput *>(params.output);
+
+  cutlass::KernelHardwareInfo hw_info;
+  // TODO: Change device_id to another value if you are running on a machine
+  // with multiple GPUs and wish to use a GPU other than that with device ID 0.
+  hw_info.device_id = 0;
+  hw_info.sm_count =
+      cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+          hw_info.device_id);
+
+  typename EpilogueThreadOp::Params epilogue_op_args;
+  epilogue_op_args.variadic_args = variadic_args;
+
+  int m = params.m, n = params.n, k = params.k, l = params.batch_count;
+
+  auto stride_A =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, l));
+  auto stride_B =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, l));
+  auto stride_D =
+      cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(m, n, l));
+
+  typename Gemm::Arguments arguments{
+      cutlass::gemm::GemmUniversalMode::kGemm,
+      problem_shape,
+      {input, stride_A, weight, stride_B},
+      {epilogue_op_args, output, stride_D},
+      hw_info};
+
+  Gemm device_gemm;
+
+  size_t workspace_size = Gemm::get_workspace_size(arguments);
+  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+  cudaStream_t *stream_ptr =
+      reinterpret_cast<cudaStream_t *>(params.stream_ptr);
+
+  CHECK_CUTLASS(device_gemm.can_implement(arguments));
+  CHECK_CUTLASS(
+      device_gemm.initialize(arguments, workspace.get(), *stream_ptr));
+  CHECK_CUTLASS(device_gemm(*stream_ptr));
+}
+
+}  // namespace ap
diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h
new file mode 100644
index 00000000000..11c08461eb0
--- /dev/null
+++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
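+//
+// BatchedMatrixCoord extends the usual (row, column) epilogue coordinate with
+// a batch index and a validity flag, so thread-level variadic functors can
+// address batched global tensors and skip out-of-bounds elements.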
+ +#pragma once + +#include "cutlass/cutlass.h" + +namespace cutlass { + +struct BatchedMatrixCoord { + int batch; + int row; + int column; + bool is_valid; + + CUTLASS_HOST_DEVICE + BatchedMatrixCoord() : batch(0), row(0), column(0), is_valid(false) {} + + CUTLASS_HOST_DEVICE + BatchedMatrixCoord(int b, int r, int c) + : batch(b), row(r), column(c), is_valid(true) {} + + CUTLASS_HOST_DEVICE + BatchedMatrixCoord(int b, int r, int c, bool valid) + : batch(b), row(r), column(c), is_valid(valid) {} +}; + +}; // namespace cutlass diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp new file mode 100644 index 00000000000..269399222ea --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp @@ -0,0 +1,445 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. 
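+
+  This per-warp vectorized collective additionally forwards
+  (batch, row, column, valid) coordinates to the thread-level epilogue so
+  that variadic functors can address extra global-memory operands.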
+*/ + +#pragma once + +#include "cute/tensor.hpp" +#include "cutlass/cutlass.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + // class StrideC, + class StrideD, + class ThreadEpilogueOp, + class EpilogueTile, + class SmemLayout, + class CopyAtomG2R_, + class CopyAtomR2S, + class TiledCopyS2R, + class CopyAtomR2R, + class CopyAtomR2G, + class EpilogueScheduleType = EpilogueSimtVectorized, + class Enable = void> +class EpilogueVariadic { + static_assert( + cute::is_same_v || + cute::is_same_v, + "Could not find an epilogue specialization."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Iluvatar ix11/ix30 Epilogue Vectorized +/// 1. Load C(optional), compute D = alpha * Acc + beta * C, +/// then transform to ElementD(8/16-bit typically) +/// 2. Copy D from Register to Smem +/// 3. Load D from Smem to Register +/// 4. Register-to-Register transform +/// 5. Store D from Register to Gmem +/// +/// Support alpha and beta params, bias not supported yet. +template < + // class StrideC_, + class StrideD_, + class ThreadEpilogueOp_, + class EpilogueTile_, + class SmemLayout_, + class CopyAtomG2R_, + class CopyAtomR2S_, + class TiledCopyS2R_, + class CopyAtomR2R_, + class CopyAtomR2G_, + class EpilogueScheduleType_> +class EpilogueVariadic< + // StrideC_, + StrideD_, + ThreadEpilogueOp_, + EpilogueTile_, + SmemLayout_, + CopyAtomG2R_, + CopyAtomR2S_, + TiledCopyS2R_, + CopyAtomR2R_, + CopyAtomR2G_, + EpilogueScheduleType_, + cute::enable_if_t< + cute::is_same_v>> { + public: + // + // Type Aliases + // + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementD = typename ThreadEpilogueOp::ElementOutput; + using StrideD = StrideD_; + using ElementC = ElementD; // for GemmUniversal + using StrideC = StrideD; + using ElementBias = + typename detail::IsThreadEpilogueOpWithBias::type; + using EpilogueTile = EpilogueTile_; + using SmemLayout = SmemLayout_; + using CopyAtomG2R = CopyAtomG2R_; + using CopyAtomR2S = CopyAtomR2S_; + using TiledCopyS2R = TiledCopyS2R_; + using CopyAtomR2R = CopyAtomR2R_; + using CopyAtomR2G = CopyAtomR2G_; + + // using GmemTiledCopyC = void; + using GmemTiledCopyD = CopyAtomR2G; + + static constexpr bool IsEpilogueBiasSupported = + detail::IsThreadEpilogueOpWithBias::value; + using StrideBias = cute::conditional_t(), + Stride<_1, _0, int64_t>, + Stride<_0, _1, int64_t>>; + + // static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, + // L]"); + static_assert(cute::rank(StrideD{}) == 3, + "StrideCD must be rank-3: [M, N, L]"); + + struct SharedStorage { + cute::array_aligned> smem_epilogue; + }; + + struct Arguments { + typename ThreadEpilogueOp::Params epilogue_op{}; + // using StrideBias = decltype(thread.dBias); + // ElementC const* ptr_C = nullptr; + // StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + }; + + // Device side epilogue params + template + struct ParamsType { + typename ThreadEpiOp::Params epilogue_op{}; + ElementD* ptr_D = nullptr; + 
+    StrideD dD{};
+  };
+
+  using Params = ParamsType<ThreadEpilogueOp>;
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>
+  static constexpr Params to_underlying_arguments(
+      [[maybe_unused]] ProblemShape const& _,
+      Arguments const& args,
+      [[maybe_unused]] void* workspace) {
+    return {
+        args.epilogue_op,
+        args.ptr_D,
+        args.dD,
+    };
+  }
+
+  template <class ProblemShape>
+  static size_t get_workspace_size(ProblemShape const& problem_shape,
+                                   Arguments const& args) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status initialize_workspace(
+      ProblemShape const& problem_shape,
+      Arguments const& args,
+      void* workspace,
+      cudaStream_t stream,
+      CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  template <class ProblemShape>
+  static bool can_implement([[maybe_unused]] ProblemShape const& problem_shape,
+                            [[maybe_unused]] Arguments const& args) {
+    return true;
+  }
+
+  CUTLASS_HOST_DEVICE
+  explicit EpilogueVariadic(Params const& params_)
+      : params(params_), epilogue_op(params_.epilogue_op) {}
+
+  CUTLASS_DEVICE
+  bool is_source_needed() { return epilogue_op.is_source_needed(); }
+
+  template <class ProblemShapeMNKL,
+            class BlockShapeMNK,
+            class BlockCoordMNKL,
+            class FrgEngine,
+            class FrgLayout,
+            class TiledMma,
+            class ResidueMNK>
+  CUTLASS_DEVICE void operator()(ProblemShapeMNKL problem_shape_mnkl,
+                                 BlockShapeMNK blk_shape_MNK,
+                                 BlockCoordMNKL blk_coord_mnkl,
+                                 cute::Tensor<FrgEngine, FrgLayout> const&
+                                     accumulators,  // (MMA,MMA_M,MMA_N)
+                                 TiledMma tiled_mma,
+                                 ResidueMNK residue_mnk,
+                                 int thread_idx,
+                                 char* smem_buf) {
+    using namespace cute;  // NOLINT
+    using X = Underscore;
+
+    static_assert(cute::rank(ProblemShapeMNKL{}) == 4,
+                  "ProblemShapeMNKL must be rank 4");
+    static_assert(is_static<BlockShapeMNK>::value,
+                  "ThreadBlock tile shape must be static");
+    static_assert(cute::rank(BlockShapeMNK{}) == 3,
+                  "BlockShapeMNK must be rank 3");
+    static_assert(cute::rank(BlockCoordMNKL{}) == 4,
+                  "BlockCoordMNKL must be rank 4");
+
+    static_assert(rank(EpilogueTile{}) == 2,
+                  "Rank of EpilogueTile should be 2");
+    static_assert(rank(SmemLayout{}) == 2, "Rank of SmemLayout should be 2");
+    CUTE_STATIC_ASSERT(size(SmemLayout{}) ==
+                       size<0>(EpilogueTile{}) * size<1>(EpilogueTile{}));
+
+    // Separate out problem shape for convenience
+    auto M = get<0>(problem_shape_mnkl);
+    auto N = get<1>(problem_shape_mnkl);
+    auto L = get<3>(problem_shape_mnkl);
+
+    // Represent the full output tensor
+    // Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C),
+    // make_shape(M,N,L), params.dC); // (m,n,l)
+    Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D),
+                                make_shape(M, N, L),
+                                params.dD);  // (m,n,l)
+    // Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_Bias),
+    // make_shape(M,N,L), params.dBias); // (m,n,l)
+
+    // Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_),
+    // Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l)
+    Tensor gD_mnl = local_tile(mD_mnl,
+                               blk_shape_MNK,
+                               make_coord(_, _, _),
+                               Step<_1, _1, X>{});  // (BLK_M,BLK_N,m,n,l)
+    // Tensor gBias_mnl = local_tile(mBias_mnl, blk_shape_MNK,
+    // make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l)
+
+    // Slice to get the tile this CTA is responsible for
+    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl;
+    // Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N)
+    Tensor gD = gD_mnl(_, _, m_coord, n_coord, l_coord);  // (BLK_M,BLK_N)
+    // Tensor gBias = gBias_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N)
+
+    // Construct a tensor in SMEM that we can partition for rearranging data
+    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    // Common part (with sm70 epilogue) end--------------------------------
+    // init register transform
+    // TV -> MN(logical)
+    auto layout_gd_tv =
tiled_mma.thrfrg_C(make_layout(gD.shape())); + + // G2R / R2S ------------------------------------- + // In G2R / R2S tiling, we assemble the final V->MN layout out from given + // epilogue tile and mma layout + + // epilogue tile of logical C + auto tE_D = + flat_divide(make_layout(product_each(gD.shape())), EpilogueTile{}); + + using EpilogueTileSize = decltype(product_each(shape(EpilogueTile{}))); + + Tensor sAcc = + make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{}); + + auto r2s_mn_tiler_tv = take<0, 2>( + right_inverse(layout_gd_tv).compose(tE_D)); // epilogue logical mn->tv + // get tiler of TV from tiler of MN + auto r2s_tv_tiler_shape = product_each(layout_gd_tv.shape()); + auto r2s_tv_zeros = repeat_like(r2s_tv_tiler_shape, Int<0>{}); // T and V + auto r2s_tv_tiler = + cute::transform(make_seq{}, [&](auto i) { + auto tiler_origin = filter( + composition(make_layout(r2s_tv_tiler_shape, + replace(r2s_tv_zeros, Int<1>{})), + r2s_mn_tiler_tv)); + auto complemented = + complement(tiler_origin, get(r2s_tv_tiler_shape)); + return cute::layout<1>( + // divide complement to make it monotonic + zipped_divide(make_layout(shape(r2s_tv_tiler_shape)), + complemented)); + }); + + auto r2s_TV_D_tiler = + left_inverse(take<0, 2>(tE_D)) + .compose( // tv -> mn in epilogue tiler + layout_gd_tv.compose( // tv -> mn in C + make_layout(r2s_tv_tiler_shape) + .compose(r2s_tv_tiler))); // tv_tiler -> tv coord + + // G2R ---------------------------------------- + using TiledCopyG2R = + TiledCopy; + auto thread_g2r = TiledCopyG2R::get_slice(thread_idx); + // auto tEgD = flat_divide(gC, EpilogueTile{}); + // auto tGR_gC = thread_g2r.partition_S(tEgD); + auto cC = make_identity_tensor( + make_shape(size<0>(gD), size<1>(gD))); // same shape as gC/gD + auto cCt = flat_divide(cC, EpilogueTile{}); + auto tRS_cC = thread_g2r.partition_S(cCt); + + // R2S ---------------------------------------- + using TiledCopyR2S = + TiledCopy; + auto tiled_r2s = TiledCopyR2S{}; + + auto tile_RS_R = cute::layout<1>( // just V + make_layout(make_shape(typename TiledCopyR2S::TiledNumThr{}, + typename TiledCopyR2S::TiledNumVal{}), + make_stride(_0{}, _1{})) + .compose(group<1, 3>( // V_copy -> V_fragment + right_inverse(layout_gd_tv) // TV_copy -> TV_fragment + .compose(tiled_r2s.tidfrg_S(tE_D)))))( + _, repeat>(_)); // TV_copy -> MN + + auto thread_r2s = tiled_r2s.get_slice(thread_idx); + auto tErAcc = make_tensor(accumulators.data(), tile_RS_R); + auto tRS_sAcc = thread_r2s.partition_D(sAcc); + + // S2R ---------------------------------------- + auto tiled_s2r = TiledCopyS2R{}; + auto thread_s2r = tiled_s2r.get_slice(thread_idx); + auto tSR_sAcc = thread_s2r.partition_S(sAcc); + auto tSR_rAcc = make_tensor(make_layout(tSR_sAcc.shape())); + + // R2R -------------------------------------------- + auto tiled_r2r = + make_tiled_copy(CopyAtomR2R{}, + typename CopyAtomR2R::ThrID{}, + make_layout(typename TiledCopyR2S::TiledNumVal{})); + auto thread_r2r = tiled_r2r.get_slice(thread_idx); + auto tRR_rAcc = make_tensor(tSR_rAcc.shape()); + auto tRR_rSrc = thread_r2r.retile_S(tSR_rAcc); + auto tRR_rDst = thread_r2r.retile_D(tRR_rAcc); + + // R2G ------------------------------------------- + auto r2r_vlayout = + filter(cute::layout<1>(tiled_r2r.get_layoutS_TV()) + .compose(cute::layout<1>(tiled_r2r.get_layoutD_TV()))); + auto r2g_tv_layout = + (typename TiledCopyS2R::TiledLayout_TV{}).compose(_, r2r_vlayout); + auto tiled_r2g = + TiledCopy{}; + auto thread_r2g = tiled_r2g.get_slice(thread_idx); + + auto tRG_rAcc = 
thread_r2g.retile_S(tRR_rDst); + auto tEgD = flat_divide(gD, EpilogueTile{}); + auto tRG_gD = thread_r2g.partition_D(tEgD); + + // Repeat the D-partitioning for coordinates and predication + Tensor cD = make_identity_tensor(make_shape( + size<0>(gD), size<1>(gD))); // (BLK_M,BLK_N) -> (blk_m,blk_n) + Tensor cDt = + flat_divide(cD, EpilogueTile{}); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + Tensor tRG_cD = thread_r2g.partition_D(cDt); + + CUTLASS_PRAGMA_UNROLL + for (int epi_tile_m = 0; epi_tile_m < size<2>(tEgD).value; ++epi_tile_m) { + CUTLASS_PRAGMA_UNROLL + for (int epi_tile_n = 0; epi_tile_n < size<3>(tEgD).value; ++epi_tile_n) { + Tensor tRS_rAccmn = + tErAcc(_, _, _, epi_tile_m, epi_tile_n); // ((2, (2, 2)), 1, 1) + Tensor cC_mn = tRS_cC(_, _, _, epi_tile_m, epi_tile_n); + Tensor tRS_rD = make_tensor_like(tRS_rAccmn); + + int m_cta_coord_base = m_coord * size<0>(gD); + int n_cta_coord_base = n_coord * size<1>(gD); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tRS_rAccmn); ++i) { + auto cta_coord = cC_mn(i); + tRS_rD(i) = + epilogue_op(tRS_rAccmn(i), + l_coord, + m_cta_coord_base + get<0>(cta_coord), + n_cta_coord_base + get<1>(cta_coord), + elem_less(cta_coord, take<0, 2>(residue_mnk))); + } + + copy(CopyAtomR2S{}, tRS_rD, tRS_sAcc); + copy(TiledCopyS2R{}, tSR_sAcc, tSR_rAcc); + copy(CopyAtomR2R{}, tRR_rSrc, tRR_rDst); + + Tensor tRG_gDmn = tRG_gD(_, _, _, epi_tile_m, epi_tile_n); + Tensor tRG_cDmn = tRG_cD(_, _, _, epi_tile_m, epi_tile_n); + CUTLASS_PRAGMA_UNROLL + for (int atom_i = 0; atom_i < size<0, 1>(tRR_rAcc); ++atom_i) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tRR_rAcc); ++m) { + CUTLASS_PRAGMA_UNROLL + + for (int n = 0; n < size<2>(tRR_rAcc); ++n) { + if (elem_less(tRG_cDmn(make_coord(0, atom_i), m, n), + take<0, 2>(residue_mnk))) { + copy(CopyAtomR2G{}, + tRG_rAcc(make_coord(_, atom_i), m, n), + tRG_gDmn(make_coord(_, atom_i), m, n)); + } + } + } + } + } + } + } + + private: + Params params; + ThreadEpilogueOp epilogue_op; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h new file mode 100644 index 00000000000..ffda0b076b8 --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h @@ -0,0 +1,409 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/*! \file + \brief Functor performing linear combination operations used by epilogues. 
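+
+  Unlike the stock LinearCombination, the scaled result is post-processed by
+  a user-supplied VariadicOp; when that functor defines an Arguments type it
+  also receives a BatchedMatrixCoord identifying the element being computed.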
+*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/scale_type.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "cutlass_patch/batched_matrix_coord.h" +#include "cutlass_patch/trace_device.h" + +namespace cutlass { +namespace epilogue { +namespace thread { + +template +struct GenericVariadicTraits { + static constexpr bool IsArgumentsNeeded = false; + struct Arguments {}; +}; + +template +struct GenericVariadicTraits { + static constexpr bool IsArgumentsNeeded = true; + using Arguments = typename VariadicOp::Arguments; +}; + +/// Applies a linear combination operator to an array of elements. +/// +/// D = VariadicOp(alpha * accumulator + beta * source) +/// +template < + template + class VariadicOp, + typename ElementOutput_, ///< Data type used to load and store tensors + int ElementsPerAccess, ///< Number of elements computed per operation. + ///< Usually it is 128/sizeof_bits, + ///< but we use 64 or 32 sometimes when there are + ///< not enough data to store + typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type + typename ElementCompute_ = + ElementOutput_, ///< Data type used to compute linear combination + ScaleType::Kind Scale = + ScaleType::Default, ///< Control Alpha and Beta scaling + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest, + bool IsHeavy = false> +class LinearCombinationVariadic { + public: + using ElementOutput = ElementOutput_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + using ElementC = ElementOutput_; + using ElementD = ElementOutput_; + + using VariadicArguments = + typename GenericVariadicTraits>::Arguments; + + static bool const kIsHeavy = IsHeavy; + static int const kElementsPerAccess = ElementsPerAccess; + static int const kCount = ElementsPerAccess; + static const ScaleType::Kind kScale = Scale; + + using FragmentOutput = Array; + using FragmentAccumulator = Array; + using FragmentSource = Array; + using FragmentCompute = Array; + + static FloatRoundStyle const kRound = Round; + + /// Host-constructable parameters structure + struct Params { + ElementCompute alpha; ///< scales accumulators + ElementCompute beta; ///< scales source tensor + ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if + ///< not null, loads it from memory + ElementCompute const *beta_ptr; ///< pointer to source scalar - if not + ///< null, loads it from memory + VariadicArguments variadic_args; + + CUTLASS_HOST_DEVICE + Params() + : alpha(ElementCompute(1)), + beta(ElementCompute(0)), + alpha_ptr(nullptr), + beta_ptr(nullptr) {} + + CUTLASS_HOST_DEVICE + Params(ElementCompute alpha, + ElementCompute beta, + VariadicArguments variadic_args_ = VariadicArguments{}) + : alpha(alpha), + beta(beta), + alpha_ptr(nullptr), + beta_ptr(nullptr), + variadic_args(variadic_args_) {} + }; + + private: + // + // Data members + // + + Params params_; + bool skip_elementwise_; + + public: + /// Constructs the function object, possibly loading from pointers in host + /// memory + CUTLASS_HOST_DEVICE + LinearCombinationVariadic(Params const ¶ms) { + params_ = params; + params_.alpha = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); + params_.beta = (params.beta_ptr ? 
*params.beta_ptr : params.beta); + skip_elementwise_ = false; + } + + /// Returns true if source is needed + CUTLASS_HOST_DEVICE + bool is_source_needed() const { + if (Scale == ScaleType::NoBetaScaling) + return params_.beta != ElementCompute(0); + + if (Scale == ScaleType::OnlyAlphaScaling) return false; + + if (Scale == ScaleType::Nothing) return false; + + return params_.beta != ElementCompute(0); + } + + /// Functionally required for serial reduction in the epilogue + CUTLASS_HOST_DEVICE + void set_k_partition(int k_partition, int k_partition_count) { + if (k_partition) { + params_.beta = ElementCompute(1); + } + + if (k_partition != k_partition_count - 1) { + skip_elementwise_ = true; + } + } + + /// Computes linear scaling with source: D = alpha * accumulator + beta * + /// source + CUTLASS_HOST_DEVICE + FragmentOutput operator()(FragmentAccumulator const &accumulator, + FragmentSource const &source, + int batch, + int row_offset, + int column_offset, + bool valid) const { + CUTLASS_TRACE_DEVICE( + "kElementsPerAccess: %d, row_offset: %d, column_offset: %d", + kElementsPerAccess, + row_offset, + column_offset); + + // Convert source to internal compute numeric type + NumericArrayConverter + source_converter; + NumericArrayConverter + accumulator_converter; + + FragmentCompute converted_source = source_converter(source); + FragmentCompute converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + FragmentCompute intermediate; + + multiplies mul_add_source; + multiply_add mul_add_accumulator; + VariadicOp variadic_op; + + if (Scale == ScaleType::NoBetaScaling) { + intermediate = converted_source; + // D = alpha * Accum + X + intermediate = mul_add_accumulator( + params_.alpha, converted_accumulator, intermediate); + } else if (Scale == ScaleType::Nothing) { + intermediate = converted_accumulator; + } else { + // X = beta * C + uniform + intermediate = mul_add_source(params_.beta, converted_source); + // D = alpha * Accum + X + intermediate = mul_add_accumulator( + params_.alpha, converted_accumulator, intermediate); + } + + if constexpr (GenericVariadicTraits< + VariadicOp>::IsArgumentsNeeded) { + if (!skip_elementwise_) { +#if CUTLASS_EPILOGUE_ENABLE_VECTORIZE + intermediate = variadic_op.Compute( + intermediate, + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset, valid)); +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kElementsPerAccess; ++i) { + intermediate[i] = variadic_op( + intermediate[i], + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset + i, valid)); + } +#endif + } + } else { + if (!skip_elementwise_) { +#if CUTLASS_EPILOGUE_ENABLE_VECTORIZE + intermediate = variadic_op.Compute(intermediate); +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kElementsPerAccess; ++i) { + intermediate[i] = variadic_op(intermediate[i]); + } +#endif + } + } + + // Convert to destination numeric type + NumericArrayConverter + destination_converter; + + return destination_converter(intermediate); + } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()(FragmentAccumulator const &accumulator, + int batch, + int row_offset, + int column_offset, + bool valid) const { + CUTLASS_TRACE_DEVICE( + "kElementsPerAccess: %d, row_offset: %d, column_offset: %d", + kElementsPerAccess, + row_offset, + column_offset); + + // Convert source to internal compute numeric type + NumericArrayConverter + accumulator_converter; + + FragmentCompute 
converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + FragmentCompute intermediate; + + multiplies mul_accumulator; + VariadicOp variadic_op; + + if (Scale == ScaleType::Nothing) { + intermediate = converted_accumulator; + } else { + // D = alpha * Accum + intermediate = mul_accumulator(params_.alpha, converted_accumulator); + } + + if constexpr (GenericVariadicTraits< + VariadicOp>::IsArgumentsNeeded) { + if (!skip_elementwise_) { +#if CUTLASS_EPILOGUE_ENABLE_VECTORIZE + intermediate = variadic_op.Compute( + intermediate, + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset, valid)); +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kElementsPerAccess; ++i) { + intermediate[i] = variadic_op( + intermediate[i], + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset + i, valid)); + } +#endif + } + } else { + if (!skip_elementwise_) { +#if CUTLASS_EPILOGUE_ENABLE_VECTORIZE + intermediate = variadic_op.Compute(intermediate); +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kElementsPerAccess; ++i) { + intermediate[i] = variadic_op(intermediate[i]); + } +#endif + } + } + + // Convert to destination numeric type + NumericArrayConverter + destination_converter; + + return destination_converter(intermediate); + } + + // + // Specializations for scalar (for use with cute::collective::DefaultEpilogue) + // + CUTLASS_HOST_DEVICE + ElementD operator()(ElementAccumulator const accumulator, + ElementC const source, + int batch, + int row_offset, + int column_offset, + bool valid) const { + // Convert everything to Compute type, do compute, and then store to output + // type + NumericConverter + accumulator_converter; + [[maybe_unused]] NumericConverter + source_converter; + NumericConverter destination_converter; + + // Convert to destination numeric type + + ElementCompute converted_accumulator = accumulator_converter(accumulator); + if constexpr (Scale == ScaleType::Nothing) { + return destination_converter(converted_accumulator); + } + + // Perform binary operations + ElementCompute intermediate; + multiplies multiply; + multiply_add madd; + VariadicOp variadic_op; + + if constexpr (Scale == ScaleType::NoBetaScaling) { + intermediate = source_converter(source); + } else { + intermediate = multiply(params_.beta, source); // X = beta * C + uniform + } + + intermediate = madd(params_.alpha, + converted_accumulator, + intermediate); // D = alpha * Accum + X + intermediate = variadic_op( + intermediate, + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset, valid)); + return destination_converter(intermediate); + } + + CUTLASS_HOST_DEVICE + ElementD operator()(ElementAccumulator const accumulator, + int batch, + int row_offset, + int column_offset, + bool valid) const { + // Convert everything to Compute type, do compute, and then store to output + // type + NumericConverter + accumulator_converter; + NumericConverter destination_converter; + ElementCompute converted_accumulator = accumulator_converter(accumulator); + + // Convert to destination numeric type + if constexpr (Scale == ScaleType::Nothing) { + return destination_converter(converted_accumulator); + } + + // Perform binary operations + ElementCompute intermediate; + multiplies multiply; + VariadicOp variadic_op; + + intermediate = multiply(params_.alpha, accumulator); // D = alpha * Accum + intermediate = variadic_op( + intermediate, + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset, valid)); + 
return destination_converter(intermediate); + } +}; + +} // namespace thread +} // namespace epilogue +} // namespace cutlass diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/trace_device.h b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/trace_device.h new file mode 100644 index 00000000000..470e17b1021 --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/trace_device.h @@ -0,0 +1,73 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if CUTLASS_DEBUG_TRACE_LEVEL + +#ifndef CUTLASS_TRACE_DEVICE +#define CUTLASS_TRACE_DEVICE(format, ...) \ + { \ + if (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && \ + threadIdx.x == 0 && threadIdx.y == 0) { \ + printf("[DEVICE][%s:%d, %s]" format "\n", \ + __FILE__, \ + __LINE__, \ + __FUNCTION__, \ + ##__VA_ARGS__); \ + } \ + } +#endif + +#ifndef CUTLASS_TRACE_DEVICE_TID_DETAIL +#define CUTLASS_TRACE_DEVICE_TID_DETAIL(bidz, bidx, tidx, format, ...) \ + { \ + if (blockIdx.x == bidx && blockIdx.y == 0 && blockIdx.z == bidz && \ + threadIdx.x == tidx && threadIdx.y == 0) { \ + printf("[DEVICE][%s:%d, %s][bid={%d,%d,%d}, tid={%d,%d,%d}]" format \ + "\n", \ + __FILE__, \ + __LINE__, \ + __FUNCTION__, \ + blockIdx.x, \ + blockIdx.y, \ + blockIdx.z, \ + threadIdx.x, \ + threadIdx.y, \ + threadIdx.z, \ + ##__VA_ARGS__); \ + } \ + } +#endif + +#ifndef CUTLASS_TRACE_DEVICE_TID +#define CUTLASS_TRACE_DEVICE_TID(format, ...) \ + { \ + CUTLASS_TRACE_DEVICE_TID_DETAIL(0, 0, 0, format, ##__VA_ARGS__) \ + CUTLASS_TRACE_DEVICE_TID_DETAIL(0, 0, 1, format, ##__VA_ARGS__) \ + CUTLASS_TRACE_DEVICE_TID_DETAIL(0, 1, 0, format, ##__VA_ARGS__) \ + } +#endif + +#else + +#ifndef CUTLASS_TRACE_DEVICE +#define CUTLASS_TRACE_DEVICE(format, ...) +#endif + +#ifndef CUTLASS_TRACE_DEVICE_TID +#define CUTLASS_TRACE_DEVICE_TID(format, ...) +#endif + +#endif diff --git a/backends/iluvatar_gpu/apy/matmul/default_config_id.h b/backends/iluvatar_gpu/apy/matmul/default_config_id.h new file mode 100644 index 00000000000..d160fad04e5 --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/default_config_id.h @@ -0,0 +1,29 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+// #include "all_tuning_configs.h"  // NOLINT
+
+namespace ap {
+
+struct DefaultConfig {
+  static constexpr int kConfigId = 0;
+  static constexpr int kSwizzleFactor = 1;
+  static constexpr bool kBatched = false;
+};
+
+}  // namespace ap
diff --git a/backends/iluvatar_gpu/apy/matmul/matmul.h b/backends/iluvatar_gpu/apy/matmul/matmul.h
new file mode 100644
index 00000000000..d2794859f0b
--- /dev/null
+++ b/backends/iluvatar_gpu/apy/matmul/matmul.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <cstdlib>
+#include <iostream>
+
+#define CHECK_CUDA(func)                                                      \
+  {                                                                           \
+    cudaError_t err = func;                                                   \
+    if (err != cudaSuccess) {                                                 \
+      std::cerr << "[" << __FILE__ << ":" << __LINE__ << ", " << __FUNCTION__ \
+                << "] "                                                       \
+                << "CUDA error(" << err << "), " << cudaGetErrorString(err)   \
+                << " when calling " << #func << std::endl;                    \
+      exit(EXIT_FAILURE);                                                     \
+    }                                                                         \
+  }
+
+#include "cutlass_matmul.cuh"  // NOLINT
+// #include "math_function.h"  // NOLINT
+// #include "profile.h"  // NOLINT
diff --git a/backends/iluvatar_gpu/apy/matmul/params.h b/backends/iluvatar_gpu/apy/matmul/params.h
new file mode 100644
index 00000000000..6be8a242b63
--- /dev/null
+++ b/backends/iluvatar_gpu/apy/matmul/params.h
@@ -0,0 +1,178 @@
+// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#define ASSERT_CHECK(__cond)                                      \
+  do {                                                            \
+    const bool __cond_var = (__cond);                             \
+    if (!__cond_var) {                                            \
+      ::std::string __err_msg = ::std::string("`") + #__cond +    \
+                                "` check failed at " + __FILE__ + \
+                                ":" + ::std::to_string(__LINE__); \
+      throw std::runtime_error(__err_msg);                        \
+    }                                                             \
+  } while (0)
+
+namespace ap {
+
+template <typename T, int Dim>
+struct Alignment {
+  static constexpr int kValue =
+      ((Dim % 8) == 0) ? 8
+                       : (((Dim % 4) == 0) ? 4 : (((Dim % 2) == 0) ? 2 : 1));
+};
+
+template <int Dim>
+struct Alignment<float, Dim> {
+  static constexpr int kValue =
+      ((Dim % 4) == 0) ? 4 : (((Dim % 2) == 0) ? 2 : 1);
+};
+
+struct GemmEpilogueParams {
+  int batch_count;
+  int m;
+  int n;
+  int k;
+
+  bool transpose_a;
+  bool transpose_b;
+
+  // Shape-related arguments
+  struct ShapeArguments {
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C;
+    int64_t batch_stride_D;
+    int64_t lda;
+    int64_t ldb;
+    int64_t ldc_bias;
+    int64_t ldd;
+  };
+
+  ShapeArguments shape_args;
+
+  const void *input;
+  const void *weight;
+  const void *bias;
+  void *output;
+
+  cudaStream_t *stream_ptr;
+
+  std::vector<int64_t> input0_shape;
+  std::vector<int64_t> input1_shape;
+  std::vector<void *> epilogue_in_ptrs;
+  std::vector<void *> epilogue_out_ptrs;
+  std::vector<std::vector<int64_t>> epilogue_in_shapes;
+  std::vector<std::vector<int64_t>> epilogue_out_shapes;
+
+  GemmEpilogueParams() {}
+  GemmEpilogueParams(void *stream_ptr,
+                     const void *input,
+                     const void *weight,
+                     const void *bias,
+                     void *output,
+                     const std::vector<int64_t> &input_shape,
+                     const std::vector<int64_t> &weight_shape,
+                     const std::vector<int64_t> &bias_shape,
+                     bool transpose_a = false,
+                     bool transpose_b = false)
+      : transpose_a(transpose_a),
+        transpose_b(transpose_b),
+        input(input),
+        weight(weight),
+        bias(bias),
+        output(output),
+        stream_ptr(reinterpret_cast<cudaStream_t *>(stream_ptr)) {
+    ASSERT_CHECK(input_shape.size() >= 2U);
+    ASSERT_CHECK(weight_shape.size() >= 2U);
+
+    input0_shape = input_shape;
+    input1_shape = weight_shape;
+
+    batch_count = 1;
+    for (size_t i = 0; i < input_shape.size() - 2; ++i) {
+      batch_count *= input_shape[i];
+    }
+
+    if (transpose_a) {
+      m = input_shape[input_shape.size() - 1];
+      k = input_shape[input_shape.size() - 2];
+    } else {
+      m = input_shape[input_shape.size() - 2];
+      k = input_shape[input_shape.size() - 1];
+    }
+    if (transpose_b) {
+      ASSERT_CHECK(weight_shape[weight_shape.size() - 1] == k);
+      n = weight_shape[weight_shape.size() - 2];
+    } else {
+      ASSERT_CHECK(weight_shape[weight_shape.size() - 2] == k);
+      n = weight_shape[weight_shape.size() - 1];
+    }
+
+    if (bias) {
+      ASSERT_CHECK(bias_shape.size() >= 1U);
+      ASSERT_CHECK(bias_shape[bias_shape.size() - 1] == n);
+    }
+
+#if AP_ENABLE_DEBUG
+    std::cout << "-- [GemmEpilogueParams] batch_count: " << batch_count
+              << ", m: " << m << ", n: " << n << ", k: " << k << std::endl;
+    std::cout << "-- [GemmEpilogueParams] input: " << input << std::endl;
+    std::cout << "-- [GemmEpilogueParams] weight: " << weight << std::endl;
+    std::cout << "-- [GemmEpilogueParams] bias: " << bias << std::endl;
+    std::cout << "-- [GemmEpilogueParams] output: " << output << std::endl;
+    std::cout << "-- [GemmEpilogueParams] stream_ptr: " << stream_ptr
+              << std::endl;
+#endif
+
+    shape_args.batch_stride_A = m * k;
+    shape_args.batch_stride_B = (weight_shape.size() == 2) ? 0 : n * k;
+    shape_args.batch_stride_D = m * n;
+
+    shape_args.lda = transpose_a ? m : k;
+    shape_args.ldb = transpose_b ? k : n;
+    shape_args.ldd = n;
+
+    bool is_C_bias = bias_shape.size() == 1UL;
+
+    /// Only available in RRR format
+    shape_args.batch_stride_C = (!bias || is_C_bias) ? 0 : m * n;
+    shape_args.ldc_bias = (!bias || is_C_bias) ? 0 : n;
+  }
+
+  void SetEpilogues(const std::vector<void *> &in_ptrs,
+                    const std::vector<void *> &out_ptrs) {
+    epilogue_in_ptrs = in_ptrs;
+    epilogue_out_ptrs = out_ptrs;
+  }
+
+  void SetEpilogueAndShapes(
+      const std::vector<void *> &in_ptrs,
+      const std::vector<std::vector<int64_t>> &in_shapes,
+      const std::vector<void *> &out_ptrs,
+      const std::vector<std::vector<int64_t>> &out_shapes) {
+    ASSERT_CHECK(in_ptrs.size() == in_shapes.size());
+    epilogue_in_ptrs = in_ptrs;
+    epilogue_in_shapes = in_shapes;
+    ASSERT_CHECK(out_ptrs.size() == out_shapes.size());
+    epilogue_out_ptrs = out_ptrs;
+    epilogue_out_shapes = out_shapes;
+  }
+};
+
+}  // namespace ap
diff --git a/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py b/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py
new file mode 100644
index 00000000000..ccb41f116aa
--- /dev/null
+++ b/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py
@@ -0,0 +1,159 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import numpy as np
+from pathlib import Path
+
+import paddle
+import paddle.incubate.cc as pcc
+import paddle.incubate.cc.typing as pct
+import paddle.profiler as profiler
+
+
+os.environ["AP_WORKSPACE_DIR"] = "/tmp/paddle/ap_workspace"
+
+
+def GetPirProgram(fused_func, tensor_args):
+    dtypes = tuple(tensor.dtype for tensor in tensor_args)
+    func = fused_func.func_overload_ctx.dtypes2func.get(dtypes, None)
+    return str(func.infer_program.forward_program)
+
+
+DT = "float16"
+BS = 4
+MS = 784
+NS = 192
+KS = 768
+
+
+class TestMatmulEpilogue(unittest.TestCase):
+    def setUp(self):
+        dtype = DT
+        x_shape = [BS, MS, KS]
+        self.x = paddle.randn(x_shape, dtype=dtype)
+        self.x.stop_gradient = True
+
+        y_shape = [KS, NS]
+        self.y = paddle.randn(y_shape, dtype=dtype)
+        self.y.stop_gradient = True
+
+        b_shape = [BS, MS, NS]
+        self.b = paddle.randn(b_shape, dtype=dtype)
+        self.b.stop_gradient = True
+
+        bias_shape = [NS]
+        self.bias = paddle.randn(bias_shape, dtype=dtype)
+        self.bias.stop_gradient = True
+
+        residual_shape = [BS, MS, NS]
+        self.residual = paddle.randn(residual_shape, dtype=dtype)
+        self.residual.stop_gradient = True
+
+        mask_shape = [BS, MS, NS]
+        self.mask = paddle.randn(mask_shape, dtype=dtype)
+        self.mask.stop_gradient = True
+
+    def get_matmul_add_act(self):
+        B = pct.DimVar(BS)
+        M = pct.DimVar(MS)
+        K = pct.DimVar(KS)
+        N = pct.DimVar(NS)
+        T = pct.DTypeVar("T", DT)
+
+        def matmul_add_act(
+            x: pct.Tensor([B, M, K], T),
+            y: pct.Tensor([K, N], T),
+            b: pct.Tensor([B, M, N], T),
+        ):
+            out = paddle.matmul(x, y)
+            out = out + b
+            return paddle.nn.functional.relu(out)
+
+        return matmul_add_act
+
+    def get_matmul_add_divide_multiply_add(self):
+        B = pct.DimVar(BS)
+        M = pct.DimVar(MS)
+        K = pct.DimVar(KS)
+        N = pct.DimVar(NS)
+        T = pct.DTypeVar("T", DT)
+
+        def matmul_add_divide_multiply_add(
+            x: pct.Tensor([B, M, K], T),
+            y: pct.Tensor([K, N], T),
+            bias: pct.Tensor([N], T),
+            residual: pct.Tensor([B, M, N], T),
+            mask: pct.Tensor([B, M, N], T),
+        ):
+            out = paddle.matmul(x, y)
+            out = out + bias
+            # out = out / 1.2
+            out = out * mask
+            return residual + out
+
+        return matmul_add_divide_multiply_add
+
+    def check_if_ap_variadic_exists(self, fused_foo, foo_args):
+        generated_pir_program = GetPirProgram(fused_foo, foo_args)
+        assert (
+            "pd_op.ap_variadic" in generated_pir_program
+        ), "AP fusion failed: no pd_op.ap_variadic found in the PIR program."
+
+    def check_by_profiler(self, fused_foo, foo_args):
+        paddle.device.synchronize()
+
+        iters = 10
+        with profiler.Profiler(
+            targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
+            on_trace_ready=profiler.export_chrome_tracing("./profiler_log"),
+            timer_only=False,
+        ) as prof:
+            for _ in range(iters):
+                _ = fused_foo(*foo_args)
+                prof.step()
+        prof.summary(
+            sorted_by=profiler.SortedKeys.GPUTotal,
+            op_detail=True,
+            thread_sep=False,
+            time_unit="us",
+        )
+
+    def test_subgraph(self):
+        foo = self.get_matmul_add_act()
+        foo_args = (self.x, self.y, self.b)
+
+        # foo = self.get_matmul_add_divide_multiply_add()
+        # foo_args = (self.x, self.y, self.bias, self.residual, self.mask)
+
+        iluvatar_gpu_dir = Path(__file__).resolve().parent.parent.parent
+        fused_foo = pcc.compile(
+            foo,
+            ap_path=f"{iluvatar_gpu_dir}/apy/device",
+            backend_device="custom_device",
+        )
+
+        self.check_if_ap_variadic_exists(fused_foo, foo_args)
+        self.check_by_profiler(fused_foo, foo_args)
+
+        ap_outs = fused_foo(*foo_args)
+        dy_outs = foo(*foo_args)
+        for dy_out, ap_out in zip(dy_outs, ap_outs):
+            np.testing.assert_allclose(dy_out, ap_out, rtol=5e-2, atol=1e-1)
+
+
+if __name__ == "__main__":
+    unittest.main()
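+
+
+# A minimal eager-mode reference for the fused pattern exercised above, kept
+# here for readers only; `_reference_matmul_add_act` is an illustrative helper
+# and is not invoked by the test.
+def _reference_matmul_add_act(x, y, b):
+    # [B, M, K] @ [K, N] -> [B, M, N], then an elementwise add and ReLU --
+    # the epilogue that MatmulAddVariadic fuses into one CUTLASS kernel.
+    out = paddle.matmul(x, y)
+    out = out + b
+    return paddle.nn.functional.relu(out)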