diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt index 06512de6d66..27bf8d03d8c 100644 --- a/backends/iluvatar_gpu/CMakeLists.txt +++ b/backends/iluvatar_gpu/CMakeLists.txt @@ -814,9 +814,20 @@ list( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/moe_ops_partial_nosoftmaxtopk_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/sparse_attention_kernel.cu) +file( + GLOB + AP_CC_SRCS + ${PADDLE_SOURCE_DIR}/paddle/ap/src/axpr/*.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/fs/*.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/code_module/*.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/code_gen/*.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/kernel_dispatch/*.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/*.cc) + file( GLOB_RECURSE CC_SRCS RELATIVE ${CMAKE_SOURCE_DIR} + ${AP_CC_SRCS} runtime/runtime.cc runtime/iluvatar_context.cc common/*.cc diff --git a/backends/iluvatar_gpu/apy/device/compile_command_util.py b/backends/iluvatar_gpu/apy/device/compile_command_util.py new file mode 100644 index 00000000000..e0037f84ea1 --- /dev/null +++ b/backends/iluvatar_gpu/apy/device/compile_command_util.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ap + + +class CompileCommandGenerator: + def __init__(self): + self.file_ext = "cu" + self.op_type2generate_func = ap.OrderedDict( + [ + ["matmul", self.generate_matmul_compile_command], + ] + ) + + def __call__(self, op_type, tpl_dirname, library_name): + return self.op_type2generate_func[op_type](tpl_dirname, library_name) + + def generate_matmul_compile_command(self, tpl_dirname, library_name): + parent_dir = ap.dirname(ap.dirname(__file__)) + cutlass_dir = f"{parent_dir}/matmul/cutlass" + matmul_source_dir = f"{parent_dir}/matmul" + + compile_cmd = "clang++ -x ivcore -L/usr/local/corex/lib -lcudart --cuda-path=/usr/local/corex -std=c++17 -O3 -fPIC --cuda-gpu-arch=ivcore11 -Xclang=-fcuda-allow-variadic-functions" + compile_cmd = compile_cmd + " -I " + cutlass_dir + "/include" + compile_cmd = compile_cmd + " -I " + cutlass_dir + "/tools/util/include" + compile_cmd = compile_cmd + " -I " + matmul_source_dir + compile_cmd = ( + compile_cmd + + " -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -DCUTLASS_DEBUG_TRACE_LEVEL=0 -DCUTLASS_ILUVATAR" + ) + compile_cmd = compile_cmd + " -DAP_ENABLE_AUTOTUNE=0 -DAP_ENABLE_DEBUG=0" + compile_cmd = ( + compile_cmd + + f" --shared {library_name}.{self.file_ext} -o lib{library_name}.so" + ) + return compile_cmd diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_matmul.cuh b/backends/iluvatar_gpu/apy/matmul/cutlass_matmul.cuh new file mode 100644 index 00000000000..f48822fce28 --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_matmul.cuh @@ -0,0 +1,290 @@ +// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#include <cstdlib>
+#include <iostream>
+
+#include "cute/atom/mma_traits.hpp"
+#include "cute/numeric/integral_constant.hpp"
+#include "cute/tensor.hpp"
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/device_memory.h"
+
+#include "cutlass/detail/layout.hpp"
+
+#include "cute/atom/copy_traits_ix11_sme.hpp"
+#include "cute/atom/copy_traits_ix11.hpp"
+#include "cute/atom/copy_atom.hpp"
+
+#include "cutlass/arch/config.h"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+
+#include "cutlass_patch/batched_matrix_coord.h"
+#include "cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp"
+#include "cutlass_patch/epilogue/thread/linear_combination_variadic.h"
+
+#include "default_config_id.h"
+#include "params.h"
+
+#define CHECK_CUTLASS(status)                                             \
+  {                                                                       \
+    cutlass::Status error = status;                                       \
+    if (error != cutlass::Status::kSuccess) {                             \
+      std::cerr << "Got cutlass error: " << cutlassGetStatusString(error) \
+                << " at: " << __LINE__ << std::endl;                      \
+      exit(EXIT_FAILURE);                                                 \
+    }                                                                     \
+  }
+
+namespace ap {
+using bfloat16 = nv_bfloat16;
+
+template <typename T, int N>
+using Array = cutlass::Array<T, N>;
+
+using MatrixCoord = cutlass::BatchedMatrixCoord;
+
+// Convert a CUDA data type to the corresponding cutlass data type
+template <typename T>
+struct CutlassDataType {
+  using Type = T;
+};
+
+template <>
+struct CutlassDataType<half> {
+  using Type = cutlass::half_t;
+};
+
+template <>
+struct CutlassDataType<__nv_bfloat16> {
+  using Type = cutlass::bfloat16_t;
+};
+
+// Convert a transpose flag to the corresponding cutlass layout
+template <bool Transposed>
+struct MatrixLayout {
+  using Type = cutlass::layout::RowMajor;
+};
+
+template <>
+struct MatrixLayout<true> {
+  using Type = cutlass::layout::ColumnMajor;
+};
+
+static cutlass::gemm::GemmUniversalMode GetGemmMode(int batch_count) {
+  return batch_count > 1 ?
cutlass::gemm::GemmUniversalMode::kBatched + : cutlass::gemm::GemmUniversalMode::kGemm; +} + +template + class VariadicFunctor, + int AlignA = 64 / sizeof(ElementT), + int AlignB = 64 / sizeof(ElementT), + int ConfigId = 0> +void MatmulAddVariadic( + const GemmEpilogueParams ¶ms, + const typename VariadicFunctor::Arguments &variadic_args) { + + using namespace cute; + + using ElementAccumulator = + typename CutlassDataType::Type; // <- data type of + // accumulator + using ElementComputeEpilogue = + ElementAccumulator; // <- data type of epilogue operations + using ElementA = + typename CutlassDataType::Type; // <- data type of elements in + // input matrix A + using ElementB = + typename CutlassDataType::Type; // <- data type of elements in + // input matrix B + using ElementC = ElementA; + using ElementD = ElementA; + using ElementOutput = + typename CutlassDataType::Type; // <- data type of elements in + // output matrix D + + constexpr int AlignC = AlignB; + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = cutlass::layout::RowMajor; + + using EpilogueThreadOp = cutlass::epilogue::thread::LinearCombinationVariadic< + VariadicFunctor, ElementD, 1, ElementAccumulator, ElementAccumulator>; + + // Epilogue -------------------------------- + // Params + // Tile of C/D + using EpilogueTile = Tile< + Layout,Stride<_1,_64>>, + Layout,Stride<_1,_64>> + >; + + // Layout of smem + using SmemLayout = + ComposedLayout< + Swizzle<1,1,6>, + _0, + Layout< + Shape ,Shape <_16, _2, _4>>, + Stride,Stride< _2,_128,_1024>> + > + >; + + // G2R + using EpiG2RAtom = Copy_Atom, ElementC>; + + // R2S + using EpiR2SAtom = Copy_Atom, ElementD>; + + // S2R + using EpiS2RAtom = EpiR2SAtom; + using TiledCopyS2R = + TiledCopy< + EpiS2RAtom, + Layout< + Shape ,Shape <_2, _2,_2>>, + Stride,Stride<_4,_64,_8>>>, + decltype(product_each(shape(EpilogueTile{}))) + >; + + // R2R + using CopyAtomR2R = Copy_Atom; + + // R2G + using EpiR2GAtom = Copy_Atom, ElementD>; + + // Epilogue Collective + using CollectiveEpilogue = cutlass::epilogue::collective::EpilogueVariadic< + // cutlass::detail::TagToStrideC_t, + cutlass::detail::TagToStrideC_t, + EpilogueThreadOp, + EpilogueTile, + SmemLayout, + EpiG2RAtom, + EpiR2SAtom, + TiledCopyS2R, + CopyAtomR2R, + EpiR2GAtom, + cutlass::epilogue::EpilogueSimtVectorized>; + + using CtaTiler = Shape<_256,_256,_32>; + using TiledMma = + TiledMMA< + MMA_Atom, + Layout>, + Tile< + Layout, Stride<_1,_64,_16>>, + Layout, Stride<_1,_64,_16>>, + _16 + > + >; + + using CopyA_Op = IX11_SME_I_16x512b; + using CopyA_Atom = Copy_Atom, ElementA>; + using CopyA = decltype(make_tiled_copy(CopyA_Atom{0}, Layout>{}, Layout,Stride<_1,_16>>{})); + using SmemLayoutAtomA = IX11::Layout_SME_I_16x512b_Atom; + using SmemCopyA = Copy_Atom, ElementA>; + + using CopyB_Op = IX11_SME_I_16x512b; + using CopyB_Atom = Copy_Atom, ElementB>; + using CopyB = decltype(make_tiled_copy(CopyB_Atom{0}, Layout>{}, Layout,Stride<_1,_16>>{})); + using SmemLayoutAtomB = IX11::Layout_SME_I_16x512b_Atom; + using SmemCopyB = Copy_Atom, ElementB>; + + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + cutlass::gemm::MainloopIx11SmeUnpredicated<2>, + CtaTiler, + ElementA, cutlass::detail::TagToStrideA_t, + ElementB, cutlass::detail::TagToStrideB_t, + TiledMma, + CopyA, SmemLayoutAtomA, SmemCopyA, cute::identity, + CopyB, SmemLayoutAtomB, SmemCopyB, cute::identity + >; + + using GemmKernel = 
cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>,
+      CollectiveMainloop,
+      CollectiveEpilogue>;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+
+  using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
+
+  ProblemShapeType problem_shape{
+      params.m, params.n, params.k, params.batch_count};
+
+  const ElementA *input = reinterpret_cast<const ElementA *>(params.input);
+  const ElementB *weight = reinterpret_cast<const ElementB *>(params.weight);
+  ElementOutput *output = reinterpret_cast<ElementOutput *>(params.output);
+
+  cutlass::KernelHardwareInfo hw_info;
+  // TODO: Change device_id to another value if you are running on a machine
+  // with multiple GPUs and wish to use a GPU other than that with device ID 0.
+  hw_info.device_id = 0;
+  hw_info.sm_count =
+      cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+          hw_info.device_id);
+
+  typename EpilogueThreadOp::Params epilogue_op_args;
+  epilogue_op_args.variadic_args = variadic_args;
+
+  int m = params.m, n = params.n, k = params.k, l = params.batch_count;
+
+  auto stride_A =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, l));
+  auto stride_B =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, l));
+  auto stride_D =
+      cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(m, n, l));
+
+  typename Gemm::Arguments arguments{
+      cutlass::gemm::GemmUniversalMode::kGemm,
+      problem_shape,
+      {input, stride_A, weight, stride_B},
+      {epilogue_op_args, output, stride_D},
+      hw_info};
+
+  Gemm device_gemm;
+
+  size_t workspace_size = Gemm::get_workspace_size(arguments);
+  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+  cudaStream_t *stream_ptr =
+      reinterpret_cast<cudaStream_t *>(params.stream_ptr);
+
+  CHECK_CUTLASS(device_gemm.can_implement(arguments));
+  CHECK_CUTLASS(
+      device_gemm.initialize(arguments, workspace.get(), *stream_ptr));
+  CHECK_CUTLASS(device_gemm(*stream_ptr));
+}
+
+}  // namespace ap
diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h
new file mode 100644
index 00000000000..11c08461eb0
--- /dev/null
+++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
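+//
+// BatchedMatrixCoord extends the usual (row, column) epilogue coordinate with
+// a batch index and a validity flag, so thread-level variadic functors can
+// address batched global tensors and skip out-of-bounds elements.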
+ +#pragma once + +#include "cutlass/cutlass.h" + +namespace cutlass { + +struct BatchedMatrixCoord { + int batch; + int row; + int column; + bool is_valid; + + CUTLASS_HOST_DEVICE + BatchedMatrixCoord() : batch(0), row(0), column(0), is_valid(false) {} + + CUTLASS_HOST_DEVICE + BatchedMatrixCoord(int b, int r, int c) + : batch(b), row(r), column(c), is_valid(true) {} + + CUTLASS_HOST_DEVICE + BatchedMatrixCoord(int b, int r, int c, bool valid) + : batch(b), row(r), column(c), is_valid(valid) {} +}; + +}; // namespace cutlass diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp new file mode 100644 index 00000000000..269399222ea --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp @@ -0,0 +1,445 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. 
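+
+  This per-warp vectorized collective additionally forwards
+  (batch, row, column, valid) coordinates to the thread-level epilogue so
+  that variadic functors can address extra global-memory operands.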
+*/ + +#pragma once + +#include "cute/tensor.hpp" +#include "cutlass/cutlass.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + // class StrideC, + class StrideD, + class ThreadEpilogueOp, + class EpilogueTile, + class SmemLayout, + class CopyAtomG2R_, + class CopyAtomR2S, + class TiledCopyS2R, + class CopyAtomR2R, + class CopyAtomR2G, + class EpilogueScheduleType = EpilogueSimtVectorized, + class Enable = void> +class EpilogueVariadic { + static_assert( + cute::is_same_v || + cute::is_same_v, + "Could not find an epilogue specialization."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Iluvatar ix11/ix30 Epilogue Vectorized +/// 1. Load C(optional), compute D = alpha * Acc + beta * C, +/// then transform to ElementD(8/16-bit typically) +/// 2. Copy D from Register to Smem +/// 3. Load D from Smem to Register +/// 4. Register-to-Register transform +/// 5. Store D from Register to Gmem +/// +/// Support alpha and beta params, bias not supported yet. +template < + // class StrideC_, + class StrideD_, + class ThreadEpilogueOp_, + class EpilogueTile_, + class SmemLayout_, + class CopyAtomG2R_, + class CopyAtomR2S_, + class TiledCopyS2R_, + class CopyAtomR2R_, + class CopyAtomR2G_, + class EpilogueScheduleType_> +class EpilogueVariadic< + // StrideC_, + StrideD_, + ThreadEpilogueOp_, + EpilogueTile_, + SmemLayout_, + CopyAtomG2R_, + CopyAtomR2S_, + TiledCopyS2R_, + CopyAtomR2R_, + CopyAtomR2G_, + EpilogueScheduleType_, + cute::enable_if_t< + cute::is_same_v>> { + public: + // + // Type Aliases + // + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementD = typename ThreadEpilogueOp::ElementOutput; + using StrideD = StrideD_; + using ElementC = ElementD; // for GemmUniversal + using StrideC = StrideD; + using ElementBias = + typename detail::IsThreadEpilogueOpWithBias::type; + using EpilogueTile = EpilogueTile_; + using SmemLayout = SmemLayout_; + using CopyAtomG2R = CopyAtomG2R_; + using CopyAtomR2S = CopyAtomR2S_; + using TiledCopyS2R = TiledCopyS2R_; + using CopyAtomR2R = CopyAtomR2R_; + using CopyAtomR2G = CopyAtomR2G_; + + // using GmemTiledCopyC = void; + using GmemTiledCopyD = CopyAtomR2G; + + static constexpr bool IsEpilogueBiasSupported = + detail::IsThreadEpilogueOpWithBias::value; + using StrideBias = cute::conditional_t(), + Stride<_1, _0, int64_t>, + Stride<_0, _1, int64_t>>; + + // static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, + // L]"); + static_assert(cute::rank(StrideD{}) == 3, + "StrideCD must be rank-3: [M, N, L]"); + + struct SharedStorage { + cute::array_aligned> smem_epilogue; + }; + + struct Arguments { + typename ThreadEpilogueOp::Params epilogue_op{}; + // using StrideBias = decltype(thread.dBias); + // ElementC const* ptr_C = nullptr; + // StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + }; + + // Device side epilogue params + template + struct ParamsType { + typename ThreadEpiOp::Params epilogue_op{}; + ElementD* ptr_D = nullptr; + 
+    StrideD dD{};
+  };
+
+  using Params = ParamsType<ThreadEpilogueOp>;
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>
+  static constexpr Params to_underlying_arguments(
+      [[maybe_unused]] ProblemShape const& _,
+      Arguments const& args,
+      [[maybe_unused]] void* workspace) {
+    return {
+        args.epilogue_op,
+        args.ptr_D,
+        args.dD,
+    };
+  }
+
+  template <class ProblemShape>
+  static size_t get_workspace_size(ProblemShape const& problem_shape,
+                                   Arguments const& args) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status initialize_workspace(
+      ProblemShape const& problem_shape,
+      Arguments const& args,
+      void* workspace,
+      cudaStream_t stream,
+      CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  template <class ProblemShape>
+  static bool can_implement([[maybe_unused]] ProblemShape const& problem_shape,
+                            [[maybe_unused]] Arguments const& args) {
+    return true;
+  }
+
+  CUTLASS_HOST_DEVICE
+  explicit EpilogueVariadic(Params const& params_)
+      : params(params_), epilogue_op(params_.epilogue_op) {}
+
+  CUTLASS_DEVICE
+  bool is_source_needed() { return epilogue_op.is_source_needed(); }
+
+  template <class ProblemShapeMNKL,
+            class BlockShapeMNK,
+            class BlockCoordMNKL,
+            class FrgEngine,
+            class FrgLayout,
+            class TiledMma,
+            class ResidueMNK>
+  CUTLASS_DEVICE void operator()(ProblemShapeMNKL problem_shape_mnkl,
+                                 BlockShapeMNK blk_shape_MNK,
+                                 BlockCoordMNKL blk_coord_mnkl,
+                                 cute::Tensor<FrgEngine, FrgLayout> const&
+                                     accumulators,  // (MMA,MMA_M,MMA_N)
+                                 TiledMma tiled_mma,
+                                 ResidueMNK residue_mnk,
+                                 int thread_idx,
+                                 char* smem_buf) {
+    using namespace cute;  // NOLINT
+    using X = Underscore;
+
+    static_assert(cute::rank(ProblemShapeMNKL{}) == 4,
+                  "ProblemShapeMNKL must be rank 4");
+    static_assert(is_static<BlockShapeMNK>::value,
+                  "ThreadBlock tile shape must be static");
+    static_assert(cute::rank(BlockShapeMNK{}) == 3,
+                  "BlockShapeMNK must be rank 3");
+    static_assert(cute::rank(BlockCoordMNKL{}) == 4,
+                  "BlockCoordMNKL must be rank 4");
+
+    static_assert(rank(EpilogueTile{}) == 2,
+                  "Rank of EpilogueTile should be 2");
+    static_assert(rank(SmemLayout{}) == 2, "Rank of SmemLayout should be 2");
+    CUTE_STATIC_ASSERT(size(SmemLayout{}) ==
+                       size<0>(EpilogueTile{}) * size<1>(EpilogueTile{}));
+
+    // Separate out problem shape for convenience
+    auto M = get<0>(problem_shape_mnkl);
+    auto N = get<1>(problem_shape_mnkl);
+    auto L = get<3>(problem_shape_mnkl);
+
+    // Represent the full output tensor
+    // Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C),
+    // make_shape(M,N,L), params.dC); // (m,n,l)
+    Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D),
+                                make_shape(M, N, L),
+                                params.dD);  // (m,n,l)
+    // Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_Bias),
+    // make_shape(M,N,L), params.dBias); // (m,n,l)
+
+    // Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_),
+    // Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l)
+    Tensor gD_mnl = local_tile(mD_mnl,
+                               blk_shape_MNK,
+                               make_coord(_, _, _),
+                               Step<_1, _1, X>{});  // (BLK_M,BLK_N,m,n,l)
+    // Tensor gBias_mnl = local_tile(mBias_mnl, blk_shape_MNK,
+    // make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l)
+
+    // Slice to get the tile this CTA is responsible for
+    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl;
+    // Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N)
+    Tensor gD = gD_mnl(_, _, m_coord, n_coord, l_coord);  // (BLK_M,BLK_N)
+    // Tensor gBias = gBias_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N)
+
+    // Construct a tensor in SMEM that we can partition for rearranging data
+    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    // Common part (with sm70 epilogue) end--------------------------------
+    // init register transform
+    // TV -> MN(logical)
+    auto layout_gd_tv =
tiled_mma.thrfrg_C(make_layout(gD.shape())); + + // G2R / R2S ------------------------------------- + // In G2R / R2S tiling, we assemble the final V->MN layout out from given + // epilogue tile and mma layout + + // epilogue tile of logical C + auto tE_D = + flat_divide(make_layout(product_each(gD.shape())), EpilogueTile{}); + + using EpilogueTileSize = decltype(product_each(shape(EpilogueTile{}))); + + Tensor sAcc = + make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{}); + + auto r2s_mn_tiler_tv = take<0, 2>( + right_inverse(layout_gd_tv).compose(tE_D)); // epilogue logical mn->tv + // get tiler of TV from tiler of MN + auto r2s_tv_tiler_shape = product_each(layout_gd_tv.shape()); + auto r2s_tv_zeros = repeat_like(r2s_tv_tiler_shape, Int<0>{}); // T and V + auto r2s_tv_tiler = + cute::transform(make_seq{}, [&](auto i) { + auto tiler_origin = filter( + composition(make_layout(r2s_tv_tiler_shape, + replace(r2s_tv_zeros, Int<1>{})), + r2s_mn_tiler_tv)); + auto complemented = + complement(tiler_origin, get(r2s_tv_tiler_shape)); + return cute::layout<1>( + // divide complement to make it monotonic + zipped_divide(make_layout(shape(r2s_tv_tiler_shape)), + complemented)); + }); + + auto r2s_TV_D_tiler = + left_inverse(take<0, 2>(tE_D)) + .compose( // tv -> mn in epilogue tiler + layout_gd_tv.compose( // tv -> mn in C + make_layout(r2s_tv_tiler_shape) + .compose(r2s_tv_tiler))); // tv_tiler -> tv coord + + // G2R ---------------------------------------- + using TiledCopyG2R = + TiledCopy; + auto thread_g2r = TiledCopyG2R::get_slice(thread_idx); + // auto tEgD = flat_divide(gC, EpilogueTile{}); + // auto tGR_gC = thread_g2r.partition_S(tEgD); + auto cC = make_identity_tensor( + make_shape(size<0>(gD), size<1>(gD))); // same shape as gC/gD + auto cCt = flat_divide(cC, EpilogueTile{}); + auto tRS_cC = thread_g2r.partition_S(cCt); + + // R2S ---------------------------------------- + using TiledCopyR2S = + TiledCopy; + auto tiled_r2s = TiledCopyR2S{}; + + auto tile_RS_R = cute::layout<1>( // just V + make_layout(make_shape(typename TiledCopyR2S::TiledNumThr{}, + typename TiledCopyR2S::TiledNumVal{}), + make_stride(_0{}, _1{})) + .compose(group<1, 3>( // V_copy -> V_fragment + right_inverse(layout_gd_tv) // TV_copy -> TV_fragment + .compose(tiled_r2s.tidfrg_S(tE_D)))))( + _, repeat>(_)); // TV_copy -> MN + + auto thread_r2s = tiled_r2s.get_slice(thread_idx); + auto tErAcc = make_tensor(accumulators.data(), tile_RS_R); + auto tRS_sAcc = thread_r2s.partition_D(sAcc); + + // S2R ---------------------------------------- + auto tiled_s2r = TiledCopyS2R{}; + auto thread_s2r = tiled_s2r.get_slice(thread_idx); + auto tSR_sAcc = thread_s2r.partition_S(sAcc); + auto tSR_rAcc = make_tensor(make_layout(tSR_sAcc.shape())); + + // R2R -------------------------------------------- + auto tiled_r2r = + make_tiled_copy(CopyAtomR2R{}, + typename CopyAtomR2R::ThrID{}, + make_layout(typename TiledCopyR2S::TiledNumVal{})); + auto thread_r2r = tiled_r2r.get_slice(thread_idx); + auto tRR_rAcc = make_tensor(tSR_rAcc.shape()); + auto tRR_rSrc = thread_r2r.retile_S(tSR_rAcc); + auto tRR_rDst = thread_r2r.retile_D(tRR_rAcc); + + // R2G ------------------------------------------- + auto r2r_vlayout = + filter(cute::layout<1>(tiled_r2r.get_layoutS_TV()) + .compose(cute::layout<1>(tiled_r2r.get_layoutD_TV()))); + auto r2g_tv_layout = + (typename TiledCopyS2R::TiledLayout_TV{}).compose(_, r2r_vlayout); + auto tiled_r2g = + TiledCopy{}; + auto thread_r2g = tiled_r2g.get_slice(thread_idx); + + auto tRG_rAcc = 
thread_r2g.retile_S(tRR_rDst); + auto tEgD = flat_divide(gD, EpilogueTile{}); + auto tRG_gD = thread_r2g.partition_D(tEgD); + + // Repeat the D-partitioning for coordinates and predication + Tensor cD = make_identity_tensor(make_shape( + size<0>(gD), size<1>(gD))); // (BLK_M,BLK_N) -> (blk_m,blk_n) + Tensor cDt = + flat_divide(cD, EpilogueTile{}); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + Tensor tRG_cD = thread_r2g.partition_D(cDt); + + CUTLASS_PRAGMA_UNROLL + for (int epi_tile_m = 0; epi_tile_m < size<2>(tEgD).value; ++epi_tile_m) { + CUTLASS_PRAGMA_UNROLL + for (int epi_tile_n = 0; epi_tile_n < size<3>(tEgD).value; ++epi_tile_n) { + Tensor tRS_rAccmn = + tErAcc(_, _, _, epi_tile_m, epi_tile_n); // ((2, (2, 2)), 1, 1) + Tensor cC_mn = tRS_cC(_, _, _, epi_tile_m, epi_tile_n); + Tensor tRS_rD = make_tensor_like(tRS_rAccmn); + + int m_cta_coord_base = m_coord * size<0>(gD); + int n_cta_coord_base = n_coord * size<1>(gD); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tRS_rAccmn); ++i) { + auto cta_coord = cC_mn(i); + tRS_rD(i) = + epilogue_op(tRS_rAccmn(i), + l_coord, + m_cta_coord_base + get<0>(cta_coord), + n_cta_coord_base + get<1>(cta_coord), + elem_less(cta_coord, take<0, 2>(residue_mnk))); + } + + copy(CopyAtomR2S{}, tRS_rD, tRS_sAcc); + copy(TiledCopyS2R{}, tSR_sAcc, tSR_rAcc); + copy(CopyAtomR2R{}, tRR_rSrc, tRR_rDst); + + Tensor tRG_gDmn = tRG_gD(_, _, _, epi_tile_m, epi_tile_n); + Tensor tRG_cDmn = tRG_cD(_, _, _, epi_tile_m, epi_tile_n); + CUTLASS_PRAGMA_UNROLL + for (int atom_i = 0; atom_i < size<0, 1>(tRR_rAcc); ++atom_i) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tRR_rAcc); ++m) { + CUTLASS_PRAGMA_UNROLL + + for (int n = 0; n < size<2>(tRR_rAcc); ++n) { + if (elem_less(tRG_cDmn(make_coord(0, atom_i), m, n), + take<0, 2>(residue_mnk))) { + copy(CopyAtomR2G{}, + tRG_rAcc(make_coord(_, atom_i), m, n), + tRG_gDmn(make_coord(_, atom_i), m, n)); + } + } + } + } + } + } + } + + private: + Params params; + ThreadEpilogueOp epilogue_op; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h new file mode 100644 index 00000000000..ffda0b076b8 --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h @@ -0,0 +1,409 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/*! \file + \brief Functor performing linear combination operations used by epilogues. 
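+
+  Unlike the stock LinearCombination, the scaled result is post-processed by
+  a user-supplied VariadicOp; when that functor defines an Arguments type it
+  also receives a BatchedMatrixCoord identifying the element being computed.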
+*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/scale_type.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "cutlass_patch/batched_matrix_coord.h" +#include "cutlass_patch/trace_device.h" + +namespace cutlass { +namespace epilogue { +namespace thread { + +template +struct GenericVariadicTraits { + static constexpr bool IsArgumentsNeeded = false; + struct Arguments {}; +}; + +template +struct GenericVariadicTraits { + static constexpr bool IsArgumentsNeeded = true; + using Arguments = typename VariadicOp::Arguments; +}; + +/// Applies a linear combination operator to an array of elements. +/// +/// D = VariadicOp(alpha * accumulator + beta * source) +/// +template < + template + class VariadicOp, + typename ElementOutput_, ///< Data type used to load and store tensors + int ElementsPerAccess, ///< Number of elements computed per operation. + ///< Usually it is 128/sizeof_bits, + ///< but we use 64 or 32 sometimes when there are + ///< not enough data to store + typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type + typename ElementCompute_ = + ElementOutput_, ///< Data type used to compute linear combination + ScaleType::Kind Scale = + ScaleType::Default, ///< Control Alpha and Beta scaling + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest, + bool IsHeavy = false> +class LinearCombinationVariadic { + public: + using ElementOutput = ElementOutput_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + using ElementC = ElementOutput_; + using ElementD = ElementOutput_; + + using VariadicArguments = + typename GenericVariadicTraits>::Arguments; + + static bool const kIsHeavy = IsHeavy; + static int const kElementsPerAccess = ElementsPerAccess; + static int const kCount = ElementsPerAccess; + static const ScaleType::Kind kScale = Scale; + + using FragmentOutput = Array; + using FragmentAccumulator = Array; + using FragmentSource = Array; + using FragmentCompute = Array; + + static FloatRoundStyle const kRound = Round; + + /// Host-constructable parameters structure + struct Params { + ElementCompute alpha; ///< scales accumulators + ElementCompute beta; ///< scales source tensor + ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if + ///< not null, loads it from memory + ElementCompute const *beta_ptr; ///< pointer to source scalar - if not + ///< null, loads it from memory + VariadicArguments variadic_args; + + CUTLASS_HOST_DEVICE + Params() + : alpha(ElementCompute(1)), + beta(ElementCompute(0)), + alpha_ptr(nullptr), + beta_ptr(nullptr) {} + + CUTLASS_HOST_DEVICE + Params(ElementCompute alpha, + ElementCompute beta, + VariadicArguments variadic_args_ = VariadicArguments{}) + : alpha(alpha), + beta(beta), + alpha_ptr(nullptr), + beta_ptr(nullptr), + variadic_args(variadic_args_) {} + }; + + private: + // + // Data members + // + + Params params_; + bool skip_elementwise_; + + public: + /// Constructs the function object, possibly loading from pointers in host + /// memory + CUTLASS_HOST_DEVICE + LinearCombinationVariadic(Params const ¶ms) { + params_ = params; + params_.alpha = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); + params_.beta = (params.beta_ptr ? 
*params.beta_ptr : params.beta); + skip_elementwise_ = false; + } + + /// Returns true if source is needed + CUTLASS_HOST_DEVICE + bool is_source_needed() const { + if (Scale == ScaleType::NoBetaScaling) + return params_.beta != ElementCompute(0); + + if (Scale == ScaleType::OnlyAlphaScaling) return false; + + if (Scale == ScaleType::Nothing) return false; + + return params_.beta != ElementCompute(0); + } + + /// Functionally required for serial reduction in the epilogue + CUTLASS_HOST_DEVICE + void set_k_partition(int k_partition, int k_partition_count) { + if (k_partition) { + params_.beta = ElementCompute(1); + } + + if (k_partition != k_partition_count - 1) { + skip_elementwise_ = true; + } + } + + /// Computes linear scaling with source: D = alpha * accumulator + beta * + /// source + CUTLASS_HOST_DEVICE + FragmentOutput operator()(FragmentAccumulator const &accumulator, + FragmentSource const &source, + int batch, + int row_offset, + int column_offset, + bool valid) const { + CUTLASS_TRACE_DEVICE( + "kElementsPerAccess: %d, row_offset: %d, column_offset: %d", + kElementsPerAccess, + row_offset, + column_offset); + + // Convert source to internal compute numeric type + NumericArrayConverter + source_converter; + NumericArrayConverter + accumulator_converter; + + FragmentCompute converted_source = source_converter(source); + FragmentCompute converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + FragmentCompute intermediate; + + multiplies mul_add_source; + multiply_add mul_add_accumulator; + VariadicOp variadic_op; + + if (Scale == ScaleType::NoBetaScaling) { + intermediate = converted_source; + // D = alpha * Accum + X + intermediate = mul_add_accumulator( + params_.alpha, converted_accumulator, intermediate); + } else if (Scale == ScaleType::Nothing) { + intermediate = converted_accumulator; + } else { + // X = beta * C + uniform + intermediate = mul_add_source(params_.beta, converted_source); + // D = alpha * Accum + X + intermediate = mul_add_accumulator( + params_.alpha, converted_accumulator, intermediate); + } + + if constexpr (GenericVariadicTraits< + VariadicOp>::IsArgumentsNeeded) { + if (!skip_elementwise_) { +#if CUTLASS_EPILOGUE_ENABLE_VECTORIZE + intermediate = variadic_op.Compute( + intermediate, + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset, valid)); +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kElementsPerAccess; ++i) { + intermediate[i] = variadic_op( + intermediate[i], + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset + i, valid)); + } +#endif + } + } else { + if (!skip_elementwise_) { +#if CUTLASS_EPILOGUE_ENABLE_VECTORIZE + intermediate = variadic_op.Compute(intermediate); +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kElementsPerAccess; ++i) { + intermediate[i] = variadic_op(intermediate[i]); + } +#endif + } + } + + // Convert to destination numeric type + NumericArrayConverter + destination_converter; + + return destination_converter(intermediate); + } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()(FragmentAccumulator const &accumulator, + int batch, + int row_offset, + int column_offset, + bool valid) const { + CUTLASS_TRACE_DEVICE( + "kElementsPerAccess: %d, row_offset: %d, column_offset: %d", + kElementsPerAccess, + row_offset, + column_offset); + + // Convert source to internal compute numeric type + NumericArrayConverter + accumulator_converter; + + FragmentCompute 
converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + FragmentCompute intermediate; + + multiplies mul_accumulator; + VariadicOp variadic_op; + + if (Scale == ScaleType::Nothing) { + intermediate = converted_accumulator; + } else { + // D = alpha * Accum + intermediate = mul_accumulator(params_.alpha, converted_accumulator); + } + + if constexpr (GenericVariadicTraits< + VariadicOp>::IsArgumentsNeeded) { + if (!skip_elementwise_) { +#if CUTLASS_EPILOGUE_ENABLE_VECTORIZE + intermediate = variadic_op.Compute( + intermediate, + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset, valid)); +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kElementsPerAccess; ++i) { + intermediate[i] = variadic_op( + intermediate[i], + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset + i, valid)); + } +#endif + } + } else { + if (!skip_elementwise_) { +#if CUTLASS_EPILOGUE_ENABLE_VECTORIZE + intermediate = variadic_op.Compute(intermediate); +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kElementsPerAccess; ++i) { + intermediate[i] = variadic_op(intermediate[i]); + } +#endif + } + } + + // Convert to destination numeric type + NumericArrayConverter + destination_converter; + + return destination_converter(intermediate); + } + + // + // Specializations for scalar (for use with cute::collective::DefaultEpilogue) + // + CUTLASS_HOST_DEVICE + ElementD operator()(ElementAccumulator const accumulator, + ElementC const source, + int batch, + int row_offset, + int column_offset, + bool valid) const { + // Convert everything to Compute type, do compute, and then store to output + // type + NumericConverter + accumulator_converter; + [[maybe_unused]] NumericConverter + source_converter; + NumericConverter destination_converter; + + // Convert to destination numeric type + + ElementCompute converted_accumulator = accumulator_converter(accumulator); + if constexpr (Scale == ScaleType::Nothing) { + return destination_converter(converted_accumulator); + } + + // Perform binary operations + ElementCompute intermediate; + multiplies multiply; + multiply_add madd; + VariadicOp variadic_op; + + if constexpr (Scale == ScaleType::NoBetaScaling) { + intermediate = source_converter(source); + } else { + intermediate = multiply(params_.beta, source); // X = beta * C + uniform + } + + intermediate = madd(params_.alpha, + converted_accumulator, + intermediate); // D = alpha * Accum + X + intermediate = variadic_op( + intermediate, + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset, valid)); + return destination_converter(intermediate); + } + + CUTLASS_HOST_DEVICE + ElementD operator()(ElementAccumulator const accumulator, + int batch, + int row_offset, + int column_offset, + bool valid) const { + // Convert everything to Compute type, do compute, and then store to output + // type + NumericConverter + accumulator_converter; + NumericConverter destination_converter; + ElementCompute converted_accumulator = accumulator_converter(accumulator); + + // Convert to destination numeric type + if constexpr (Scale == ScaleType::Nothing) { + return destination_converter(converted_accumulator); + } + + // Perform binary operations + ElementCompute intermediate; + multiplies multiply; + VariadicOp variadic_op; + + intermediate = multiply(params_.alpha, accumulator); // D = alpha * Accum + intermediate = variadic_op( + intermediate, + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset, valid)); + 
return destination_converter(intermediate); + } +}; + +} // namespace thread +} // namespace epilogue +} // namespace cutlass diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/trace_device.h b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/trace_device.h new file mode 100644 index 00000000000..470e17b1021 --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/trace_device.h @@ -0,0 +1,73 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if CUTLASS_DEBUG_TRACE_LEVEL + +#ifndef CUTLASS_TRACE_DEVICE +#define CUTLASS_TRACE_DEVICE(format, ...) \ + { \ + if (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && \ + threadIdx.x == 0 && threadIdx.y == 0) { \ + printf("[DEVICE][%s:%d, %s]" format "\n", \ + __FILE__, \ + __LINE__, \ + __FUNCTION__, \ + ##__VA_ARGS__); \ + } \ + } +#endif + +#ifndef CUTLASS_TRACE_DEVICE_TID_DETAIL +#define CUTLASS_TRACE_DEVICE_TID_DETAIL(bidz, bidx, tidx, format, ...) \ + { \ + if (blockIdx.x == bidx && blockIdx.y == 0 && blockIdx.z == bidz && \ + threadIdx.x == tidx && threadIdx.y == 0) { \ + printf("[DEVICE][%s:%d, %s][bid={%d,%d,%d}, tid={%d,%d,%d}]" format \ + "\n", \ + __FILE__, \ + __LINE__, \ + __FUNCTION__, \ + blockIdx.x, \ + blockIdx.y, \ + blockIdx.z, \ + threadIdx.x, \ + threadIdx.y, \ + threadIdx.z, \ + ##__VA_ARGS__); \ + } \ + } +#endif + +#ifndef CUTLASS_TRACE_DEVICE_TID +#define CUTLASS_TRACE_DEVICE_TID(format, ...) \ + { \ + CUTLASS_TRACE_DEVICE_TID_DETAIL(0, 0, 0, format, ##__VA_ARGS__) \ + CUTLASS_TRACE_DEVICE_TID_DETAIL(0, 0, 1, format, ##__VA_ARGS__) \ + CUTLASS_TRACE_DEVICE_TID_DETAIL(0, 1, 0, format, ##__VA_ARGS__) \ + } +#endif + +#else + +#ifndef CUTLASS_TRACE_DEVICE +#define CUTLASS_TRACE_DEVICE(format, ...) +#endif + +#ifndef CUTLASS_TRACE_DEVICE_TID +#define CUTLASS_TRACE_DEVICE_TID(format, ...) +#endif + +#endif diff --git a/backends/iluvatar_gpu/apy/matmul/default_config_id.h b/backends/iluvatar_gpu/apy/matmul/default_config_id.h new file mode 100644 index 00000000000..d160fad04e5 --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/default_config_id.h @@ -0,0 +1,29 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+// #include "all_tuning_configs.h"  // NOLINT
+
+namespace ap {
+
+struct DefaultConfig {
+  static constexpr int kConfigId = 0;
+  static constexpr int kSwizzleFactor = 1;
+  static constexpr bool kBatched = false;
+};
+
+}  // namespace ap
diff --git a/backends/iluvatar_gpu/apy/matmul/matmul.h b/backends/iluvatar_gpu/apy/matmul/matmul.h
new file mode 100644
index 00000000000..d2794859f0b
--- /dev/null
+++ b/backends/iluvatar_gpu/apy/matmul/matmul.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <cstdlib>
+#include <iostream>
+
+#define CHECK_CUDA(func)                                                      \
+  {                                                                           \
+    cudaError_t err = func;                                                   \
+    if (err != cudaSuccess) {                                                 \
+      std::cerr << "[" << __FILE__ << ":" << __LINE__ << ", " << __FUNCTION__ \
+                << "] "                                                       \
+                << "CUDA error(" << err << "), " << cudaGetErrorString(err)   \
+                << " when calling " << #func << std::endl;                    \
+      exit(EXIT_FAILURE);                                                     \
+    }                                                                         \
+  }
+
+#include "cutlass_matmul.cuh"  // NOLINT
+// #include "math_function.h"  // NOLINT
+// #include "profile.h"  // NOLINT
diff --git a/backends/iluvatar_gpu/apy/matmul/params.h b/backends/iluvatar_gpu/apy/matmul/params.h
new file mode 100644
index 00000000000..6be8a242b63
--- /dev/null
+++ b/backends/iluvatar_gpu/apy/matmul/params.h
@@ -0,0 +1,178 @@
+// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#define ASSERT_CHECK(__cond)                                      \
+  do {                                                            \
+    const bool __cond_var = (__cond);                             \
+    if (!__cond_var) {                                            \
+      ::std::string __err_msg = ::std::string("`") + #__cond +    \
+                                "` check failed at " + __FILE__ + \
+                                ":" + ::std::to_string(__LINE__); \
+      throw std::runtime_error(__err_msg);                        \
+    }                                                             \
+  } while (0)
+
+namespace ap {
+
+template <typename T, int Dim>
+struct Alignment {
+  static constexpr int kValue =
+      ((Dim % 8) == 0) ? 8
+                       : (((Dim % 4) == 0) ? 4 : (((Dim % 2) == 0) ? 2 : 1));
+};
+
+template <int Dim>
+struct Alignment<float, Dim> {
+  static constexpr int kValue =
+      ((Dim % 4) == 0) ? 4 : (((Dim % 2) == 0) ? 2 : 1);
+};
+
+struct GemmEpilogueParams {
+  int batch_count;
+  int m;
+  int n;
+  int k;
+
+  bool transpose_a;
+  bool transpose_b;
+
+  // Shape-related arguments
+  struct ShapeArguments {
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C;
+    int64_t batch_stride_D;
+    int64_t lda;
+    int64_t ldb;
+    int64_t ldc_bias;
+    int64_t ldd;
+  };
+
+  ShapeArguments shape_args;
+
+  const void *input;
+  const void *weight;
+  const void *bias;
+  void *output;
+
+  cudaStream_t *stream_ptr;
+
+  std::vector<int64_t> input0_shape;
+  std::vector<int64_t> input1_shape;
+  std::vector<void *> epilogue_in_ptrs;
+  std::vector<void *> epilogue_out_ptrs;
+  std::vector<std::vector<int64_t>> epilogue_in_shapes;
+  std::vector<std::vector<int64_t>> epilogue_out_shapes;
+
+  GemmEpilogueParams() {}
+  GemmEpilogueParams(void *stream_ptr,
+                     const void *input,
+                     const void *weight,
+                     const void *bias,
+                     void *output,
+                     const std::vector<int64_t> &input_shape,
+                     const std::vector<int64_t> &weight_shape,
+                     const std::vector<int64_t> &bias_shape,
+                     bool transpose_a = false,
+                     bool transpose_b = false)
+      : transpose_a(transpose_a),
+        transpose_b(transpose_b),
+        input(input),
+        weight(weight),
+        bias(bias),
+        output(output),
+        stream_ptr(reinterpret_cast<cudaStream_t *>(stream_ptr)) {
+    ASSERT_CHECK(input_shape.size() >= 2U);
+    ASSERT_CHECK(weight_shape.size() >= 2U);
+
+    input0_shape = input_shape;
+    input1_shape = weight_shape;
+
+    batch_count = 1;
+    for (size_t i = 0; i < input_shape.size() - 2; ++i) {
+      batch_count *= input_shape[i];
+    }
+
+    if (transpose_a) {
+      m = input_shape[input_shape.size() - 1];
+      k = input_shape[input_shape.size() - 2];
+    } else {
+      m = input_shape[input_shape.size() - 2];
+      k = input_shape[input_shape.size() - 1];
+    }
+    if (transpose_b) {
+      ASSERT_CHECK(weight_shape[weight_shape.size() - 1] == k);
+      n = weight_shape[weight_shape.size() - 2];
+    } else {
+      ASSERT_CHECK(weight_shape[weight_shape.size() - 2] == k);
+      n = weight_shape[weight_shape.size() - 1];
+    }
+
+    if (bias) {
+      ASSERT_CHECK(bias_shape.size() >= 1U);
+      ASSERT_CHECK(bias_shape[bias_shape.size() - 1] == n);
+    }
+
+#if AP_ENABLE_DEBUG
+    std::cout << "-- [GemmEpilogueParams] batch_count: " << batch_count
+              << ", m: " << m << ", n: " << n << ", k: " << k << std::endl;
+    std::cout << "-- [GemmEpilogueParams] input: " << input << std::endl;
+    std::cout << "-- [GemmEpilogueParams] weight: " << weight << std::endl;
+    std::cout << "-- [GemmEpilogueParams] bias: " << bias << std::endl;
+    std::cout << "-- [GemmEpilogueParams] output: " << output << std::endl;
+    std::cout << "-- [GemmEpilogueParams] stream_ptr: " << stream_ptr
+              << std::endl;
+#endif
+
+    shape_args.batch_stride_A = m * k;
+    shape_args.batch_stride_B = (weight_shape.size() == 2) ? 0 : n * k;
+    shape_args.batch_stride_D = m * n;
+
+    shape_args.lda = transpose_a ? m : k;
+    shape_args.ldb = transpose_b ? k : n;
+    shape_args.ldd = n;
+
+    bool is_C_bias = bias_shape.size() == 1UL;
+
+    /// Only available in RRR format
+    shape_args.batch_stride_C = (!bias || is_C_bias) ? 0 : m * n;
+    shape_args.ldc_bias = (!bias || is_C_bias) ? 0 : n;
+  }
+
+  void SetEpilogues(const std::vector<void *> &in_ptrs,
+                    const std::vector<void *> &out_ptrs) {
+    epilogue_in_ptrs = in_ptrs;
+    epilogue_out_ptrs = out_ptrs;
+  }
+
+  void SetEpilogueAndShapes(
+      const std::vector<void *> &in_ptrs,
+      const std::vector<std::vector<int64_t>> &in_shapes,
+      const std::vector<void *> &out_ptrs,
+      const std::vector<std::vector<int64_t>> &out_shapes) {
+    ASSERT_CHECK(in_ptrs.size() == in_shapes.size());
+    epilogue_in_ptrs = in_ptrs;
+    epilogue_in_shapes = in_shapes;
+    ASSERT_CHECK(out_ptrs.size() == out_shapes.size());
+    epilogue_out_ptrs = out_ptrs;
+    epilogue_out_shapes = out_shapes;
+  }
+};
+
+}  // namespace ap
diff --git a/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py b/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py
new file mode 100644
index 00000000000..ccb41f116aa
--- /dev/null
+++ b/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py
@@ -0,0 +1,159 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+import numpy as np
+from pathlib import Path
+
+import paddle
+import paddle.incubate.cc as pcc
+import paddle.incubate.cc.typing as pct
+import paddle.profiler as profiler
+
+
+os.environ["AP_WORKSPACE_DIR"] = "/tmp/paddle/ap_workspace"
+
+
+def GetPirProgram(fused_func, tensor_args):
+    dtypes = tuple(tensor.dtype for tensor in tensor_args)
+    func = fused_func.func_overload_ctx.dtypes2func.get(dtypes, None)
+    return str(func.infer_program.forward_program)
+
+
+DT = "float16"
+BS = 4
+MS = 784
+NS = 192
+KS = 768
+
+
+class TestMatmulEpilogue(unittest.TestCase):
+    def setUp(self):
+        dtype = DT
+        x_shape = [BS, MS, KS]
+        self.x = paddle.randn(x_shape, dtype=dtype)
+        self.x.stop_gradient = True
+
+        y_shape = [KS, NS]
+        self.y = paddle.randn(y_shape, dtype=dtype)
+        self.y.stop_gradient = True
+
+        b_shape = [BS, MS, NS]
+        self.b = paddle.randn(b_shape, dtype=dtype)
+        self.b.stop_gradient = True
+
+        bias_shape = [NS]
+        self.bias = paddle.randn(bias_shape, dtype=dtype)
+        self.bias.stop_gradient = True
+
+        residual_shape = [BS, MS, NS]
+        self.residual = paddle.randn(residual_shape, dtype=dtype)
+        self.residual.stop_gradient = True
+
+        mask_shape = [BS, MS, NS]
+        self.mask = paddle.randn(mask_shape, dtype=dtype)
+        self.mask.stop_gradient = True
+
+    def get_matmul_add_act(self):
+        B = pct.DimVar(BS)
+        M = pct.DimVar(MS)
+        K = pct.DimVar(KS)
+        N = pct.DimVar(NS)
+        T = pct.DTypeVar("T", DT)
+
+        def matmul_add_act(
+            x: pct.Tensor([B, M, K], T),
+            y: pct.Tensor([K, N], T),
+            b: pct.Tensor([B, M, N], T),
+        ):
+            out = paddle.matmul(x, y)
+            out = out + b
+            return paddle.nn.functional.relu(out)
+
+        return matmul_add_act
+
+    def get_matmul_add_divide_multiply_add(self):
+        B = pct.DimVar(BS)
+        M = pct.DimVar(MS)
+        K = pct.DimVar(KS)
+        N = pct.DimVar(NS)
+        T = pct.DTypeVar("T", DT)
+
+        def matmul_add_divide_multiply_add(
+            x: pct.Tensor([B, M, K], T),
+            y: pct.Tensor([K, N], T),
+            bias: pct.Tensor([N], T),
+            residual: pct.Tensor([B, M, N], T),
+            mask: pct.Tensor([B, M, N], T),
+        ):
+            out = paddle.matmul(x, y)
+            out = out + bias
+            # out = out / 1.2
+            out = out * mask
+            return residual + out
+
+        return matmul_add_divide_multiply_add
+
+    def check_if_ap_variadic_exists(self, fused_foo, foo_args):
+        generated_pir_program = GetPirProgram(fused_foo, foo_args)
+        assert (
+            "pd_op.ap_variadic" in generated_pir_program
+        ), "AP fusion failed: no pd_op.ap_variadic found in the PIR program."
+
+    def check_by_profiler(self, fused_foo, foo_args):
+        paddle.device.synchronize()
+
+        iters = 10
+        with profiler.Profiler(
+            targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
+            on_trace_ready=profiler.export_chrome_tracing("./profiler_log"),
+            timer_only=False,
+        ) as prof:
+            for _ in range(iters):
+                _ = fused_foo(*foo_args)
+                prof.step()
+        prof.summary(
+            sorted_by=profiler.SortedKeys.GPUTotal,
+            op_detail=True,
+            thread_sep=False,
+            time_unit="us",
+        )
+
+    def test_subgraph(self):
+        foo = self.get_matmul_add_act()
+        foo_args = (self.x, self.y, self.b)
+
+        # foo = self.get_matmul_add_divide_multiply_add()
+        # foo_args = (self.x, self.y, self.bias, self.residual, self.mask)
+
+        iluvatar_gpu_dir = Path(__file__).resolve().parent.parent.parent
+        fused_foo = pcc.compile(
+            foo,
+            ap_path=f"{iluvatar_gpu_dir}/apy/device",
+            backend_device="custom_device",
+        )
+
+        self.check_if_ap_variadic_exists(fused_foo, foo_args)
+        self.check_by_profiler(fused_foo, foo_args)
+
+        ap_outs = fused_foo(*foo_args)
+        dy_outs = foo(*foo_args)
+        for dy_out, ap_out in zip(dy_outs, ap_outs):
+            np.testing.assert_allclose(dy_out, ap_out, rtol=5e-2, atol=1e-1)
+
+
+if __name__ == "__main__":
+    unittest.main()
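+
+
+# A minimal eager-mode reference for the fused pattern exercised above, kept
+# here for readers only; `_reference_matmul_add_act` is an illustrative helper
+# and is not invoked by the test.
+def _reference_matmul_add_act(x, y, b):
+    # [B, M, K] @ [K, N] -> [B, M, N], then an elementwise add and ReLU --
+    # the epilogue that MatmulAddVariadic fuses into one CUTLASS kernel.
+    out = paddle.matmul(x, y)
+    out = out + b
+    return paddle.nn.functional.relu(out)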