From 755aef2254bc3ea7a2489c7ef1adc6103aba5457 Mon Sep 17 00:00:00 2001
From: denghaodong <leondeng78750@gmail.com>
Date: Tue, 17 Mar 2026 19:05:39 +0800
Subject: [PATCH 1/6] ap support customdevice

---
 backends/iluvatar_gpu/CMakeLists.txt          |  13 ++
 backends/iluvatar_gpu/test_ap/set_env_ap.sh   |  27 +++
 .../test_ap/test_matmul_epilogue.py           | 159 ++++++++++++++++++
 3 files changed, 199 insertions(+)
 create mode 100644 backends/iluvatar_gpu/test_ap/set_env_ap.sh
 create mode 100644 backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py

diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt
index 06512de6d6..8e0379fdf9 100644
--- a/backends/iluvatar_gpu/CMakeLists.txt
+++ b/backends/iluvatar_gpu/CMakeLists.txt
@@ -41,6 +41,8 @@ if(WITH_FLAGCX)
   add_definitions("-DPADDLE_WITH_FLAGCX")
   include(external/flagcx)
 endif()
+  
+add_definitions("-DPADDLE_WITH_CINN")
 
 set(PLUGIN_VERSION ${PADDLE_VERSION})
 set(PROTO_FILE "${PADDLE_SOURCE_DIR}/paddle/phi/core/external_error.proto")
@@ -817,6 +819,17 @@ list(
 file(
   GLOB_RECURSE CC_SRCS
   RELATIVE ${CMAKE_SOURCE_DIR}
+  # 增加ap依赖项
+  #${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/ap_variadic_kernel.cc
+  #${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/kernel_dispatch_helper.cc
+  #${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/ap_infer_meta_helper.cc
+  #${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/kernel_define_helper.cc
+  ${PADDLE_SOURCE_DIR}/paddle/ap/src/axpr/*.cc
+  ${PADDLE_SOURCE_DIR}/paddle/ap/src/fs/*.cc
+  ${PADDLE_SOURCE_DIR}/paddle/ap/src/code_module/*.cc
+  ${PADDLE_SOURCE_DIR}/paddle/ap/src/code_gen/*.cc
+  ${PADDLE_SOURCE_DIR}/paddle/ap/src/kernel_dispatch/*.cc
+  ${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/*.cc
   runtime/runtime.cc
   runtime/iluvatar_context.cc
   common/*.cc
diff --git a/backends/iluvatar_gpu/test_ap/set_env_ap.sh b/backends/iluvatar_gpu/test_ap/set_env_ap.sh
new file mode 100644
index 0000000000..fd7430883f
--- /dev/null
+++ b/backends/iluvatar_gpu/test_ap/set_env_ap.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+export FLAGS_prim_enable_dynamic=true 
+export FLAGS_prim_all=true
+
+# CINN related FLAG
+export FLAGS_use_cinn=false
+export FLAGS_group_schedule_tiling_first=true
+# PIR mode
+export FLAGS_enable_pir_api=true
+
+# print Program IR
+export FLAGS_print_ir=true
+
+# debug log
+export GLOG_v=0
+export GLOG_vmodule=ap_generic_drr_pass=6
+
+export CUDA_VISIBLE_DEVICES=11
+export FLAGS_enable_ap=1
+
+PADDLE_ROOT="${PADDLE_ROOT:-/path/to/your/paddle/build}"
+export PYTHONPATH="${PADDLE_ROOT}/python:$PYTHONPATH"
+export AP_WORKSPACE_DIR="/tmp/ap_workspace"
+export AP_PATH="${PADDLE_ROOT}/python/paddle/apy/sys:${PADDLE_ROOT}/python/paddle/apy/matmul_pass:$AP_PATH"
+
+python test_matmul_epilogue.py 2>&1 | tee output.log
diff --git a/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py b/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py
new file mode 100644
index 0000000000..7bc3bbd3ce
--- /dev/null
+++ b/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py
@@ -0,0 +1,159 @@
+import os
+import subprocess
+import unittest
+import time
+
+import numpy as np
+
+import paddle
+import paddle.incubate.cc as pcc
+import paddle.incubate.cc.typing as pct
+
+import paddle.profiler as profiler
+
+# os.environ["AP_WORKSPACE_DIR"] = "/tmp/paddle/ap"
+
+DT = 'float16'
+# BS = 4
+# MS = 65536
+# NS = 32
+# KS = 128
+# BS = 1
+# MS = 128
+# NS = 64
+# KS = 128
+# BS = 1
+# MS = 64
+# NS = 768
+# KS = 768
+# BS = 4
+# MS = 64
+# NS = 3072
+# KS = 768
+# BS = 4
+# MS = 128
+# NS = 32
+# KS = 128
+BS = 4
+MS = 784
+NS = 192
+KS = 768
+
+class TestMatmulEpilogue(unittest.TestCase):
+    def setUp(self):
+        dtype = DT
+        x_shape = [BS, MS, KS]
+        self.x = paddle.randn(x_shape, dtype=dtype)
+        self.x.stop_gradient = True
+
+        y_shape = [KS, NS]
+        self.y = paddle.randn(y_shape, dtype=dtype)
+        self.y.stop_gradient = True
+
+        b_shape = [BS, MS, NS]
+        self.b = paddle.randn(b_shape, dtype=dtype)
+        self.b.stop_gradient = True
+
+        b1_shape = [1]
+        self.b1 = paddle.randn(b1_shape, dtype=dtype)
+        self.b1.stop_gradient = True
+
+        bias_shape = [NS]
+        self.bias = paddle.randn(bias_shape, dtype=dtype)
+        self.bias.stop_gradient = True
+
+        residual_shape = [BS, MS, NS]
+        self.residual = paddle.randn(residual_shape, dtype=dtype)
+        self.residual.stop_gradient = True
+
+        mask_shape = [BS, MS, NS]
+        self.mask = paddle.randn(mask_shape, dtype=dtype)
+        self.mask.stop_gradient = True
+
+
+    def get_subgraph(self):
+        B = pct.DimVar(BS)
+        M = pct.DimVar(MS)
+        K = pct.DimVar(KS)
+        N = pct.DimVar(NS)
+        T = pct.DTypeVar("T", DT)
+
+        def matmul_add_act(
+            x: pct.Tensor([B, M, K], T),
+            y: pct.Tensor([K, N], T),
+            b: pct.Tensor([B, M, N], T),
+            b1: pct.Tensor([1], T),
+        ):
+
+            out = paddle.matmul(x, y)
+            out = out + b
+            # return paddle.nn.functional.sigmoid(out)
+            return paddle.nn.functional.relu(out)
+
+        #return matmul_add_act
+
+        def matmul_add_divide_multipy_add_S1(
+            x: pct.Tensor([B, M, K], T),
+            y: pct.Tensor([K, N], T),
+            bias: pct.Tensor([N], T),
+            residual: pct.Tensor([B, M, N], T),
+            mask: pct.Tensor([B, M, N], T),
+        ):
+            out = paddle.matmul(x, y)
+            out = out + bias
+            # out = out / 1.2
+            out = out * mask
+            return residual + out
+        
+        return matmul_add_divide_multipy_add_S1
+
+    def test_subgraph(self):
+        foo = self.get_subgraph()
+        fused_foo = pcc.compile(
+            foo, ap_path=f"{os.path.dirname(paddle.__file__)}/apy/matmul_pass"
+        )
+
+        # ap_outs = fused_foo(self.x, self.y, self.b, self.b1)
+        # dy_outs = foo(self.x, self.y, self.b, self.b1)
+        ap_outs = fused_foo(self.x, self.y, self.bias, self.residual, self.mask)
+        dy_outs = foo(self.x, self.y, self.bias, self.residual, self.mask)
+        #return
+
+
+        # -------- 性能测试部分 --------
+        iters = 10
+        # warmup
+        # _ = fused_foo(self.x, self.y, self.b, self.b1)
+        # _ = foo(self.x, self.y, self.b, self.b1)
+
+        # paddle.device.synchronize()
+        # start = time.time()
+        # # for _ in range(iters):
+        #     # _ = fused_foo(self.x, self.y, self.b, self.b1)
+        # paddle.device.synchronize()
+        # end = time.time()
+        # avg_time = (end - start) / iters
+        # print(f"[Performance] Avg latency per run: {avg_time:.6f} s")
+
+        # profiler (保存到 log_dir)
+        with profiler.Profiler(
+            targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
+            on_trace_ready=profiler.export_chrome_tracing("./profiler_log"),
+            timer_only = False
+        ) as prof:
+            for step in range(iters):
+                # _ = fused_foo(self.x, self.y, self.b, self.b1)
+                _ = fused_foo(self.x, self.y, self.bias, self.residual, self.mask)
+                # _ = foo(self.x, self.y, self.b, self.b1)
+                prof.step()
+        print("[Profiler] Trace saved to ./profiler_log")
+        prof.summary(sorted_by=profiler.SortedKeys.GPUTotal,
+             op_detail=True,
+             thread_sep=False,
+             time_unit='us')
+        
+        for dy_out, ap_out in zip(dy_outs, ap_outs):
+            np.testing.assert_allclose(dy_out, ap_out, rtol=5e-2, atol=1e-1)
+
+if __name__ == "__main__":
+    unittest.main()

From 4d7a04ae4f4e248c5675c34ecf270c5c887eaba4 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Mon, 13 Apr 2026 15:21:08 +0800
Subject: [PATCH 2/6] Add apy.

---
 .../apy/device/compile_command_util.py        |  50 ++
 .../apy/matmul/cutlass_matmul.cuh             | 290 ++++++++++++
 .../cutlass_patch/batched_matrix_coord.h      |  37 ++
 ...1_epilogue_vectorized_perwarp_variadic.hpp | 435 ++++++++++++++++++
 .../thread/linear_combination_variadic.h      | 407 ++++++++++++++++
 .../apy/matmul/cutlass_patch/trace_device.h   |  73 +++
 .../apy/matmul/default_config_id.h            |  29 ++
 backends/iluvatar_gpu/apy/matmul/matmul.h     |  39 ++
 backends/iluvatar_gpu/apy/matmul/params.h     | 177 +++++++
 9 files changed, 1537 insertions(+)
 create mode 100644 backends/iluvatar_gpu/apy/device/compile_command_util.py
 create mode 100644 backends/iluvatar_gpu/apy/matmul/cutlass_matmul.cuh
 create mode 100644 backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h
 create mode 100644 backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp
 create mode 100644 backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h
 create mode 100644 backends/iluvatar_gpu/apy/matmul/cutlass_patch/trace_device.h
 create mode 100644 backends/iluvatar_gpu/apy/matmul/default_config_id.h
 create mode 100644 backends/iluvatar_gpu/apy/matmul/matmul.h
 create mode 100644 backends/iluvatar_gpu/apy/matmul/params.h

diff --git a/backends/iluvatar_gpu/apy/device/compile_command_util.py b/backends/iluvatar_gpu/apy/device/compile_command_util.py
new file mode 100644
index 0000000000..18741ca805
--- /dev/null
+++ b/backends/iluvatar_gpu/apy/device/compile_command_util.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ap
+
+
+class CompileCommandGenerator:
+    def __init__(self):
+        self.file_ext = "cu"
+        self.op_type2generate_func = ap.OrderedDict(
+            [
+                ['matmul', self.generate_matmul_compile_command],
+            ]
+        )
+
+    def __call__(self, op_type, tpl_dirname, library_name):
+        return self.op_type2generate_func[op_type](tpl_dirname, library_name)
+
+    def generate_matmul_compile_command(self, tpl_dirname, library_name):
+        parent_dir = ap.dirname(ap.dirname(__file__))
+        cutlass_dir = f"{parent_dir}/matmul/cutlass"
+        matmul_source_dir = f"{parent_dir}/matmul"
+
+        compile_cmd = "clang++ -x ivcore -L/usr/local/corex/lib -lcudart --cuda-path=/usr/local/corex -std=c++17 -O3 -fPIC --cuda-gpu-arch=ivcore11 -Xclang=-fcuda-allow-variadic-functions"
+        compile_cmd = compile_cmd + " -I " + cutlass_dir + "/include"
+        compile_cmd = compile_cmd + " -I " + cutlass_dir + "/tools/util/include"
+        compile_cmd = compile_cmd + " -I " + matmul_source_dir
+        compile_cmd = (
+            compile_cmd
+            + " -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -DCUTLASS_DEBUG_TRACE_LEVEL=0 -DCUTLASS_ILUVATAR"
+        )
+        compile_cmd = (
+            compile_cmd + " -DAP_ENABLE_AUTOTUNE=0 -DAP_ENABLE_DEBUG=0"
+        )
+        compile_cmd = (
+            compile_cmd
+            + f" --shared {library_name}.{self.file_ext} -o lib{library_name}.so"
+        )
+        return compile_cmd
diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_matmul.cuh b/backends/iluvatar_gpu/apy/matmul/cutlass_matmul.cuh
new file mode 100644
index 0000000000..f48822fce2
--- /dev/null
+++ b/backends/iluvatar_gpu/apy/matmul/cutlass_matmul.cuh
@@ -0,0 +1,290 @@
+// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cuda_fp16.h>
+#include <cuda.h>
+#include <cuda_bf16.h>
+
+#include "cute/atom/mma_traits.hpp"
+#include "cute/numeric/integral_constant.hpp"
+#include "cute/tensor.hpp"
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/device_memory.h"
+
+#include "cutlass/detail/layout.hpp"
+
+#include "cute/atom/copy_traits_ix11_sme.hpp"
+#include "cute/atom/copy_traits_ix11.hpp"
+#include "cute/atom/copy_atom.hpp"
+
+#include "cutlass/arch/config.h"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+
+#include "cutlass_patch/batched_matrix_coord.h"
+#include "cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp"
+#include "cutlass_patch/epilogue/thread/linear_combination_variadic.h"
+
+#include "default_config_id.h"
+#include "params.h"
+
+#define CHECK_CUTLASS(status)                                             \
+  {                                                                       \
+    cutlass::Status error = status;                                       \
+    if (error != cutlass::Status::kSuccess) {                             \
+      std::cerr << "Got cutlass error: " << cutlassGetStatusString(error) \
+                << " at: " << __LINE__ << std::endl;                      \
+      exit(EXIT_FAILURE);                                                 \
+    }                                                                     \
+  }
+
+namespace ap {
+using bfloat16 = nv_bfloat16;
+
+template <typename T, int N>
+using Array = cutlass::Array<T, N>;
+
+using MatrixCoord = cutlass::BatchedMatrixCoord;
+
+// Convert CUDA data type to cutlass data type
+template <typename T>
+struct CutlassDataType {
+  using Type = T;
+};
+
+template <>
+struct CutlassDataType<half> {
+  using Type = cutlass::half_t;
+};
+
+template <>
+struct CutlassDataType<__nv_bfloat16> {
+  using Type = cutlass::bfloat16_t;
+};
+
+// Convert to cutlass layout
+template <bool Transposed>
+struct MatrixLayout {
+  using Type = cutlass::layout::RowMajor;
+};
+
+template <>
+struct MatrixLayout<true> {
+  using Type = cutlass::layout::ColumnMajor;
+};
+
+static cutlass::gemm::GemmUniversalMode GetGemmMode(int batch_count) {
+  return batch_count > 1 ? cutlass::gemm::GemmUniversalMode::kBatched
+                         : cutlass::gemm::GemmUniversalMode::kGemm;
+}
+
+template <typename ElementT,
+          typename ElementComputeT,
+          template <typename T>
+          class VariadicFunctor,
+          int AlignA = 64 / sizeof(ElementT),
+          int AlignB = 64 / sizeof(ElementT),
+          int ConfigId = 0>
+void MatmulAddVariadic(
+    const GemmEpilogueParams &params,
+    const typename VariadicFunctor<ElementComputeT>::Arguments &variadic_args) {
+  
+  using namespace cute;
+
+  using ElementAccumulator =
+      typename CutlassDataType<ElementComputeT>::Type;  // <- data type of
+                                                        // accumulator
+  using ElementComputeEpilogue =
+      ElementAccumulator;  // <- data type of epilogue operations
+  using ElementA =
+      typename CutlassDataType<ElementT>::Type;  // <- data type of elements in
+                                                 // input matrix A
+  using ElementB =
+      typename CutlassDataType<ElementT>::Type;  // <- data type of elements in
+                                                 // input matrix B
+  using ElementC = ElementA;
+  using ElementD = ElementA;
+  using ElementOutput =
+      typename CutlassDataType<ElementT>::Type;  // <- data type of elements in
+                                                 // output matrix D
+
+  constexpr int AlignC = AlignB;
+
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::RowMajor;
+  using LayoutC = cutlass::layout::RowMajor;
+  using LayoutD = cutlass::layout::RowMajor;
+
+  using EpilogueThreadOp = cutlass::epilogue::thread::LinearCombinationVariadic<
+      VariadicFunctor, ElementD, 1, ElementAccumulator, ElementAccumulator>;
+
+  // Epilogue --------------------------------
+  // Params
+  // Tile of C/D
+  using EpilogueTile = Tile<
+    Layout<Shape<_16,_4>,Stride<_1,_64>>,
+    Layout<Shape<_32,_4>,Stride<_1,_64>>
+  >;
+
+  // Layout of smem
+  using SmemLayout = 
+    ComposedLayout<
+      Swizzle<1,1,6>,
+      _0,
+      Layout<
+        Shape <Shape < _4,_2,   _2,  _4>,Shape <_16,  _2,   _4>>,
+        Stride<Stride<_32,_1,_4096,_256>,Stride< _2,_128,_1024>>
+      >
+    >;
+
+  // G2R
+  using EpiG2RAtom = Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<32>, ElementC>;
+
+  // R2S
+  using EpiR2SAtom = Copy_Atom<UniversalCopy<uint32_t>, ElementD>;
+
+  // S2R
+  using EpiS2RAtom = EpiR2SAtom;
+  using TiledCopyS2R =
+    TiledCopy<
+      EpiS2RAtom,
+      Layout<
+        Shape <Shape < _16,_4, _4,   _4>,Shape <_2, _2,_2>>,
+        Stride<Stride<_128,_1,_16,_2048>,Stride<_4,_64,_8>>>,
+      decltype(product_each(shape(EpilogueTile{})))
+    >;
+
+  // R2R
+  using CopyAtomR2R = Copy_Atom<U16_2x2Trans,ElementD>;
+
+  // R2G
+  using EpiR2GAtom = Copy_Atom<UniversalCopy<uint32_t>, ElementD>;
+
+  // Epilogue Collective
+  using CollectiveEpilogue = cutlass::epilogue::collective::EpilogueVariadic<
+      // cutlass::detail::TagToStrideC_t<LayoutC>,
+      cutlass::detail::TagToStrideC_t<LayoutD>,
+      EpilogueThreadOp,
+      EpilogueTile,
+      SmemLayout,
+      EpiG2RAtom,
+      EpiR2SAtom,
+      TiledCopyS2R,
+      CopyAtomR2R,
+      EpiR2GAtom,
+      cutlass::epilogue::EpilogueSimtVectorized>;
+
+  using CtaTiler = Shape<_256,_256,_32>;
+  using TiledMma = 
+    TiledMMA<
+      MMA_Atom<IX11_16x16x16_F32F16F16F32_TT>,
+      Layout<Shape<_4,_4,_1>>,
+      Tile<
+        Layout<Shape<_16,_4,_4>, Stride<_1,_64,_16>>,
+        Layout<Shape<_16,_4,_4>, Stride<_1,_64,_16>>,
+        _16
+      >
+    >;
+
+  using CopyA_Op = IX11_SME_I_16x512b<IX11::SMESwizzle::Row16b, IX11::CacheOP::CacheAll>;
+  using CopyA_Atom = Copy_Atom<Copy_Traits<CopyA_Op>, ElementA>;
+  using CopyA = decltype(make_tiled_copy(CopyA_Atom{0}, Layout<Shape<_16,_1>>{}, Layout<Shape<_16,_32>,Stride<_1,_16>>{}));
+  using SmemLayoutAtomA = IX11::Layout_SME_I_16x512b_Atom<IX11::SMESwizzle::Row16b, ElementA, IX11::Major::K>;
+  using SmemCopyA = Copy_Atom<UniversalCopy<uint32_t>, ElementA>;
+
+  using CopyB_Op = IX11_SME_I_16x512b<IX11::SMESwizzle::Col, IX11::CacheOP::CacheAll>;
+  using CopyB_Atom = Copy_Atom<Copy_Traits<CopyB_Op>, ElementB>;
+  using CopyB = decltype(make_tiled_copy(CopyB_Atom{0}, Layout<Shape<_16,_1>>{}, Layout<Shape<_16,_32>,Stride<_1,_16>>{}));
+  using SmemLayoutAtomB = IX11::Layout_SME_I_16x512b_Atom<IX11::SMESwizzle::Col, ElementB, IX11::Major::K>;
+  using SmemCopyB = Copy_Atom<UniversalCopy<uint32_t>, ElementB>;
+
+  using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma<
+    cutlass::gemm::MainloopIx11SmeUnpredicated<2>,
+    CtaTiler,
+    ElementA, cutlass::detail::TagToStrideA_t<LayoutA>,
+    ElementB, cutlass::detail::TagToStrideB_t<LayoutB>,
+    TiledMma,
+    CopyA, SmemLayoutAtomA, SmemCopyA, cute::identity,
+    CopyB, SmemLayoutAtomB, SmemCopyB, cute::identity
+  >;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int,int,int,int>,
+      CollectiveMainloop,
+      CollectiveEpilogue
+  >;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+
+  using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
+
+  ProblemShapeType problem_shape{params.m, params.n, params.k, params.batch_count};
+  
+  const ElementA *input =
+      reinterpret_cast<const ElementA *>(params.input);
+  const ElementB *weight =
+      reinterpret_cast<const ElementB *>(params.weight);
+  ElementOutput *output = reinterpret_cast<ElementOutput *>(params.output);
+  
+  cutlass::KernelHardwareInfo hw_info;
+  // TODO
+  // Change device_id to another value if you are running on a machine with multiple GPUs and wish
+  // to use a GPU other than that with device ID 0.
+  hw_info.device_id = 0;
+  hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+
+  typename EpilogueThreadOp::Params epilogue_op_args;
+  epilogue_op_args.variadic_args = variadic_args;
+
+  int m = params.m, n = params.n, k = params.k, l = params.batch_count;
+  
+  auto stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, l));
+  auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, l));
+  auto stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(m, n, l));
+
+  typename Gemm::Arguments arguments{
+    cutlass::gemm::GemmUniversalMode::kGemm,
+    problem_shape,
+    {input, stride_A, weight, stride_B},
+    {epilogue_op_args, output, stride_D},
+    hw_info
+  };
+
+  Gemm device_gemm;
+
+  size_t workspace_size = Gemm::get_workspace_size(arguments);
+  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+  cudaStream_t* stream_ptr = reinterpret_cast<cudaStream_t*>(params.stream_ptr);
+
+  CHECK_CUTLASS(device_gemm.can_implement(arguments));
+  CHECK_CUTLASS(device_gemm.initialize(arguments, workspace.get(), *stream_ptr));
+  CHECK_CUTLASS(device_gemm(*stream_ptr));
+
+}
+
+}  // namespace ap
diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h
new file mode 100644
index 0000000000..544670b8ad
--- /dev/null
+++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+namespace cutlass {
+
+struct BatchedMatrixCoord {
+  int batch;
+  int row;
+  int column;
+  bool is_valid;
+
+  CUTLASS_HOST_DEVICE
+  BatchedMatrixCoord() : batch(0), row(0), column(0), is_valid(false) {}
+
+  CUTLASS_HOST_DEVICE
+  BatchedMatrixCoord(int b, int r, int c) : batch(b), row(r), column(c), is_valid(true) {}
+
+  CUTLASS_HOST_DEVICE
+  BatchedMatrixCoord(int b, int r, int c, bool valid) : batch(b), row(r), column(c), is_valid(valid) {}
+};
+
+};  // namespace cutlass
diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp
new file mode 100644
index 0000000000..a19167bf3a
--- /dev/null
+++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp
@@ -0,0 +1,435 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing elementwise operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  // class StrideC,
+  class StrideD,
+  class ThreadEpilogueOp,
+  class EpilogueTile,
+  class SmemLayout,
+  class CopyAtomG2R_,
+  class CopyAtomR2S,
+  class TiledCopyS2R,
+  class CopyAtomR2R,
+  class CopyAtomR2G,
+  class EpilogueScheduleType = EpilogueSimtVectorized,
+  class Enable = void
+>
+class EpilogueVariadic {
+  static_assert(cute::is_same_v<EpilogueScheduleType, EpilogueSimtVectorized> ||
+                cute::is_same_v<EpilogueScheduleType, EpiloguePtrArraySimtVectorized>,
+                "Could not find an epilogue specialization.");
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Iluvatar ix11/ix30 Epilogue Vectorized
+/// 1. Load C(optional), compute D = alpha * Acc + beta * C, 
+///   then transform to ElementD(8/16-bit typically)
+/// 2. Copy D from Register to Smem
+/// 3. Load D from Smem to Register
+/// 4. Register-to-Register transform
+/// 5. Store D from Register to Gmem
+///
+/// Support alpha and beta params, bias not supported yet.
+template <
+  // class StrideC_,
+  class StrideD_,
+  class ThreadEpilogueOp_,
+  class EpilogueTile_,
+  class SmemLayout_,
+  class CopyAtomG2R_,
+  class CopyAtomR2S_,
+  class TiledCopyS2R_,
+  class CopyAtomR2R_,
+  class CopyAtomR2G_,
+  class EpilogueScheduleType_
+>
+class EpilogueVariadic<
+        // StrideC_,
+        StrideD_,
+        ThreadEpilogueOp_,
+        EpilogueTile_,
+        SmemLayout_,
+        CopyAtomG2R_,
+        CopyAtomR2S_,
+        TiledCopyS2R_,
+        CopyAtomR2R_,
+        CopyAtomR2G_,
+        EpilogueScheduleType_,
+        cute::enable_if_t<
+          cute::is_same_v<EpilogueScheduleType_, EpilogueSimtVectorized>
+        >
+      > {
+public:
+  //
+  // Type Aliases
+  //
+  // derived types of output thread level operator
+  using ThreadEpilogueOp = ThreadEpilogueOp_;
+  using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator;
+  using ElementCompute = typename ThreadEpilogueOp::ElementCompute;
+  using ElementScalar = ElementCompute;
+  using ElementOutput = typename ThreadEpilogueOp::ElementOutput;
+  using ElementD = typename ThreadEpilogueOp::ElementOutput;
+  using StrideD = StrideD_;
+  using ElementC = ElementD; // for GemmUniversal
+  using StrideC = StrideD;
+  using ElementBias = typename detail::IsThreadEpilogueOpWithBias<ThreadEpilogueOp>::type;
+  using EpilogueTile = EpilogueTile_;
+  using SmemLayout   = SmemLayout_;
+  using CopyAtomG2R = CopyAtomG2R_;
+  using CopyAtomR2S  = CopyAtomR2S_;
+  using TiledCopyS2R = TiledCopyS2R_;
+  using CopyAtomR2R  = CopyAtomR2R_;
+  using CopyAtomR2G  = CopyAtomR2G_;
+
+  // using GmemTiledCopyC = void;
+  using GmemTiledCopyD = CopyAtomR2G;
+
+  static constexpr bool IsEpilogueBiasSupported = detail::IsThreadEpilogueOpWithBias<ThreadEpilogueOp>::value;
+  using StrideBias = cute::conditional_t<detail::is_m_major<StrideD>(), Stride<_1,_0,int64_t>, Stride<_0,_1,int64_t>>;
+
+  // static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+  static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+
+  struct SharedStorage
+  {
+    cute::array_aligned<ElementD, cute::cosize_v<SmemLayout>> smem_epilogue;
+  };
+
+  struct Arguments {
+    typename ThreadEpilogueOp::Params epilogue_op{};
+    // using StrideBias = decltype(thread.dBias);
+    // ElementC const* ptr_C = nullptr;
+    // StrideC dC{};
+    ElementD* ptr_D = nullptr;
+    StrideD dD{};
+  };
+
+  // Device side epilogue params
+  template<class ThreadEpiOp>
+  struct ParamsType {
+    typename ThreadEpiOp::Params epilogue_op{};
+    ElementD* ptr_D = nullptr;
+    StrideD dD{};
+  };
+
+  using Params = ParamsType<ThreadEpilogueOp>;
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(
+      [[maybe_unused]] ProblemShape const& _,
+      Arguments const& args,
+      [[maybe_unused]] void* workspace) {
+
+    return {
+        args.epilogue_op,
+        args.ptr_D,
+        args.dD,
+    };
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  template <class ProblemShape>
+  static bool
+  can_implement(
+      [[maybe_unused]] ProblemShape const& problem_shape,
+      [[maybe_unused]] Arguments const& args) {
+    return true;
+  }
+
+  CUTLASS_HOST_DEVICE
+  EpilogueVariadic(Params const& params_)
+      : params(params_), epilogue_op(params_.epilogue_op) { }
+
+  CUTLASS_DEVICE
+  bool
+  is_source_needed() {
+    return epilogue_op.is_source_needed();
+  }
+
+  template<
+    class ProblemShapeMNKL,
+    class BlockShapeMNK,
+    class BlockCoordMNKL,
+    class FrgEngine, class FrgLayout,
+    class TiledMma,
+    class ResidueMNK
+  >
+  CUTLASS_DEVICE void
+  operator()(
+      ProblemShapeMNKL problem_shape_mnkl,
+      BlockShapeMNK blk_shape_MNK,
+      BlockCoordMNKL blk_coord_mnkl,
+      cute::Tensor<FrgEngine,FrgLayout> const& accumulators,                   // (MMA,MMA_M,MMA_N)
+      TiledMma tiled_mma,
+      ResidueMNK residue_mnk,
+      int thread_idx,
+      char* smem_buf) {
+    using namespace cute;
+    using X = Underscore;
+
+    static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
+    static_assert(is_static<BlockShapeMNK>::value, "ThreadBlock tile shape must be static");
+    static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3");
+    static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3");
+
+    static_assert(rank(EpilogueTile{}) == 2, "Rank of EpilogueTile should be 2");
+    static_assert(rank(SmemLayout{}) == 2, "Rank of SmemLayout should be 2");
+    CUTE_STATIC_ASSERT(size(SmemLayout{}) == size<0>(EpilogueTile{}) * size<1>(EpilogueTile{}));
+
+    // Separate out problem shape for convenience
+    auto M = get<0>(problem_shape_mnkl);
+    auto N = get<1>(problem_shape_mnkl);
+    auto L = get<3>(problem_shape_mnkl);
+
+    // Represent the full output tensor
+    // Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), params.dC);             //             (m,n,l)
+    Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), params.dD);             //             (m,n,l)
+    // Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_Bias), make_shape(M,N,L), params.dBias);    //             (m,n,l)
+
+    // Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});             // (BLK_M,BLK_N,m,n,l)
+    Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});             // (BLK_M,BLK_N,m,n,l)
+    // Tensor gBias_mnl = local_tile(mBias_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});       // (BLK_M,BLK_N,m,n,l)
+
+    // Slice to get the tile this CTA is responsible for
+    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl;
+    // Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord);                                                   // (BLK_M,BLK_N)
+    Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord);                                                   // (BLK_M,BLK_N)
+    // Tensor gBias = gBias_mnl(_,_,m_coord,n_coord,l_coord);                                             // (BLK_M,BLK_N)
+
+    // Construct a tensor in SMEM that we can partition for rearranging data
+    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    // Common part (with sm70 epilogue) end--------------------------------
+    // init register transform
+    // TV -> MN(logical)
+    auto layout_gd_tv = tiled_mma.thrfrg_C(make_layout(gD.shape()));
+
+    // G2R / R2S -------------------------------------
+    // In G2R / R2S tiling, we assemble the final V->MN layout out from given epilogue tile and mma layout
+
+    // epilogue tile of logical C
+    auto tE_D  = flat_divide(make_layout(product_each(gD.shape())), EpilogueTile{});
+    
+    using EpilogueTileSize = decltype(product_each(shape(EpilogueTile{})));
+
+    Tensor sAcc = make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{});
+
+    auto r2s_mn_tiler_tv = take<0,2>(right_inverse(layout_gd_tv).compose(tE_D)); // epilogue logical mn->tv
+    // get tiler of TV from tiler of MN
+    auto r2s_tv_tiler_shape = product_each(layout_gd_tv.shape());
+    auto r2s_tv_zeros = repeat_like(r2s_tv_tiler_shape, Int<0>{}); // T and V
+    auto r2s_tv_tiler = cute::transform(make_seq<rank(r2s_tv_tiler_shape)>{}, [&](auto i) {
+      auto tiler_origin = filter(composition(make_layout(r2s_tv_tiler_shape, replace<i>(r2s_tv_zeros, Int<1>{})), r2s_mn_tiler_tv));
+      auto complemented = complement(tiler_origin, get<i>(r2s_tv_tiler_shape));
+      return cute::layout<1>(
+        // divide complement to make it monotonic
+        zipped_divide(make_layout(shape<i>(r2s_tv_tiler_shape)), complemented));
+    });
+
+    auto r2s_TV_D_tiler = left_inverse(take<0,2>(tE_D)).compose( // tv -> mn in epilogue tiler
+      layout_gd_tv.compose( // tv -> mn in C
+        make_layout(r2s_tv_tiler_shape).compose(r2s_tv_tiler))); // tv_tiler -> tv coord
+
+    // G2R ----------------------------------------
+    using TiledCopyG2R = TiledCopy<CopyAtomG2R, decltype(r2s_TV_D_tiler), EpilogueTileSize>;
+    auto thread_g2r = TiledCopyG2R::get_slice(thread_idx);
+    // auto tEgD = flat_divide(gC, EpilogueTile{});
+    // auto tGR_gC = thread_g2r.partition_S(tEgD);
+    auto cC = make_identity_tensor(make_shape(size<0>(gD), size<1>(gD))); // same shape as gC/gD
+    auto cCt = flat_divide(cC, EpilogueTile{});                           
+    auto tRS_cC = thread_g2r.partition_S(cCt);
+    
+    // R2S ----------------------------------------
+    using TiledCopyR2S = TiledCopy<CopyAtomR2S, decltype(r2s_TV_D_tiler), EpilogueTileSize>;
+    auto tiled_r2s = TiledCopyR2S{};
+
+    auto tile_RS_R = 
+      cute::layout<1>( // just V
+        make_layout(
+          make_shape(typename TiledCopyR2S::TiledNumThr{},typename TiledCopyR2S::TiledNumVal{}),make_stride(_0{},_1{}))
+            .compose(group<1,3>(                            // V_copy -> V_fragment
+              right_inverse(layout_gd_tv)                   // TV_copy -> TV_fragment
+                .compose(tiled_r2s.tidfrg_S(tE_D)))))
+                  (_,repeat<rank_v<decltype(tE_D)>>(_));    // TV_copy -> MN
+
+    auto thread_r2s = tiled_r2s.get_slice(thread_idx);
+    auto tErAcc = make_tensor(accumulators.data(), tile_RS_R);
+    auto tRS_sAcc = thread_r2s.partition_D(sAcc);
+
+    // S2R ----------------------------------------
+    auto tiled_s2r = TiledCopyS2R{};
+    auto thread_s2r = tiled_s2r.get_slice(thread_idx);
+    auto tSR_sAcc = thread_s2r.partition_S(sAcc);
+    auto tSR_rAcc = make_tensor<ElementD>(make_layout(tSR_sAcc.shape()));
+
+    // R2R --------------------------------------------
+    auto tiled_r2r = 
+      make_tiled_copy(CopyAtomR2R{},typename CopyAtomR2R::ThrID{},make_layout(typename TiledCopyR2S::TiledNumVal{}));
+    auto thread_r2r = tiled_r2r.get_slice(thread_idx);
+    auto tRR_rAcc = make_tensor<ElementD>(tSR_rAcc.shape());
+    auto tRR_rSrc = thread_r2r.retile_S(tSR_rAcc);
+    auto tRR_rDst = thread_r2r.retile_D(tRR_rAcc);
+
+    // R2G -------------------------------------------
+    auto r2r_vlayout = filter(cute::layout<1>(tiled_r2r.get_layoutS_TV()).compose(cute::layout<1>(tiled_r2r.get_layoutD_TV())));
+    auto r2g_tv_layout = (typename TiledCopyS2R::TiledLayout_TV{}).compose(_,r2r_vlayout);
+    auto tiled_r2g = TiledCopy<CopyAtomR2G, decltype(r2g_tv_layout), EpilogueTileSize>{};
+    auto thread_r2g = tiled_r2g.get_slice(thread_idx);
+
+    auto tRG_rAcc = thread_r2g.retile_S(tRR_rDst);
+    auto tEgD = flat_divide(gD, EpilogueTile{});
+    auto tRG_gD = thread_r2g.partition_D(tEgD);
+
+    // Repeat the D-partitioning for coordinates and predication
+    Tensor cD   = make_identity_tensor(make_shape(size<0>(gD),size<1>(gD))); // (BLK_M,BLK_N) -> (blk_m,blk_n)
+    Tensor cDt  = flat_divide(cD, EpilogueTile{});                           // (SMEM_M,SMEM_N,TILE_M,TILE_N)
+    Tensor tRG_cD = thread_r2g.partition_D(cDt);
+
+#if 0
+    if (thread_idx == 0 && m_coord == 0 && n_coord == 0) {
+      print("aC   : "); print(accumulators.layout()); print("\n");
+      // print("gC   : "); print(gC.layout()); print("\n");
+      print("gD   : "); print(gD.layout()); print("\n");
+      // print("gBias   : "); print(gBias.layout()); print("\n");
+      print("sAcc   : "); print(sAcc.layout()); print("\n");
+      // print("rAcc   : "); print(rAcc.layout()); print("\n");
+      print("\n");
+      // print("tRS_rAcc : "); print(tRS_rAcc.layout()); print("\n");
+      print("tRS_sAcc : "); print(tRS_sAcc.layout()); print("\n");
+      print("\n");
+      print("tSR_sAcc : "); print(tSR_sAcc.layout()); print("\n");
+      print("tSR_rAcc : "); print(tSR_rAcc.layout()); print("\n");
+      print("\n");
+      print("tRR_rSrc : "); print(tRR_rSrc.layout()); print("\n");
+      print("tRR_rDst : "); print(tRR_rDst.layout()); print("\n");
+      print("\n");
+      print("tRG_rAcc : "); print(tRG_rAcc.layout()); print("\n");
+      print("tRG_gD   : "); print(tRG_gD.layout()); print("\n");
+      print("tRS_cC   : "); print(tRS_cC.layout()); print("\n");
+      print("cCt      : "); print(cCt.layout()); print("\n");
+      print("cD       : "); print(cD.layout()); print("\n");
+      print("tE_D     : "); print(tE_D.layout()); print("\n");
+      print("sAcc     : "); print(sAcc.layout()); print("\n");
+      print("tErAcc    : "); print(tErAcc.layout()); print("\n");
+    }
+#endif
+    CUTLASS_PRAGMA_UNROLL
+    for (int epi_tile_m = 0; epi_tile_m < size<2>(tEgD).value; ++epi_tile_m) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int epi_tile_n = 0; epi_tile_n < size<3>(tEgD).value; ++epi_tile_n) {
+        Tensor tRS_rAccmn = tErAcc(_,_,_,epi_tile_m,epi_tile_n); // ((2, (2, 2)), 1, 1)
+        Tensor cC_mn = tRS_cC(_,_,_,epi_tile_m, epi_tile_n);
+        Tensor tRS_rD = make_tensor_like<ElementD>(tRS_rAccmn);
+                        
+        int m_cta_coord_base = m_coord * size<0>(gD);
+        int n_cta_coord_base = n_coord * size<1>(gD);
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < size(tRS_rAccmn); ++i) {
+          auto cta_coord = cC_mn(i);
+          tRS_rD(i) = epilogue_op(tRS_rAccmn(i),
+                                  l_coord, 
+                                  m_cta_coord_base + get<0>(cta_coord), 
+                                  n_cta_coord_base + get<1>(cta_coord),
+                                  elem_less(cta_coord, take<0,2>(residue_mnk)));
+        }
+
+        copy(CopyAtomR2S{}, tRS_rD, tRS_sAcc);
+        copy(TiledCopyS2R{}, tSR_sAcc, tSR_rAcc);
+        copy(CopyAtomR2R{}, tRR_rSrc, tRR_rDst);
+
+        Tensor tRG_gDmn = tRG_gD(_,_,_,epi_tile_m,epi_tile_n);
+        Tensor tRG_cDmn = tRG_cD(_,_,_,epi_tile_m,epi_tile_n);
+        CUTLASS_PRAGMA_UNROLL
+        for (int atom_i = 0; atom_i < size<0,1>(tRR_rAcc); ++atom_i) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int m = 0; m < size<1>(tRR_rAcc); ++m) {
+            CUTLASS_PRAGMA_UNROLL
+
+            for (int n = 0; n < size<2>(tRR_rAcc); ++n) {
+              
+              if (elem_less(tRG_cDmn(make_coord(0,atom_i),m,n), take<0,2>(residue_mnk))) {
+                copy(CopyAtomR2G{}, tRG_rAcc(make_coord(_,atom_i),m,n), tRG_gDmn(make_coord(_,atom_i),m,n));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+private:
+  Params params;
+  ThreadEpilogueOp epilogue_op;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace collective
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h
new file mode 100644
index 0000000000..1acd9cd667
--- /dev/null
+++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h
@@ -0,0 +1,407 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*! \file
+  \brief Functor performing linear combination operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass_patch/batched_matrix_coord.h"
+#include "cutlass_patch/trace_device.h"
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+template <class VariadicOp, class = void>
+struct GenericVariadicTraits {
+  static constexpr bool IsArgumentsNeeded = false;
+  struct Arguments {};
+};
+
+template <class VariadicOp>
+struct GenericVariadicTraits<VariadicOp,
+                             decltype(typename VariadicOp::Arguments(),
+                                      void())> {
+  static constexpr bool IsArgumentsNeeded = true;
+  using Arguments = typename VariadicOp::Arguments;
+};
+
+/// Applies a linear combination operator to an array of elements.
+///
+/// D = VariadicOp(alpha * accumulator + beta * source)
+///
+template <
+    template <typename T>
+    class VariadicOp,
+    typename ElementOutput_,  ///< Data type used to load and store tensors
+    int ElementsPerAccess,    ///< Number of elements computed per operation.
+                            ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+                            ///< but we use 64 or 32 sometimes when there are
+                            ///< not enough data to store
+    typename ElementAccumulator_ = ElementOutput_,  ///< Accumulator data type
+    typename ElementCompute_ =
+        ElementOutput_,  ///< Data type used to compute linear combination
+    ScaleType::Kind Scale =
+        ScaleType::Default,  ///< Control Alpha and Beta scaling
+    FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
+    bool IsHeavy = false>
+class LinearCombinationVariadic {
+ public:
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementC = ElementOutput_;
+  using ElementD = ElementOutput_;
+
+  using VariadicArguments =
+      typename GenericVariadicTraits<VariadicOp<ElementCompute>>::Arguments;
+
+  static bool const kIsHeavy = IsHeavy;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kCount = ElementsPerAccess;
+  static const ScaleType::Kind kScale = Scale;
+
+  using FragmentOutput = Array<ElementOutput, kElementsPerAccess>;
+  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentSource = Array<ElementOutput, kElementsPerAccess>;
+  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
+
+  static FloatRoundStyle const kRound = Round;
+
+  /// Host-constructable parameters structure
+  struct Params {
+    ElementCompute alpha;             ///< scales accumulators
+    ElementCompute beta;              ///< scales source tensor
+    ElementCompute const *alpha_ptr;  ///< pointer to accumulator scalar - if
+                                      ///< not null, loads it from memory
+    ElementCompute const *beta_ptr;   ///< pointer to source scalar - if not
+                                      ///< null, loads it from memory
+    VariadicArguments variadic_args;
+
+    CUTLASS_HOST_DEVICE
+    Params()
+        : alpha(ElementCompute(1)),
+          beta(ElementCompute(0)),
+          alpha_ptr(nullptr),
+          beta_ptr(nullptr) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(ElementCompute alpha,
+           ElementCompute beta,
+           VariadicArguments variadic_args_ = VariadicArguments{})
+        : alpha(alpha),
+          beta(beta),
+          alpha_ptr(nullptr),
+          beta_ptr(nullptr),
+          variadic_args(variadic_args_) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  Params params_;
+  bool skip_elementwise_;
+
+ public:
+  /// Constructs the function object, possibly loading from pointers in host
+  /// memory
+  CUTLASS_HOST_DEVICE
+  LinearCombinationVariadic(Params const &params) {
+    params_ = params;
+    params_.alpha = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    params_.beta = (params.beta_ptr ? *params.beta_ptr : params.beta);
+    skip_elementwise_ = false;
+  }
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    if (Scale == ScaleType::NoBetaScaling)
+      return params_.beta != ElementCompute(0);
+
+    if (Scale == ScaleType::OnlyAlphaScaling) return false;
+
+    if (Scale == ScaleType::Nothing) return false;
+
+    return params_.beta != ElementCompute(0);
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {
+      params_.beta = ElementCompute(1);
+    }
+
+    if (k_partition != k_partition_count - 1) {
+      skip_elementwise_ = true;
+    }
+  }
+
+  /// Computes linear scaling with source: D = alpha * accumulator + beta *
+  /// source
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(FragmentAccumulator const &accumulator,
+                            FragmentSource const &source,
+                            int batch,
+                            int row_offset,
+                            int column_offset,
+                            bool valid) const {
+    CUTLASS_TRACE_DEVICE(
+        "kElementsPerAccess: %d, row_offset: %d, column_offset: %d",
+        kElementsPerAccess,
+        row_offset,
+        column_offset);
+
+    // Convert source to internal compute numeric type
+    NumericArrayConverter<ElementCompute,
+                          ElementOutput,
+                          kElementsPerAccess,
+                          Round>
+        source_converter;
+    NumericArrayConverter<ElementCompute,
+                          ElementAccumulator,
+                          kElementsPerAccess,
+                          Round>
+        accumulator_converter;
+
+    FragmentCompute converted_source = source_converter(source);
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_add_source;
+    multiply_add<FragmentCompute> mul_add_accumulator;
+    VariadicOp<ElementCompute> variadic_op;
+
+    if (Scale == ScaleType::NoBetaScaling) {
+      intermediate = converted_source;
+      // D = alpha * Accum + X
+      intermediate = mul_add_accumulator(
+          params_.alpha, converted_accumulator, intermediate);
+    } else if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      // X =  beta * C + uniform
+      intermediate = mul_add_source(params_.beta, converted_source);
+      // D = alpha * Accum + X
+      intermediate = mul_add_accumulator(
+          params_.alpha, converted_accumulator, intermediate);
+    }
+
+    if constexpr (GenericVariadicTraits<
+                      VariadicOp<ElementCompute>>::IsArgumentsNeeded) {
+      if (!skip_elementwise_) {
+#if CUTLASS_EPILOGUE_ENABLE_VECTORIZE
+        intermediate = variadic_op.Compute<kElementsPerAccess>(
+            intermediate,
+            params_.variadic_args,
+            BatchedMatrixCoord(batch, row_offset, column_offset, valid));
+#else
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < kElementsPerAccess; ++i) {
+          intermediate[i] = variadic_op(
+              intermediate[i],
+              params_.variadic_args,
+              BatchedMatrixCoord(batch, row_offset, column_offset + i, valid));
+        }
+#endif
+      }
+    } else {
+      if (!skip_elementwise_) {
+#if CUTLASS_EPILOGUE_ENABLE_VECTORIZE
+        intermediate = variadic_op.Compute<kElementsPerAccess>(intermediate);
+#else
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < kElementsPerAccess; ++i) {
+          intermediate[i] = variadic_op(intermediate[i]);
+        }
+#endif
+      }
+    }
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput,
+                          ElementCompute,
+                          kElementsPerAccess,
+                          Round>
+        destination_converter;
+
+    return destination_converter(intermediate);
+  }
+
+  /// Computes linear scaling: D = alpha * accumulator
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(FragmentAccumulator const &accumulator,
+                            int batch,
+                            int row_offset,
+                            int column_offset,
+                            bool valid) const {
+    CUTLASS_TRACE_DEVICE(
+        "kElementsPerAccess: %d, row_offset: %d, column_offset: %d",
+        kElementsPerAccess,
+        row_offset,
+        column_offset);
+
+    // Convert source to internal compute numeric type
+    NumericArrayConverter<ElementCompute,
+                          ElementAccumulator,
+                          kElementsPerAccess,
+                          Round>
+        accumulator_converter;
+
+    FragmentCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    FragmentCompute intermediate;
+
+    multiplies<FragmentCompute> mul_accumulator;
+    VariadicOp<ElementCompute> variadic_op;
+
+    if (Scale == ScaleType::Nothing) {
+      intermediate = converted_accumulator;
+    } else {
+      // D = alpha * Accum
+      intermediate = mul_accumulator(params_.alpha, converted_accumulator);
+    }
+
+    if constexpr (GenericVariadicTraits<
+                      VariadicOp<FragmentCompute>>::IsArgumentsNeeded) {
+      if (!skip_elementwise_) {
+#if CUTLASS_EPILOGUE_ENABLE_VECTORIZE
+        intermediate = variadic_op.Compute<kElementsPerAccess>(
+            intermediate,
+            params_.variadic_args,
+            BatchedMatrixCoord(batch, row_offset, column_offset, valid));
+#else
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < kElementsPerAccess; ++i) {
+          intermediate[i] = variadic_op(
+              intermediate[i],
+              params_.variadic_args,
+              BatchedMatrixCoord(batch, row_offset, column_offset + i, valid));
+        }
+#endif
+      }
+    } else {
+      if (!skip_elementwise_) {
+#if CUTLASS_EPILOGUE_ENABLE_VECTORIZE
+        intermediate = variadic_op.Compute<kElementsPerAccess>(intermediate);
+#else
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < kElementsPerAccess; ++i) {
+          intermediate[i] = variadic_op(intermediate[i]);
+        }
+#endif
+      }
+    }
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
+        destination_converter;
+
+    return destination_converter(intermediate);
+  }
+
+  //
+  // Specializations for scalar (for use with cute::collective::DefaultEpilogue)
+  //
+  CUTLASS_HOST_DEVICE
+  ElementD operator()(ElementAccumulator const accumulator, 
+                      ElementC const source,
+                      int batch,
+                      int row_offset,
+                      int column_offset,
+                      bool valid) const {
+                    
+    // Convert everything to Compute type, do compute, and then store to output type
+    NumericConverter<ElementCompute, ElementAccumulator, Round> accumulator_converter;
+    [[maybe_unused]] NumericConverter<ElementCompute, ElementC, Round> source_converter;
+    NumericConverter<ElementD, ElementCompute, Round> destination_converter;
+
+    // Convert to destination numeric type
+
+    ElementCompute converted_accumulator = accumulator_converter(accumulator);
+    if constexpr (Scale == ScaleType::Nothing) {
+      return destination_converter(converted_accumulator);
+    }
+
+    // Perform binary operations
+    ElementCompute intermediate;
+    multiplies<ElementCompute> multiply;
+    multiply_add<ElementCompute> madd;
+    VariadicOp<ElementCompute> variadic_op;
+
+    if constexpr (Scale == ScaleType::NoBetaScaling) {
+      intermediate = source_converter(source);
+    }
+    else {
+      intermediate = multiply(params_.beta, source);                            // X =  beta * C + uniform
+    }
+
+    intermediate = madd(params_.alpha, converted_accumulator, intermediate);    // D = alpha * Accum + X
+    intermediate = variadic_op(
+                  intermediate, 
+                  params_.variadic_args, 
+                  BatchedMatrixCoord(batch, row_offset, column_offset, valid));
+    return destination_converter(intermediate);
+  }
+
+  CUTLASS_HOST_DEVICE
+  ElementD operator()(ElementAccumulator const accumulator,
+                      int batch,
+                      int row_offset,
+                      int column_offset,
+                      bool valid) const {
+                      
+    // Convert everything to Compute type, do compute, and then store to output type
+    NumericConverter<ElementCompute, ElementAccumulator, Round> accumulator_converter;
+    NumericConverter<ElementD, ElementCompute, Round> destination_converter;
+    ElementCompute converted_accumulator = accumulator_converter(accumulator);
+
+    // Convert to destination numeric type
+    if constexpr (Scale == ScaleType::Nothing) {
+      return destination_converter(converted_accumulator);
+    }
+
+    // Perform binary operations
+    ElementCompute intermediate;
+    multiplies<ElementCompute> multiply;
+    VariadicOp<ElementCompute> variadic_op;
+
+    intermediate = multiply(params_.alpha, accumulator);    // D = alpha * Accum
+    intermediate = variadic_op(
+                      intermediate, 
+                      params_.variadic_args, 
+                      BatchedMatrixCoord(batch, row_offset, column_offset, valid));
+    return destination_converter(intermediate);
+  }
+  
+};
+
+}  // namespace thread
+}  // namespace epilogue
+}  // namespace cutlass
diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/trace_device.h b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/trace_device.h
new file mode 100644
index 0000000000..470e17b102
--- /dev/null
+++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/trace_device.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#if CUTLASS_DEBUG_TRACE_LEVEL
+
+#ifndef CUTLASS_TRACE_DEVICE
+#define CUTLASS_TRACE_DEVICE(format, ...)                        \
+  {                                                              \
+    if (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && \
+        threadIdx.x == 0 && threadIdx.y == 0) {                  \
+      printf("[DEVICE][%s:%d, %s]" format "\n",                  \
+             __FILE__,                                           \
+             __LINE__,                                           \
+             __FUNCTION__,                                       \
+             ##__VA_ARGS__);                                     \
+    }                                                            \
+  }
+#endif
+
+#ifndef CUTLASS_TRACE_DEVICE_TID_DETAIL
+#define CUTLASS_TRACE_DEVICE_TID_DETAIL(bidz, bidx, tidx, format, ...)    \
+  {                                                                       \
+    if (blockIdx.x == bidx && blockIdx.y == 0 && blockIdx.z == bidz &&    \
+        threadIdx.x == tidx && threadIdx.y == 0) {                        \
+      printf("[DEVICE][%s:%d, %s][bid={%d,%d,%d}, tid={%d,%d,%d}]" format \
+             "\n",                                                        \
+             __FILE__,                                                    \
+             __LINE__,                                                    \
+             __FUNCTION__,                                                \
+             blockIdx.x,                                                  \
+             blockIdx.y,                                                  \
+             blockIdx.z,                                                  \
+             threadIdx.x,                                                 \
+             threadIdx.y,                                                 \
+             threadIdx.z,                                                 \
+             ##__VA_ARGS__);                                              \
+    }                                                                     \
+  }
+#endif
+
+#ifndef CUTLASS_TRACE_DEVICE_TID
+#define CUTLASS_TRACE_DEVICE_TID(format, ...)                       \
+  {                                                                 \
+    CUTLASS_TRACE_DEVICE_TID_DETAIL(0, 0, 0, format, ##__VA_ARGS__) \
+    CUTLASS_TRACE_DEVICE_TID_DETAIL(0, 0, 1, format, ##__VA_ARGS__) \
+    CUTLASS_TRACE_DEVICE_TID_DETAIL(0, 1, 0, format, ##__VA_ARGS__) \
+  }
+#endif
+
+#else
+
+#ifndef CUTLASS_TRACE_DEVICE
+#define CUTLASS_TRACE_DEVICE(format, ...)
+#endif
+
+#ifndef CUTLASS_TRACE_DEVICE_TID
+#define CUTLASS_TRACE_DEVICE_TID(format, ...)
+#endif
+
+#endif
diff --git a/backends/iluvatar_gpu/apy/matmul/default_config_id.h b/backends/iluvatar_gpu/apy/matmul/default_config_id.h
new file mode 100644
index 0000000000..d160fad04e
--- /dev/null
+++ b/backends/iluvatar_gpu/apy/matmul/default_config_id.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+//#include "all_tuning_configs.h"  // NOLINT
+
+#pragma once
+
+namespace ap {
+
+struct DefaultConfig {
+  static constexpr int kConfigId = 0;
+  static constexpr int kSwizzleFactor = 1;
+  static constexpr bool kBatched = false;
+};
+
+}  // namespace ap
diff --git a/backends/iluvatar_gpu/apy/matmul/matmul.h b/backends/iluvatar_gpu/apy/matmul/matmul.h
new file mode 100644
index 0000000000..dcd8b5caae
--- /dev/null
+++ b/backends/iluvatar_gpu/apy/matmul/matmul.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <iostream>
+#include <map>
+#include <vector>
+
+#include <cuda.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#define CHECK_CUDA(func)                                                      \
+  {                                                                           \
+    cudaError_t err = func;                                                   \
+    if (err != cudaSuccess) {                                                 \
+      std::cerr << "[" << __FILE__ << ":" << __LINE__ << ", " << __FUNCTION__ \
+                << "] "                                                       \
+                << "CUDA error(" << err << "), " << cudaGetErrorString(err)   \
+                << " when call " << #func << std::endl;                       \
+      exit(EXIT_FAILURE);                                                     \
+    }                                                                         \
+  }
+
+#include "cutlass_matmul.cuh"  // NOLINT
+//#include "math_function.h"     // NOLINT
+//#include "profile.h"           // NOLINT
diff --git a/backends/iluvatar_gpu/apy/matmul/params.h b/backends/iluvatar_gpu/apy/matmul/params.h
new file mode 100644
index 0000000000..035ceb01a0
--- /dev/null
+++ b/backends/iluvatar_gpu/apy/matmul/params.h
@@ -0,0 +1,177 @@
+// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#define ASSERT_CHECK(__cond)                                            \
+  do {                                                                  \
+    const bool __cond_var = (__cond);                                   \
+    if (!__cond_var) {                                                  \
+      ::std::string __err_msg = ::std::string("`") + #__cond +          \
+                                "` check failed at " + __FILE__ + ":" + \
+                                ::std::to_string(__LINE__);             \
+      throw std::runtime_error(__err_msg);                              \
+    }                                                                   \
+  } while (0)
+
+namespace ap {
+
+template <typename T, int Dim>
+struct Alignment {
+  static constexpr int kValue =
+      ((Dim % 8) == 0) ? 8
+                       : (((Dim % 4) == 0) ? 4 : (((Dim % 2) == 0) ? 2 : 1));
+};
+
+template <int Dim>
+struct Alignment<float, Dim> {
+  static constexpr int kValue =
+      ((Dim % 4) == 0) ? 4 : (((Dim % 2) == 0) ? 2 : 1);
+};
+
+struct GemmEpilogueParams {
+  int batch_count;
+  int m;
+  int n;
+  int k;
+
+  bool transpose_a;
+  bool transpose_b;
+
+  // Shape related aruguments
+  struct ShapeArguments {
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C;
+    int64_t batch_stride_D;
+    int64_t lda;
+    int64_t ldb;
+    int64_t ldc_bias;
+    int64_t ldd;
+  };
+
+  ShapeArguments shape_args;
+
+  const void *input;
+  const void *weight;
+  const void *bias;
+  void *output;
+
+  cudaStream_t* stream_ptr;
+
+  std::vector<int64_t> input0_shape;
+  std::vector<int64_t> input1_shape;
+  std::vector<const void *> epilogue_in_ptrs;
+  std::vector<void *> epilogue_out_ptrs;
+  std::vector<std::vector<int64_t>> epilogue_in_shapes;
+  std::vector<std::vector<int64_t>> epilogue_out_shapes;
+
+  GemmEpilogueParams() {}
+  GemmEpilogueParams(void* stream_ptr,
+                     const void *input,
+                     const void *weight,
+                     const void *bias,
+                     void *output,
+                     const std::vector<int64_t> &input_shape,
+                     const std::vector<int64_t> &weight_shape,
+                     const std::vector<int64_t> &bias_shape,
+                     bool transpose_a = false,
+                     bool transpose_b = false)
+      : stream_ptr(reinterpret_cast<cudaStream_t*>(stream_ptr)),
+        input(input),
+        weight(weight),
+        bias(bias),
+        output(output),
+        transpose_a(transpose_a),
+        transpose_b(transpose_b) {
+    ASSERT_CHECK(input_shape.size() >= 2U);
+    ASSERT_CHECK(weight_shape.size() >= 2U);
+
+    input0_shape = input_shape;
+    input1_shape = weight_shape;
+
+    batch_count = 1;
+    for (size_t i = 0; i < input_shape.size() - 2; ++i) {
+      batch_count *= input_shape[i];
+    }
+
+    if (transpose_a) {
+      m = input_shape[input_shape.size() - 1];
+      k = input_shape[input_shape.size() - 2];
+    } else {
+      m = input_shape[input_shape.size() - 2];
+      k = input_shape[input_shape.size() - 1];
+    }
+    if (transpose_b) {
+      ASSERT_CHECK(weight_shape[weight_shape.size() - 1] == k);
+      n = weight_shape[weight_shape.size() - 2];
+    } else {
+      ASSERT_CHECK(weight_shape[weight_shape.size() - 2] == k);
+      n = weight_shape[weight_shape.size() - 1];
+    }
+
+    if (bias) {
+      ASSERT_CHECK(bias_shape.size() >= 1U);
+      ASSERT_CHECK(bias_shape[bias_shape.size() - 1] == n);
+    }
+
+#if AP_ENABLE_DEBUG
+    std::cout << "-- [GemmEpilogueParams] batch_count: " << batch_count
+              << ", m: " << m << ", n: " << n << ", k: " << k << std::endl;
+    std::cout << "-- [GemmEpilogueParams] input: " << input << std::endl;
+    std::cout << "-- [GemmEpilogueParams] weight: " << weight << std::endl;
+    std::cout << "-- [GemmEpilogueParams] bias: " << bias << std::endl;
+    std::cout << "-- [GemmEpilogueParams] output: " << output << std::endl;
+    std::cout << "-- [GemmEpilogueParams] stream_str: " << stream_str << std::endl;
+#endif
+
+    shape_args.batch_stride_A = m * k;
+    shape_args.batch_stride_B = (weight_shape.size() == 2) ? 0 : n * k;
+    shape_args.batch_stride_D = m * n;
+
+    shape_args.lda = transpose_a ? m : k;
+    shape_args.ldb = transpose_b ? k : n;
+    shape_args.ldd = n;
+
+    bool is_C_bias = bias_shape.size() == 1UL;
+
+    /// Only available in RRR format
+    shape_args.batch_stride_C = (!bias || is_C_bias) ? 0 : m * n;
+    shape_args.ldc_bias = (!bias || is_C_bias) ? 0 : n;
+  }
+
+  void SetEpilogues(const std::vector<const void *> &in_ptrs,
+                    const std::vector<void *> &out_ptrs) {
+    epilogue_in_ptrs = in_ptrs;
+    epilogue_out_ptrs = out_ptrs;
+  }
+
+  void SetEpilogueAndShapes(
+      const std::vector<const void *> &in_ptrs,
+      const std::vector<std::vector<int64_t>> &in_shapes,
+      const std::vector<void *> &out_ptrs,
+      const std::vector<std::vector<int64_t>> &out_shapes) {
+    ASSERT_CHECK(in_ptrs.size() == in_shapes.size());
+    epilogue_in_ptrs = in_ptrs;
+    epilogue_in_shapes = in_shapes;
+    ASSERT_CHECK(out_ptrs.size() == out_shapes.size());
+    epilogue_out_ptrs = out_ptrs;
+    epilogue_out_shapes = out_shapes;
+  }
+};
+
+}  // namespace ap

From cdd2a1854f7aa2e0899c1bbfcee1f5df2f57fd81 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Tue, 14 Apr 2026 10:28:33 +0800
Subject: [PATCH 3/6] Update cmake.

---
 backends/iluvatar_gpu/CMakeLists.txt | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt
index 8e0379fdf9..27bf8d03d8 100644
--- a/backends/iluvatar_gpu/CMakeLists.txt
+++ b/backends/iluvatar_gpu/CMakeLists.txt
@@ -41,8 +41,6 @@ if(WITH_FLAGCX)
   add_definitions("-DPADDLE_WITH_FLAGCX")
   include(external/flagcx)
 endif()
-  
-add_definitions("-DPADDLE_WITH_CINN")
 
 set(PLUGIN_VERSION ${PADDLE_VERSION})
 set(PROTO_FILE "${PADDLE_SOURCE_DIR}/paddle/phi/core/external_error.proto")
@@ -817,19 +815,19 @@ list(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/sparse_attention_kernel.cu)
 
 file(
-  GLOB_RECURSE CC_SRCS
-  RELATIVE ${CMAKE_SOURCE_DIR}
-  # 增加ap依赖项
-  #${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/ap_variadic_kernel.cc
-  #${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/kernel_dispatch_helper.cc
-  #${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/ap_infer_meta_helper.cc
-  #${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/kernel_define_helper.cc
+  GLOB
+  AP_CC_SRCS
   ${PADDLE_SOURCE_DIR}/paddle/ap/src/axpr/*.cc
   ${PADDLE_SOURCE_DIR}/paddle/ap/src/fs/*.cc
   ${PADDLE_SOURCE_DIR}/paddle/ap/src/code_module/*.cc
   ${PADDLE_SOURCE_DIR}/paddle/ap/src/code_gen/*.cc
   ${PADDLE_SOURCE_DIR}/paddle/ap/src/kernel_dispatch/*.cc
-  ${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/*.cc
+  ${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/*.cc)
+
+file(
+  GLOB_RECURSE CC_SRCS
+  RELATIVE ${CMAKE_SOURCE_DIR}
+  ${AP_CC_SRCS}
   runtime/runtime.cc
   runtime/iluvatar_context.cc
   common/*.cc

From fb78dbb580d74aef416ca1ab945d08e8e7f3a73b Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Tue, 14 Apr 2026 10:43:32 +0800
Subject: [PATCH 4/6] Format codes.

---
 .../apy/device/compile_command_util.py        |   6 +-
 .../cutlass_patch/batched_matrix_coord.h      |   6 +-
 ...1_epilogue_vectorized_perwarp_variadic.hpp | 400 ++++++++++--------
 .../thread/linear_combination_variadic.h      |  44 +-
 backends/iluvatar_gpu/apy/matmul/matmul.h     |   8 +-
 backends/iluvatar_gpu/apy/matmul/params.h     |   9 +-
 backends/iluvatar_gpu/test_ap/set_env_ap.sh   |  14 +
 .../test_ap/test_matmul_epilogue.py           |  42 +-
 8 files changed, 299 insertions(+), 230 deletions(-)

diff --git a/backends/iluvatar_gpu/apy/device/compile_command_util.py b/backends/iluvatar_gpu/apy/device/compile_command_util.py
index 18741ca805..e0037f84ea 100644
--- a/backends/iluvatar_gpu/apy/device/compile_command_util.py
+++ b/backends/iluvatar_gpu/apy/device/compile_command_util.py
@@ -20,7 +20,7 @@ def __init__(self):
         self.file_ext = "cu"
         self.op_type2generate_func = ap.OrderedDict(
             [
-                ['matmul', self.generate_matmul_compile_command],
+                ["matmul", self.generate_matmul_compile_command],
             ]
         )
 
@@ -40,9 +40,7 @@ def generate_matmul_compile_command(self, tpl_dirname, library_name):
             compile_cmd
             + " -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -DCUTLASS_DEBUG_TRACE_LEVEL=0 -DCUTLASS_ILUVATAR"
         )
-        compile_cmd = (
-            compile_cmd + " -DAP_ENABLE_AUTOTUNE=0 -DAP_ENABLE_DEBUG=0"
-        )
+        compile_cmd = compile_cmd + " -DAP_ENABLE_AUTOTUNE=0 -DAP_ENABLE_DEBUG=0"
         compile_cmd = (
             compile_cmd
             + f" --shared {library_name}.{self.file_ext} -o lib{library_name}.so"
diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h
index 544670b8ad..11c08461eb 100644
--- a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h
+++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h
@@ -28,10 +28,12 @@ struct BatchedMatrixCoord {
   BatchedMatrixCoord() : batch(0), row(0), column(0), is_valid(false) {}
 
   CUTLASS_HOST_DEVICE
-  BatchedMatrixCoord(int b, int r, int c) : batch(b), row(r), column(c), is_valid(true) {}
+  BatchedMatrixCoord(int b, int r, int c)
+      : batch(b), row(r), column(c), is_valid(true) {}
 
   CUTLASS_HOST_DEVICE
-  BatchedMatrixCoord(int b, int r, int c, bool valid) : batch(b), row(r), column(c), is_valid(valid) {}
+  BatchedMatrixCoord(int b, int r, int c, bool valid)
+      : batch(b), row(r), column(c), is_valid(valid) {}
 };
 
 };  // namespace cutlass
diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp
index a19167bf3a..e714983847 100644
--- a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp
+++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp
@@ -1,12 +1,12 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
@@ -18,14 +18,15 @@
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 /*! \file
@@ -34,9 +35,8 @@
 
 #pragma once
 
-#include "cutlass/cutlass.h"
-
 #include "cute/tensor.hpp"
+#include "cutlass/cutlass.h"
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -46,28 +46,28 @@ namespace collective {
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 template <
-  // class StrideC,
-  class StrideD,
-  class ThreadEpilogueOp,
-  class EpilogueTile,
-  class SmemLayout,
-  class CopyAtomG2R_,
-  class CopyAtomR2S,
-  class TiledCopyS2R,
-  class CopyAtomR2R,
-  class CopyAtomR2G,
-  class EpilogueScheduleType = EpilogueSimtVectorized,
-  class Enable = void
->
+    // class StrideC,
+    class StrideD,
+    class ThreadEpilogueOp,
+    class EpilogueTile,
+    class SmemLayout,
+    class CopyAtomG2R_,
+    class CopyAtomR2S,
+    class TiledCopyS2R,
+    class CopyAtomR2R,
+    class CopyAtomR2G,
+    class EpilogueScheduleType = EpilogueSimtVectorized,
+    class Enable = void>
 class EpilogueVariadic {
-  static_assert(cute::is_same_v<EpilogueScheduleType, EpilogueSimtVectorized> ||
-                cute::is_same_v<EpilogueScheduleType, EpiloguePtrArraySimtVectorized>,
-                "Could not find an epilogue specialization.");
+  static_assert(
+      cute::is_same_v<EpilogueScheduleType, EpilogueSimtVectorized> ||
+          cute::is_same_v<EpilogueScheduleType, EpiloguePtrArraySimtVectorized>,
+      "Could not find an epilogue specialization.");
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 /// Iluvatar ix11/ix30 Epilogue Vectorized
-/// 1. Load C(optional), compute D = alpha * Acc + beta * C, 
+/// 1. Load C(optional), compute D = alpha * Acc + beta * C,
 ///   then transform to ElementD(8/16-bit typically)
 /// 2. Copy D from Register to Smem
 /// 3. Load D from Smem to Register
@@ -76,35 +76,32 @@ class EpilogueVariadic {
 ///
 /// Support alpha and beta params, bias not supported yet.
 template <
-  // class StrideC_,
-  class StrideD_,
-  class ThreadEpilogueOp_,
-  class EpilogueTile_,
-  class SmemLayout_,
-  class CopyAtomG2R_,
-  class CopyAtomR2S_,
-  class TiledCopyS2R_,
-  class CopyAtomR2R_,
-  class CopyAtomR2G_,
-  class EpilogueScheduleType_
->
+    // class StrideC_,
+    class StrideD_,
+    class ThreadEpilogueOp_,
+    class EpilogueTile_,
+    class SmemLayout_,
+    class CopyAtomG2R_,
+    class CopyAtomR2S_,
+    class TiledCopyS2R_,
+    class CopyAtomR2R_,
+    class CopyAtomR2G_,
+    class EpilogueScheduleType_>
 class EpilogueVariadic<
-        // StrideC_,
-        StrideD_,
-        ThreadEpilogueOp_,
-        EpilogueTile_,
-        SmemLayout_,
-        CopyAtomG2R_,
-        CopyAtomR2S_,
-        TiledCopyS2R_,
-        CopyAtomR2R_,
-        CopyAtomR2G_,
-        EpilogueScheduleType_,
-        cute::enable_if_t<
-          cute::is_same_v<EpilogueScheduleType_, EpilogueSimtVectorized>
-        >
-      > {
-public:
+    // StrideC_,
+    StrideD_,
+    ThreadEpilogueOp_,
+    EpilogueTile_,
+    SmemLayout_,
+    CopyAtomG2R_,
+    CopyAtomR2S_,
+    TiledCopyS2R_,
+    CopyAtomR2R_,
+    CopyAtomR2G_,
+    EpilogueScheduleType_,
+    cute::enable_if_t<
+        cute::is_same_v<EpilogueScheduleType_, EpilogueSimtVectorized>>> {
+ public:
   //
   // Type Aliases
   //
@@ -116,28 +113,33 @@ class EpilogueVariadic<
   using ElementOutput = typename ThreadEpilogueOp::ElementOutput;
   using ElementD = typename ThreadEpilogueOp::ElementOutput;
   using StrideD = StrideD_;
-  using ElementC = ElementD; // for GemmUniversal
+  using ElementC = ElementD;  // for GemmUniversal
   using StrideC = StrideD;
-  using ElementBias = typename detail::IsThreadEpilogueOpWithBias<ThreadEpilogueOp>::type;
+  using ElementBias =
+      typename detail::IsThreadEpilogueOpWithBias<ThreadEpilogueOp>::type;
   using EpilogueTile = EpilogueTile_;
-  using SmemLayout   = SmemLayout_;
+  using SmemLayout = SmemLayout_;
   using CopyAtomG2R = CopyAtomG2R_;
-  using CopyAtomR2S  = CopyAtomR2S_;
+  using CopyAtomR2S = CopyAtomR2S_;
   using TiledCopyS2R = TiledCopyS2R_;
-  using CopyAtomR2R  = CopyAtomR2R_;
-  using CopyAtomR2G  = CopyAtomR2G_;
+  using CopyAtomR2R = CopyAtomR2R_;
+  using CopyAtomR2G = CopyAtomR2G_;
 
   // using GmemTiledCopyC = void;
   using GmemTiledCopyD = CopyAtomR2G;
 
-  static constexpr bool IsEpilogueBiasSupported = detail::IsThreadEpilogueOpWithBias<ThreadEpilogueOp>::value;
-  using StrideBias = cute::conditional_t<detail::is_m_major<StrideD>(), Stride<_1,_0,int64_t>, Stride<_0,_1,int64_t>>;
+  static constexpr bool IsEpilogueBiasSupported =
+      detail::IsThreadEpilogueOpWithBias<ThreadEpilogueOp>::value;
+  using StrideBias = cute::conditional_t<detail::is_m_major<StrideD>(),
+                                         Stride<_1, _0, int64_t>,
+                                         Stride<_0, _1, int64_t>>;
 
-  // static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
-  static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
+  // static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N,
+  // L]");
+  static_assert(cute::rank(StrideD{}) == 3,
+                "StrideCD must be rank-3: [M, N, L]");
 
-  struct SharedStorage
-  {
+  struct SharedStorage {
     cute::array_aligned<ElementD, cute::cosize_v<SmemLayout>> smem_epilogue;
   };
 
@@ -151,7 +153,7 @@ class EpilogueVariadic<
   };
 
   // Device side epilogue params
-  template<class ThreadEpiOp>
+  template <class ThreadEpiOp>
   struct ParamsType {
     typename ThreadEpiOp::Params epilogue_op{};
     ElementD* ptr_D = nullptr;
@@ -165,12 +167,10 @@ class EpilogueVariadic<
   //
 
   template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(
+  static constexpr Params to_underlying_arguments(
       [[maybe_unused]] ProblemShape const& _,
       Arguments const& args,
       [[maybe_unused]] void* workspace) {
-
     return {
         args.epilogue_op,
         args.ptr_D,
@@ -179,65 +179,67 @@ class EpilogueVariadic<
   }
 
   template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+  static size_t get_workspace_size(ProblemShape const& problem_shape,
+                                   Arguments const& args) {
     return 0;
   }
 
   template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
+  static cutlass::Status initialize_workspace(
+      ProblemShape const& problem_shape,
+      Arguments const& args,
+      void* workspace,
+      cudaStream_t stream,
+      CudaHostAdapter* cuda_adapter = nullptr) {
     return cutlass::Status::kSuccess;
   }
 
   template <class ProblemShape>
-  static bool
-  can_implement(
-      [[maybe_unused]] ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
+  static bool can_implement([[maybe_unused]] ProblemShape const& problem_shape,
+                            [[maybe_unused]] Arguments const& args) {
     return true;
   }
 
   CUTLASS_HOST_DEVICE
   EpilogueVariadic(Params const& params_)
-      : params(params_), epilogue_op(params_.epilogue_op) { }
+      : params(params_), epilogue_op(params_.epilogue_op) {}
 
   CUTLASS_DEVICE
-  bool
-  is_source_needed() {
-    return epilogue_op.is_source_needed();
-  }
-
-  template<
-    class ProblemShapeMNKL,
-    class BlockShapeMNK,
-    class BlockCoordMNKL,
-    class FrgEngine, class FrgLayout,
-    class TiledMma,
-    class ResidueMNK
-  >
-  CUTLASS_DEVICE void
-  operator()(
-      ProblemShapeMNKL problem_shape_mnkl,
-      BlockShapeMNK blk_shape_MNK,
-      BlockCoordMNKL blk_coord_mnkl,
-      cute::Tensor<FrgEngine,FrgLayout> const& accumulators,                   // (MMA,MMA_M,MMA_N)
-      TiledMma tiled_mma,
-      ResidueMNK residue_mnk,
-      int thread_idx,
-      char* smem_buf) {
+  bool is_source_needed() { return epilogue_op.is_source_needed(); }
+
+  template <class ProblemShapeMNKL,
+            class BlockShapeMNK,
+            class BlockCoordMNKL,
+            class FrgEngine,
+            class FrgLayout,
+            class TiledMma,
+            class ResidueMNK>
+  CUTLASS_DEVICE void operator()(ProblemShapeMNKL problem_shape_mnkl,
+                                 BlockShapeMNK blk_shape_MNK,
+                                 BlockCoordMNKL blk_coord_mnkl,
+                                 cute::Tensor<FrgEngine, FrgLayout> const&
+                                     accumulators,  // (MMA,MMA_M,MMA_N)
+                                 TiledMma tiled_mma,
+                                 ResidueMNK residue_mnk,
+                                 int thread_idx,
+                                 char* smem_buf) {
     using namespace cute;
     using X = Underscore;
 
-    static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
-    static_assert(is_static<BlockShapeMNK>::value, "ThreadBlock tile shape must be static");
-    static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3");
-    static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3");
-
-    static_assert(rank(EpilogueTile{}) == 2, "Rank of EpilogueTile should be 2");
+    static_assert(cute::rank(ProblemShapeMNKL{}) == 4,
+                  "ProblemShapeMNKL must be rank 4");
+    static_assert(is_static<BlockShapeMNK>::value,
+                  "ThreadBlock tile shape must be static");
+    static_assert(cute::rank(BlockShapeMNK{}) == 3,
+                  "BlockShapeMNK must be rank 3");
+    static_assert(cute::rank(BlockCoordMNKL{}) == 4,
+                  "BlockCoordMNKL must be rank 3");
+
+    static_assert(rank(EpilogueTile{}) == 2,
+                  "Rank of EpilogueTile should be 2");
     static_assert(rank(SmemLayout{}) == 2, "Rank of SmemLayout should be 2");
-    CUTE_STATIC_ASSERT(size(SmemLayout{}) == size<0>(EpilogueTile{}) * size<1>(EpilogueTile{}));
+    CUTE_STATIC_ASSERT(size(SmemLayout{}) ==
+                       size<0>(EpilogueTile{}) * size<1>(EpilogueTile{}));
 
     // Separate out problem shape for convenience
     auto M = get<0>(problem_shape_mnkl);
@@ -245,19 +247,28 @@ class EpilogueVariadic<
     auto L = get<3>(problem_shape_mnkl);
 
     // Represent the full output tensor
-    // Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), params.dC);             //             (m,n,l)
-    Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), params.dD);             //             (m,n,l)
-    // Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_Bias), make_shape(M,N,L), params.dBias);    //             (m,n,l)
-
-    // Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});             // (BLK_M,BLK_N,m,n,l)
-    Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});             // (BLK_M,BLK_N,m,n,l)
-    // Tensor gBias_mnl = local_tile(mBias_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});       // (BLK_M,BLK_N,m,n,l)
+    // Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C),
+    // make_shape(M,N,L), params.dC);             //             (m,n,l)
+    Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D),
+                                make_shape(M, N, L),
+                                params.dD);  //             (m,n,l)
+    // Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_Bias),
+    // make_shape(M,N,L), params.dBias);    //             (m,n,l)
+
+    // Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_),
+    // Step<_1,_1, X>{});             // (BLK_M,BLK_N,m,n,l)
+    Tensor gD_mnl = local_tile(mD_mnl,
+                               blk_shape_MNK,
+                               make_coord(_, _, _),
+                               Step<_1, _1, X>{});  // (BLK_M,BLK_N,m,n,l)
+    // Tensor gBias_mnl = local_tile(mBias_mnl, blk_shape_MNK,
+    // make_coord(_,_,_), Step<_1,_1, X>{});       // (BLK_M,BLK_N,m,n,l)
 
     // Slice to get the tile this CTA is responsible for
     auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl;
-    // Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord);                                                   // (BLK_M,BLK_N)
-    Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord);                                                   // (BLK_M,BLK_N)
-    // Tensor gBias = gBias_mnl(_,_,m_coord,n_coord,l_coord);                                             // (BLK_M,BLK_N)
+    // Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N)
+    Tensor gD = gD_mnl(_, _, m_coord, n_coord, l_coord);  // (BLK_M,BLK_N)
+    // Tensor gBias = gBias_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N)
 
     // Construct a tensor in SMEM that we can partition for rearranging data
     SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
@@ -268,52 +279,68 @@ class EpilogueVariadic<
     auto layout_gd_tv = tiled_mma.thrfrg_C(make_layout(gD.shape()));
 
     // G2R / R2S -------------------------------------
-    // In G2R / R2S tiling, we assemble the final V->MN layout out from given epilogue tile and mma layout
+    // In G2R / R2S tiling, we assemble the final V->MN layout out from given
+    // epilogue tile and mma layout
 
     // epilogue tile of logical C
-    auto tE_D  = flat_divide(make_layout(product_each(gD.shape())), EpilogueTile{});
-    
+    auto tE_D =
+        flat_divide(make_layout(product_each(gD.shape())), EpilogueTile{});
+
     using EpilogueTileSize = decltype(product_each(shape(EpilogueTile{})));
 
-    Tensor sAcc = make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{});
+    Tensor sAcc =
+        make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{});
 
-    auto r2s_mn_tiler_tv = take<0,2>(right_inverse(layout_gd_tv).compose(tE_D)); // epilogue logical mn->tv
+    auto r2s_mn_tiler_tv = take<0, 2>(
+        right_inverse(layout_gd_tv).compose(tE_D));  // epilogue logical mn->tv
     // get tiler of TV from tiler of MN
     auto r2s_tv_tiler_shape = product_each(layout_gd_tv.shape());
-    auto r2s_tv_zeros = repeat_like(r2s_tv_tiler_shape, Int<0>{}); // T and V
-    auto r2s_tv_tiler = cute::transform(make_seq<rank(r2s_tv_tiler_shape)>{}, [&](auto i) {
-      auto tiler_origin = filter(composition(make_layout(r2s_tv_tiler_shape, replace<i>(r2s_tv_zeros, Int<1>{})), r2s_mn_tiler_tv));
-      auto complemented = complement(tiler_origin, get<i>(r2s_tv_tiler_shape));
-      return cute::layout<1>(
-        // divide complement to make it monotonic
-        zipped_divide(make_layout(shape<i>(r2s_tv_tiler_shape)), complemented));
-    });
-
-    auto r2s_TV_D_tiler = left_inverse(take<0,2>(tE_D)).compose( // tv -> mn in epilogue tiler
-      layout_gd_tv.compose( // tv -> mn in C
-        make_layout(r2s_tv_tiler_shape).compose(r2s_tv_tiler))); // tv_tiler -> tv coord
+    auto r2s_tv_zeros = repeat_like(r2s_tv_tiler_shape, Int<0>{});  // T and V
+    auto r2s_tv_tiler =
+        cute::transform(make_seq<rank(r2s_tv_tiler_shape)>{}, [&](auto i) {
+          auto tiler_origin = filter(
+              composition(make_layout(r2s_tv_tiler_shape,
+                                      replace<i>(r2s_tv_zeros, Int<1>{})),
+                          r2s_mn_tiler_tv));
+          auto complemented =
+              complement(tiler_origin, get<i>(r2s_tv_tiler_shape));
+          return cute::layout<1>(
+              // divide complement to make it monotonic
+              zipped_divide(make_layout(shape<i>(r2s_tv_tiler_shape)),
+                            complemented));
+        });
+
+    auto r2s_TV_D_tiler =
+        left_inverse(take<0, 2>(tE_D))
+            .compose(                  // tv -> mn in epilogue tiler
+                layout_gd_tv.compose(  // tv -> mn in C
+                    make_layout(r2s_tv_tiler_shape)
+                        .compose(r2s_tv_tiler)));  // tv_tiler -> tv coord
 
     // G2R ----------------------------------------
-    using TiledCopyG2R = TiledCopy<CopyAtomG2R, decltype(r2s_TV_D_tiler), EpilogueTileSize>;
+    using TiledCopyG2R =
+        TiledCopy<CopyAtomG2R, decltype(r2s_TV_D_tiler), EpilogueTileSize>;
     auto thread_g2r = TiledCopyG2R::get_slice(thread_idx);
     // auto tEgD = flat_divide(gC, EpilogueTile{});
     // auto tGR_gC = thread_g2r.partition_S(tEgD);
-    auto cC = make_identity_tensor(make_shape(size<0>(gD), size<1>(gD))); // same shape as gC/gD
-    auto cCt = flat_divide(cC, EpilogueTile{});                           
+    auto cC = make_identity_tensor(
+        make_shape(size<0>(gD), size<1>(gD)));  // same shape as gC/gD
+    auto cCt = flat_divide(cC, EpilogueTile{});
     auto tRS_cC = thread_g2r.partition_S(cCt);
-    
+
     // R2S ----------------------------------------
-    using TiledCopyR2S = TiledCopy<CopyAtomR2S, decltype(r2s_TV_D_tiler), EpilogueTileSize>;
+    using TiledCopyR2S =
+        TiledCopy<CopyAtomR2S, decltype(r2s_TV_D_tiler), EpilogueTileSize>;
     auto tiled_r2s = TiledCopyR2S{};
 
-    auto tile_RS_R = 
-      cute::layout<1>( // just V
-        make_layout(
-          make_shape(typename TiledCopyR2S::TiledNumThr{},typename TiledCopyR2S::TiledNumVal{}),make_stride(_0{},_1{}))
-            .compose(group<1,3>(                            // V_copy -> V_fragment
-              right_inverse(layout_gd_tv)                   // TV_copy -> TV_fragment
-                .compose(tiled_r2s.tidfrg_S(tE_D)))))
-                  (_,repeat<rank_v<decltype(tE_D)>>(_));    // TV_copy -> MN
+    auto tile_RS_R = cute::layout<1>(  // just V
+        make_layout(make_shape(typename TiledCopyR2S::TiledNumThr{},
+                               typename TiledCopyR2S::TiledNumVal{}),
+                    make_stride(_0{}, _1{}))
+            .compose(group<1, 3>(            // V_copy -> V_fragment
+                right_inverse(layout_gd_tv)  // TV_copy -> TV_fragment
+                    .compose(tiled_r2s.tidfrg_S(tE_D)))))(
+        _, repeat<rank_v<decltype(tE_D)>>(_));  // TV_copy -> MN
 
     auto thread_r2s = tiled_r2s.get_slice(thread_idx);
     auto tErAcc = make_tensor(accumulators.data(), tile_RS_R);
@@ -326,17 +353,23 @@ class EpilogueVariadic<
     auto tSR_rAcc = make_tensor<ElementD>(make_layout(tSR_sAcc.shape()));
 
     // R2R --------------------------------------------
-    auto tiled_r2r = 
-      make_tiled_copy(CopyAtomR2R{},typename CopyAtomR2R::ThrID{},make_layout(typename TiledCopyR2S::TiledNumVal{}));
+    auto tiled_r2r =
+        make_tiled_copy(CopyAtomR2R{},
+                        typename CopyAtomR2R::ThrID{},
+                        make_layout(typename TiledCopyR2S::TiledNumVal{}));
     auto thread_r2r = tiled_r2r.get_slice(thread_idx);
     auto tRR_rAcc = make_tensor<ElementD>(tSR_rAcc.shape());
     auto tRR_rSrc = thread_r2r.retile_S(tSR_rAcc);
     auto tRR_rDst = thread_r2r.retile_D(tRR_rAcc);
 
     // R2G -------------------------------------------
-    auto r2r_vlayout = filter(cute::layout<1>(tiled_r2r.get_layoutS_TV()).compose(cute::layout<1>(tiled_r2r.get_layoutD_TV())));
-    auto r2g_tv_layout = (typename TiledCopyS2R::TiledLayout_TV{}).compose(_,r2r_vlayout);
-    auto tiled_r2g = TiledCopy<CopyAtomR2G, decltype(r2g_tv_layout), EpilogueTileSize>{};
+    auto r2r_vlayout =
+        filter(cute::layout<1>(tiled_r2r.get_layoutS_TV())
+                   .compose(cute::layout<1>(tiled_r2r.get_layoutD_TV())));
+    auto r2g_tv_layout =
+        (typename TiledCopyS2R::TiledLayout_TV{}).compose(_, r2r_vlayout);
+    auto tiled_r2g =
+        TiledCopy<CopyAtomR2G, decltype(r2g_tv_layout), EpilogueTileSize>{};
     auto thread_r2g = tiled_r2g.get_slice(thread_idx);
 
     auto tRG_rAcc = thread_r2g.retile_S(tRR_rDst);
@@ -344,8 +377,10 @@ class EpilogueVariadic<
     auto tRG_gD = thread_r2g.partition_D(tEgD);
 
     // Repeat the D-partitioning for coordinates and predication
-    Tensor cD   = make_identity_tensor(make_shape(size<0>(gD),size<1>(gD))); // (BLK_M,BLK_N) -> (blk_m,blk_n)
-    Tensor cDt  = flat_divide(cD, EpilogueTile{});                           // (SMEM_M,SMEM_N,TILE_M,TILE_N)
+    Tensor cD = make_identity_tensor(make_shape(
+        size<0>(gD), size<1>(gD)));  // (BLK_M,BLK_N) -> (blk_m,blk_n)
+    Tensor cDt =
+        flat_divide(cD, EpilogueTile{});  // (SMEM_M,SMEM_N,TILE_M,TILE_N)
     Tensor tRG_cD = thread_r2g.partition_D(cDt);
 
 #if 0
@@ -380,38 +415,42 @@ class EpilogueVariadic<
     for (int epi_tile_m = 0; epi_tile_m < size<2>(tEgD).value; ++epi_tile_m) {
       CUTLASS_PRAGMA_UNROLL
       for (int epi_tile_n = 0; epi_tile_n < size<3>(tEgD).value; ++epi_tile_n) {
-        Tensor tRS_rAccmn = tErAcc(_,_,_,epi_tile_m,epi_tile_n); // ((2, (2, 2)), 1, 1)
-        Tensor cC_mn = tRS_cC(_,_,_,epi_tile_m, epi_tile_n);
+        Tensor tRS_rAccmn =
+            tErAcc(_, _, _, epi_tile_m, epi_tile_n);  // ((2, (2, 2)), 1, 1)
+        Tensor cC_mn = tRS_cC(_, _, _, epi_tile_m, epi_tile_n);
         Tensor tRS_rD = make_tensor_like<ElementD>(tRS_rAccmn);
-                        
+
         int m_cta_coord_base = m_coord * size<0>(gD);
         int n_cta_coord_base = n_coord * size<1>(gD);
         CUTLASS_PRAGMA_UNROLL
         for (int i = 0; i < size(tRS_rAccmn); ++i) {
           auto cta_coord = cC_mn(i);
-          tRS_rD(i) = epilogue_op(tRS_rAccmn(i),
-                                  l_coord, 
-                                  m_cta_coord_base + get<0>(cta_coord), 
-                                  n_cta_coord_base + get<1>(cta_coord),
-                                  elem_less(cta_coord, take<0,2>(residue_mnk)));
+          tRS_rD(i) =
+              epilogue_op(tRS_rAccmn(i),
+                          l_coord,
+                          m_cta_coord_base + get<0>(cta_coord),
+                          n_cta_coord_base + get<1>(cta_coord),
+                          elem_less(cta_coord, take<0, 2>(residue_mnk)));
         }
 
         copy(CopyAtomR2S{}, tRS_rD, tRS_sAcc);
         copy(TiledCopyS2R{}, tSR_sAcc, tSR_rAcc);
         copy(CopyAtomR2R{}, tRR_rSrc, tRR_rDst);
 
-        Tensor tRG_gDmn = tRG_gD(_,_,_,epi_tile_m,epi_tile_n);
-        Tensor tRG_cDmn = tRG_cD(_,_,_,epi_tile_m,epi_tile_n);
+        Tensor tRG_gDmn = tRG_gD(_, _, _, epi_tile_m, epi_tile_n);
+        Tensor tRG_cDmn = tRG_cD(_, _, _, epi_tile_m, epi_tile_n);
         CUTLASS_PRAGMA_UNROLL
-        for (int atom_i = 0; atom_i < size<0,1>(tRR_rAcc); ++atom_i) {
+        for (int atom_i = 0; atom_i < size<0, 1>(tRR_rAcc); ++atom_i) {
           CUTLASS_PRAGMA_UNROLL
           for (int m = 0; m < size<1>(tRR_rAcc); ++m) {
             CUTLASS_PRAGMA_UNROLL
 
             for (int n = 0; n < size<2>(tRR_rAcc); ++n) {
-              
-              if (elem_less(tRG_cDmn(make_coord(0,atom_i),m,n), take<0,2>(residue_mnk))) {
-                copy(CopyAtomR2G{}, tRG_rAcc(make_coord(_,atom_i),m,n), tRG_gDmn(make_coord(_,atom_i),m,n));
+              if (elem_less(tRG_cDmn(make_coord(0, atom_i), m, n),
+                            take<0, 2>(residue_mnk))) {
+                copy(CopyAtomR2G{},
+                     tRG_rAcc(make_coord(_, atom_i), m, n),
+                     tRG_gDmn(make_coord(_, atom_i), m, n));
               }
             }
           }
@@ -420,16 +459,15 @@ class EpilogueVariadic<
     }
   }
 
-private:
+ private:
   Params params;
   ThreadEpilogueOp epilogue_op;
 };
 
-
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-} // namespace collective
-} // namespace epilogue
-} // namespace cutlass
+}  // namespace collective
+}  // namespace epilogue
+}  // namespace cutlass
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h
index 1acd9cd667..ffda0b076b 100644
--- a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h
+++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h
@@ -24,7 +24,6 @@
 #include "cutlass/functional.h"
 #include "cutlass/numeric_conversion.h"
 #include "cutlass/numeric_types.h"
-
 #include "cutlass_patch/batched_matrix_coord.h"
 #include "cutlass_patch/trace_device.h"
 
@@ -330,16 +329,18 @@ class LinearCombinationVariadic {
   // Specializations for scalar (for use with cute::collective::DefaultEpilogue)
   //
   CUTLASS_HOST_DEVICE
-  ElementD operator()(ElementAccumulator const accumulator, 
+  ElementD operator()(ElementAccumulator const accumulator,
                       ElementC const source,
                       int batch,
                       int row_offset,
                       int column_offset,
                       bool valid) const {
-                    
-    // Convert everything to Compute type, do compute, and then store to output type
-    NumericConverter<ElementCompute, ElementAccumulator, Round> accumulator_converter;
-    [[maybe_unused]] NumericConverter<ElementCompute, ElementC, Round> source_converter;
+    // Convert everything to Compute type, do compute, and then store to output
+    // type
+    NumericConverter<ElementCompute, ElementAccumulator, Round>
+        accumulator_converter;
+    [[maybe_unused]] NumericConverter<ElementCompute, ElementC, Round>
+        source_converter;
     NumericConverter<ElementD, ElementCompute, Round> destination_converter;
 
     // Convert to destination numeric type
@@ -357,16 +358,17 @@ class LinearCombinationVariadic {
 
     if constexpr (Scale == ScaleType::NoBetaScaling) {
       intermediate = source_converter(source);
-    }
-    else {
-      intermediate = multiply(params_.beta, source);                            // X =  beta * C + uniform
+    } else {
+      intermediate = multiply(params_.beta, source);  // X =  beta * C + uniform
     }
 
-    intermediate = madd(params_.alpha, converted_accumulator, intermediate);    // D = alpha * Accum + X
+    intermediate = madd(params_.alpha,
+                        converted_accumulator,
+                        intermediate);  // D = alpha * Accum + X
     intermediate = variadic_op(
-                  intermediate, 
-                  params_.variadic_args, 
-                  BatchedMatrixCoord(batch, row_offset, column_offset, valid));
+        intermediate,
+        params_.variadic_args,
+        BatchedMatrixCoord(batch, row_offset, column_offset, valid));
     return destination_converter(intermediate);
   }
 
@@ -376,9 +378,10 @@ class LinearCombinationVariadic {
                       int row_offset,
                       int column_offset,
                       bool valid) const {
-                      
-    // Convert everything to Compute type, do compute, and then store to output type
-    NumericConverter<ElementCompute, ElementAccumulator, Round> accumulator_converter;
+    // Convert everything to Compute type, do compute, and then store to output
+    // type
+    NumericConverter<ElementCompute, ElementAccumulator, Round>
+        accumulator_converter;
     NumericConverter<ElementD, ElementCompute, Round> destination_converter;
     ElementCompute converted_accumulator = accumulator_converter(accumulator);
 
@@ -392,14 +395,13 @@ class LinearCombinationVariadic {
     multiplies<ElementCompute> multiply;
     VariadicOp<ElementCompute> variadic_op;
 
-    intermediate = multiply(params_.alpha, accumulator);    // D = alpha * Accum
+    intermediate = multiply(params_.alpha, accumulator);  // D = alpha * Accum
     intermediate = variadic_op(
-                      intermediate, 
-                      params_.variadic_args, 
-                      BatchedMatrixCoord(batch, row_offset, column_offset, valid));
+        intermediate,
+        params_.variadic_args,
+        BatchedMatrixCoord(batch, row_offset, column_offset, valid));
     return destination_converter(intermediate);
   }
-  
 };
 
 }  // namespace thread
diff --git a/backends/iluvatar_gpu/apy/matmul/matmul.h b/backends/iluvatar_gpu/apy/matmul/matmul.h
index dcd8b5caae..d2794859f0 100644
--- a/backends/iluvatar_gpu/apy/matmul/matmul.h
+++ b/backends/iluvatar_gpu/apy/matmul/matmul.h
@@ -14,14 +14,14 @@
 
 #pragma once
 
-#include <iostream>
-#include <map>
-#include <vector>
-
 #include <cuda.h>
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 
+#include <iostream>
+#include <map>
+#include <vector>
+
 #define CHECK_CUDA(func)                                                      \
   {                                                                           \
     cudaError_t err = func;                                                   \
diff --git a/backends/iluvatar_gpu/apy/matmul/params.h b/backends/iluvatar_gpu/apy/matmul/params.h
index 035ceb01a0..6be8a242b6 100644
--- a/backends/iluvatar_gpu/apy/matmul/params.h
+++ b/backends/iluvatar_gpu/apy/matmul/params.h
@@ -71,7 +71,7 @@ struct GemmEpilogueParams {
   const void *bias;
   void *output;
 
-  cudaStream_t* stream_ptr;
+  cudaStream_t *stream_ptr;
 
   std::vector<int64_t> input0_shape;
   std::vector<int64_t> input1_shape;
@@ -81,7 +81,7 @@ struct GemmEpilogueParams {
   std::vector<std::vector<int64_t>> epilogue_out_shapes;
 
   GemmEpilogueParams() {}
-  GemmEpilogueParams(void* stream_ptr,
+  GemmEpilogueParams(void *stream_ptr,
                      const void *input,
                      const void *weight,
                      const void *bias,
@@ -91,7 +91,7 @@ struct GemmEpilogueParams {
                      const std::vector<int64_t> &bias_shape,
                      bool transpose_a = false,
                      bool transpose_b = false)
-      : stream_ptr(reinterpret_cast<cudaStream_t*>(stream_ptr)),
+      : stream_ptr(reinterpret_cast<cudaStream_t *>(stream_ptr)),
         input(input),
         weight(weight),
         bias(bias),
@@ -136,7 +136,8 @@ struct GemmEpilogueParams {
     std::cout << "-- [GemmEpilogueParams] weight: " << weight << std::endl;
     std::cout << "-- [GemmEpilogueParams] bias: " << bias << std::endl;
     std::cout << "-- [GemmEpilogueParams] output: " << output << std::endl;
-    std::cout << "-- [GemmEpilogueParams] stream_str: " << stream_str << std::endl;
+    std::cout << "-- [GemmEpilogueParams] stream_str: " << stream_str
+              << std::endl;
 #endif
 
     shape_args.batch_stride_A = m * k;
diff --git a/backends/iluvatar_gpu/test_ap/set_env_ap.sh b/backends/iluvatar_gpu/test_ap/set_env_ap.sh
index fd7430883f..ac011cb74a 100644
--- a/backends/iluvatar_gpu/test_ap/set_env_ap.sh
+++ b/backends/iluvatar_gpu/test_ap/set_env_ap.sh
@@ -1,5 +1,19 @@
 #!/bin/bash
 
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 export FLAGS_prim_enable_dynamic=true 
 export FLAGS_prim_all=true
 
diff --git a/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py b/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py
index 7bc3bbd3ce..63ea7bc6f9 100644
--- a/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py
+++ b/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py
@@ -1,7 +1,19 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
-import subprocess
 import unittest
-import time
 
 import numpy as np
 
@@ -13,7 +25,7 @@
 
 # os.environ["AP_WORKSPACE_DIR"] = "/tmp/paddle/ap"
 
-DT = 'float16'
+DT = "float16"
 # BS = 4
 # MS = 65536
 # NS = 32
@@ -39,6 +51,7 @@
 NS = 192
 KS = 768
 
+
 class TestMatmulEpilogue(unittest.TestCase):
     def setUp(self):
         dtype = DT
@@ -70,7 +83,6 @@ def setUp(self):
         self.mask = paddle.randn(mask_shape, dtype=dtype)
         self.mask.stop_gradient = True
 
-
     def get_subgraph(self):
         B = pct.DimVar(BS)
         M = pct.DimVar(MS)
@@ -90,7 +102,7 @@ def matmul_add_act(
             # return paddle.nn.functional.sigmoid(out)
             return paddle.nn.functional.relu(out)
 
-        #return matmul_add_act
+        # return matmul_add_act
 
         def matmul_add_divide_multipy_add_S1(
             x: pct.Tensor([B, M, K], T),
@@ -104,7 +116,7 @@ def matmul_add_divide_multipy_add_S1(
             # out = out / 1.2
             out = out * mask
             return residual + out
-        
+
         return matmul_add_divide_multipy_add_S1
 
     def test_subgraph(self):
@@ -117,8 +129,7 @@ def test_subgraph(self):
         # dy_outs = foo(self.x, self.y, self.b, self.b1)
         ap_outs = fused_foo(self.x, self.y, self.bias, self.residual, self.mask)
         dy_outs = foo(self.x, self.y, self.bias, self.residual, self.mask)
-        #return
-
+        # return
 
         # -------- 性能测试部分 --------
         iters = 10
@@ -139,7 +150,7 @@ def test_subgraph(self):
         with profiler.Profiler(
             targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
             on_trace_ready=profiler.export_chrome_tracing("./profiler_log"),
-            timer_only = False
+            timer_only=False,
         ) as prof:
             for step in range(iters):
                 # _ = fused_foo(self.x, self.y, self.b, self.b1)
@@ -147,13 +158,16 @@ def test_subgraph(self):
                 # _ = foo(self.x, self.y, self.b, self.b1)
                 prof.step()
         print("[Profiler] Trace saved to ./profiler_log")
-        prof.summary(sorted_by=profiler.SortedKeys.GPUTotal,
-             op_detail=True,
-             thread_sep=False,
-             time_unit='us')
-        
+        prof.summary(
+            sorted_by=profiler.SortedKeys.GPUTotal,
+            op_detail=True,
+            thread_sep=False,
+            time_unit="us",
+        )
+
         for dy_out, ap_out in zip(dy_outs, ap_outs):
             np.testing.assert_allclose(dy_out, ap_out, rtol=5e-2, atol=1e-1)
 
+
 if __name__ == "__main__":
     unittest.main()

From 4ff9c62231ffd7806b0a0cc276b16e0c04089444 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Tue, 14 Apr 2026 13:25:04 +0800
Subject: [PATCH 5/6] Move test to the unittest directory.

---
 ...1_epilogue_vectorized_perwarp_variadic.hpp |  32 +----
 backends/iluvatar_gpu/test_ap/set_env_ap.sh   |  41 ------
 .../test_ap_matmul_epilogue_iluvatar.py}      | 117 ++++++++----------
 3 files changed, 51 insertions(+), 139 deletions(-)
 delete mode 100644 backends/iluvatar_gpu/test_ap/set_env_ap.sh
 rename backends/iluvatar_gpu/{test_ap/test_matmul_epilogue.py => tests/unittests/test_ap_matmul_epilogue_iluvatar.py} (59%)

diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp
index e714983847..269399222e 100644
--- a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp
+++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp
@@ -201,7 +201,7 @@ class EpilogueVariadic<
   }
 
   CUTLASS_HOST_DEVICE
-  EpilogueVariadic(Params const& params_)
+  explicit EpilogueVariadic(Params const& params_)
       : params(params_), epilogue_op(params_.epilogue_op) {}
 
   CUTLASS_DEVICE
@@ -223,7 +223,7 @@ class EpilogueVariadic<
                                  ResidueMNK residue_mnk,
                                  int thread_idx,
                                  char* smem_buf) {
-    using namespace cute;
+    using namespace cute;  // NOLINT
     using X = Underscore;
 
     static_assert(cute::rank(ProblemShapeMNKL{}) == 4,
@@ -383,34 +383,6 @@ class EpilogueVariadic<
         flat_divide(cD, EpilogueTile{});  // (SMEM_M,SMEM_N,TILE_M,TILE_N)
     Tensor tRG_cD = thread_r2g.partition_D(cDt);
 
-#if 0
-    if (thread_idx == 0 && m_coord == 0 && n_coord == 0) {
-      print("aC   : "); print(accumulators.layout()); print("\n");
-      // print("gC   : "); print(gC.layout()); print("\n");
-      print("gD   : "); print(gD.layout()); print("\n");
-      // print("gBias   : "); print(gBias.layout()); print("\n");
-      print("sAcc   : "); print(sAcc.layout()); print("\n");
-      // print("rAcc   : "); print(rAcc.layout()); print("\n");
-      print("\n");
-      // print("tRS_rAcc : "); print(tRS_rAcc.layout()); print("\n");
-      print("tRS_sAcc : "); print(tRS_sAcc.layout()); print("\n");
-      print("\n");
-      print("tSR_sAcc : "); print(tSR_sAcc.layout()); print("\n");
-      print("tSR_rAcc : "); print(tSR_rAcc.layout()); print("\n");
-      print("\n");
-      print("tRR_rSrc : "); print(tRR_rSrc.layout()); print("\n");
-      print("tRR_rDst : "); print(tRR_rDst.layout()); print("\n");
-      print("\n");
-      print("tRG_rAcc : "); print(tRG_rAcc.layout()); print("\n");
-      print("tRG_gD   : "); print(tRG_gD.layout()); print("\n");
-      print("tRS_cC   : "); print(tRS_cC.layout()); print("\n");
-      print("cCt      : "); print(cCt.layout()); print("\n");
-      print("cD       : "); print(cD.layout()); print("\n");
-      print("tE_D     : "); print(tE_D.layout()); print("\n");
-      print("sAcc     : "); print(sAcc.layout()); print("\n");
-      print("tErAcc    : "); print(tErAcc.layout()); print("\n");
-    }
-#endif
     CUTLASS_PRAGMA_UNROLL
     for (int epi_tile_m = 0; epi_tile_m < size<2>(tEgD).value; ++epi_tile_m) {
       CUTLASS_PRAGMA_UNROLL
diff --git a/backends/iluvatar_gpu/test_ap/set_env_ap.sh b/backends/iluvatar_gpu/test_ap/set_env_ap.sh
deleted file mode 100644
index ac011cb74a..0000000000
--- a/backends/iluvatar_gpu/test_ap/set_env_ap.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-export FLAGS_prim_enable_dynamic=true 
-export FLAGS_prim_all=true
-
-# CINN related FLAG
-export FLAGS_use_cinn=false
-export FLAGS_group_schedule_tiling_first=true
-# PIR mode
-export FLAGS_enable_pir_api=true
-
-# print Program IR
-export FLAGS_print_ir=true
-
-# debug log
-export GLOG_v=0
-export GLOG_vmodule=ap_generic_drr_pass=6
-
-export CUDA_VISIBLE_DEVICES=11
-export FLAGS_enable_ap=1
-
-PADDLE_ROOT="${PADDLE_ROOT:-/path/to/your/paddle/build}"
-export PYTHONPATH="${PADDLE_ROOT}/python:$PYTHONPATH"
-export AP_WORKSPACE_DIR="/tmp/ap_workspace"
-export AP_PATH="${PADDLE_ROOT}/python/paddle/apy/sys:${PADDLE_ROOT}/python/paddle/apy/matmul_pass:$AP_PATH"
-
-python test_matmul_epilogue.py 2>&1 | tee output.log
diff --git a/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py b/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py
similarity index 59%
rename from backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py
rename to backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py
index 63ea7bc6f9..1b49c13a31 100644
--- a/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py
+++ b/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py
@@ -14,38 +14,25 @@
 
 import os
 import unittest
-
 import numpy as np
+from pathlib import Path
 
 import paddle
 import paddle.incubate.cc as pcc
 import paddle.incubate.cc.typing as pct
-
 import paddle.profiler as profiler
 
-# os.environ["AP_WORKSPACE_DIR"] = "/tmp/paddle/ap"
+
+os.environ["AP_WORKSPACE_DIR"] = "/tmp/paddle/ap_workspace"
+
+
+def GetPirProgram(fused_func, tensor_args):
+    dtypes = tuple(tensor.dtype for tensor in tensor_args)
+    func = fused_func.func_overload_ctx.dtypes2func.get(dtypes, None)
+    return str(func.infer_program.forward_program)
+
 
 DT = "float16"
-# BS = 4
-# MS = 65536
-# NS = 32
-# KS = 128
-# BS = 1
-# MS = 128
-# NS = 64
-# KS = 128
-# BS = 1
-# MS = 64
-# NS = 768
-# KS = 768
-# BS = 4
-# MS = 64
-# NS = 3072
-# KS = 768
-# BS = 4
-# MS = 128
-# NS = 32
-# KS = 128
 BS = 4
 MS = 784
 NS = 192
@@ -67,10 +54,6 @@ def setUp(self):
         self.b = paddle.randn(b_shape, dtype=dtype)
         self.b.stop_gradient = True
 
-        b1_shape = [1]
-        self.b1 = paddle.randn(b1_shape, dtype=dtype)
-        self.b1.stop_gradient = True
-
         bias_shape = [NS]
         self.bias = paddle.randn(bias_shape, dtype=dtype)
         self.bias.stop_gradient = True
@@ -83,7 +66,7 @@ def setUp(self):
         self.mask = paddle.randn(mask_shape, dtype=dtype)
         self.mask.stop_gradient = True
 
-    def get_subgraph(self):
+    def get_matmul_add_act(self):
         B = pct.DimVar(BS)
         M = pct.DimVar(MS)
         K = pct.DimVar(KS)
@@ -94,17 +77,22 @@ def matmul_add_act(
             x: pct.Tensor([B, M, K], T),
             y: pct.Tensor([K, N], T),
             b: pct.Tensor([B, M, N], T),
-            b1: pct.Tensor([1], T),
         ):
 
             out = paddle.matmul(x, y)
             out = out + b
-            # return paddle.nn.functional.sigmoid(out)
             return paddle.nn.functional.relu(out)
 
-        # return matmul_add_act
+        return matmul_add_act
+
+    def get_matmul_add_divide_multipy_add(self):
+        B = pct.DimVar(BS)
+        M = pct.DimVar(MS)
+        K = pct.DimVar(KS)
+        N = pct.DimVar(NS)
+        T = pct.DTypeVar("T", DT)
 
-        def matmul_add_divide_multipy_add_S1(
+        def matmul_add_divide_multipy_add(
             x: pct.Tensor([B, M, K], T),
             y: pct.Tensor([K, N], T),
             bias: pct.Tensor([N], T),
@@ -117,54 +105,47 @@ def matmul_add_divide_multipy_add_S1(
             out = out * mask
             return residual + out
 
-        return matmul_add_divide_multipy_add_S1
+        return matmul_add_divide_multipy_add
 
-    def test_subgraph(self):
-        foo = self.get_subgraph()
-        fused_foo = pcc.compile(
-            foo, ap_path=f"{os.path.dirname(paddle.__file__)}/apy/matmul_pass"
-        )
+    def check_if_ap_variadic_exist(self, fused_foo, foo_args):
+        generated_pir_program = GetPirProgram(fused_foo, foo_args)
+        assert (
+            "pd_op.ap_variadic" in generated_pir_program
+        ), "AP fusion failed, none pd_op.ap_variadic found in the pir_program."
 
-        # ap_outs = fused_foo(self.x, self.y, self.b, self.b1)
-        # dy_outs = foo(self.x, self.y, self.b, self.b1)
-        ap_outs = fused_foo(self.x, self.y, self.bias, self.residual, self.mask)
-        dy_outs = foo(self.x, self.y, self.bias, self.residual, self.mask)
-        # return
+    def check_by_profiler(self, fused_foo, foo_args):
+        paddle.device.synchronize()
 
-        # -------- 性能测试部分 --------
         iters = 10
-        # warmup
-        # _ = fused_foo(self.x, self.y, self.b, self.b1)
-        # _ = foo(self.x, self.y, self.b, self.b1)
-
-        # paddle.device.synchronize()
-        # start = time.time()
-        # # for _ in range(iters):
-        #     # _ = fused_foo(self.x, self.y, self.b, self.b1)
-        # paddle.device.synchronize()
-        # end = time.time()
-        # avg_time = (end - start) / iters
-        # print(f"[Performance] Avg latency per run: {avg_time:.6f} s")
-
-        # profiler (保存到 log_dir)
         with profiler.Profiler(
             targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
             on_trace_ready=profiler.export_chrome_tracing("./profiler_log"),
             timer_only=False,
         ) as prof:
-            for step in range(iters):
-                # _ = fused_foo(self.x, self.y, self.b, self.b1)
-                _ = fused_foo(self.x, self.y, self.bias, self.residual, self.mask)
-                # _ = foo(self.x, self.y, self.b, self.b1)
+            for _ in range(iters):
+                _ = fused_foo(*foo_args)
                 prof.step()
-        print("[Profiler] Trace saved to ./profiler_log")
-        prof.summary(
-            sorted_by=profiler.SortedKeys.GPUTotal,
-            op_detail=True,
-            thread_sep=False,
-            time_unit="us",
+            prof.summary()
+
+    def test_subgraph(self):
+        foo = self.get_matmul_add_act()
+        foo_args = (self.x, self.y, self.b)
+
+        # foo = self.get_matmul_add_divide_multipy_add()
+        # foo_args = (self.x, self.y, self.bias, self.residual, self.mask)
+
+        iluvatar_gpu_dir = Path(__file__).resolve().parent.parent.parent
+        fused_foo = pcc.compile(
+            foo,
+            ap_path=f"{iluvatar_gpu_dir}/apy/device",
+            backend_device="custom_device",
         )
 
+        self.check_if_ap_variadic_exist(fused_foo, foo_args)
+        self.check_by_profiler(fused_foo, foo_args)
+
+        ap_outs = fused_foo(*foo_args)
+        dy_outs = foo(*foo_args)
         for dy_out, ap_out in zip(dy_outs, ap_outs):
             np.testing.assert_allclose(dy_out, ap_out, rtol=5e-2, atol=1e-1)
 

From 8156bf3268c48d02da2368af5bd2715392d4d327 Mon Sep 17 00:00:00 2001
From: Liu Yiqun <liuyiqun01@baidu.com>
Date: Wed, 15 Apr 2026 10:32:41 +0800
Subject: [PATCH 6/6] Update profiler.

---
 .../tests/unittests/test_ap_matmul_epilogue_iluvatar.py    | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py b/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py
index 1b49c13a31..ccb41f116a 100644
--- a/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py
+++ b/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py
@@ -125,7 +125,12 @@ def check_by_profiler(self, fused_foo, foo_args):
             for _ in range(iters):
                 _ = fused_foo(*foo_args)
                 prof.step()
-            prof.summary()
+        prof.summary(
+            sorted_by=profiler.SortedKeys.GPUTotal,
+            op_detail=True,
+            thread_sep=False,
+            time_unit="us",
+        )
 
     def test_subgraph(self):
         foo = self.get_matmul_add_act()