From 755aef2254bc3ea7a2489c7ef1adc6103aba5457 Mon Sep 17 00:00:00 2001 From: denghaodong Date: Tue, 17 Mar 2026 19:05:39 +0800 Subject: [PATCH 1/6] ap support customdevice --- backends/iluvatar_gpu/CMakeLists.txt | 13 ++ backends/iluvatar_gpu/test_ap/set_env_ap.sh | 27 +++ .../test_ap/test_matmul_epilogue.py | 159 ++++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100644 backends/iluvatar_gpu/test_ap/set_env_ap.sh create mode 100644 backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt index 06512de6d6..8e0379fdf9 100644 --- a/backends/iluvatar_gpu/CMakeLists.txt +++ b/backends/iluvatar_gpu/CMakeLists.txt @@ -41,6 +41,8 @@ if(WITH_FLAGCX) add_definitions("-DPADDLE_WITH_FLAGCX") include(external/flagcx) endif() + +add_definitions("-DPADDLE_WITH_CINN") set(PLUGIN_VERSION ${PADDLE_VERSION}) set(PROTO_FILE "${PADDLE_SOURCE_DIR}/paddle/phi/core/external_error.proto") @@ -817,6 +819,17 @@ list( file( GLOB_RECURSE CC_SRCS RELATIVE ${CMAKE_SOURCE_DIR} + # 增加ap依赖项 + #${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/ap_variadic_kernel.cc + #${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/kernel_dispatch_helper.cc + #${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/ap_infer_meta_helper.cc + #${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/kernel_define_helper.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/axpr/*.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/fs/*.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/code_module/*.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/code_gen/*.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/kernel_dispatch/*.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/*.cc runtime/runtime.cc runtime/iluvatar_context.cc common/*.cc diff --git a/backends/iluvatar_gpu/test_ap/set_env_ap.sh b/backends/iluvatar_gpu/test_ap/set_env_ap.sh new file mode 100644 index 0000000000..fd7430883f --- /dev/null +++ b/backends/iluvatar_gpu/test_ap/set_env_ap.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +export FLAGS_prim_enable_dynamic=true +export FLAGS_prim_all=true + +# CINN related FLAG +export FLAGS_use_cinn=false +export FLAGS_group_schedule_tiling_first=true +# PIR mode +export FLAGS_enable_pir_api=true + +# print Program IR +export FLAGS_print_ir=true + +# debug log +export GLOG_v=0 +export GLOG_vmodule=ap_generic_drr_pass=6 + +export CUDA_VISIBLE_DEVICES=11 +export FLAGS_enable_ap=1 + +PADDLE_ROOT="${PADDLE_ROOT:-/path/to/your/paddle/build}" +export PYTHONPATH="${PADDLE_ROOT}/python:$PYTHONPATH" +export AP_WORKSPACE_DIR="/tmp/ap_workspace" +export AP_PATH="${PADDLE_ROOT}/python/paddle/apy/sys:${PADDLE_ROOT}/python/paddle/apy/matmul_pass:$AP_PATH" + +python test_matmul_epilogue.py 2>&1 | tee output.log diff --git a/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py b/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py new file mode 100644 index 0000000000..7bc3bbd3ce --- /dev/null +++ b/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py @@ -0,0 +1,159 @@ +import os +import subprocess +import unittest +import time + +import numpy as np + +import paddle +import paddle.incubate.cc as pcc +import paddle.incubate.cc.typing as pct + +import paddle.profiler as profiler + +# os.environ["AP_WORKSPACE_DIR"] = "/tmp/paddle/ap" + +DT = 'float16' +# BS = 4 +# MS = 65536 +# NS = 32 +# KS = 128 +# BS = 1 +# MS = 128 +# NS = 64 +# KS = 128 +# BS = 1 +# MS = 64 +# NS = 768 +# KS = 768 +# BS = 4 +# MS = 64 +# NS = 3072 +# KS = 768 +# BS = 4 +# MS = 128 +# NS = 32 +# KS = 128 +BS = 4 +MS = 784 +NS = 192 +KS = 768 + +class TestMatmulEpilogue(unittest.TestCase): + def setUp(self): + dtype = DT + x_shape = [BS, MS, KS] + self.x = paddle.randn(x_shape, dtype=dtype) + self.x.stop_gradient = True + + y_shape = [KS, NS] + self.y = paddle.randn(y_shape, dtype=dtype) + self.y.stop_gradient = True + + b_shape = [BS, MS, NS] + self.b = paddle.randn(b_shape, dtype=dtype) + self.b.stop_gradient = True + + b1_shape = [1] + self.b1 = paddle.randn(b1_shape, dtype=dtype) + self.b1.stop_gradient = True + + bias_shape = [NS] + self.bias = paddle.randn(bias_shape, dtype=dtype) + self.bias.stop_gradient = True + + residual_shape = [BS, MS, NS] + self.residual = paddle.randn(residual_shape, dtype=dtype) + self.residual.stop_gradient = True + + mask_shape = [BS, MS, NS] + self.mask = paddle.randn(mask_shape, dtype=dtype) + self.mask.stop_gradient = True + + + def get_subgraph(self): + B = pct.DimVar(BS) + M = pct.DimVar(MS) + K = pct.DimVar(KS) + N = pct.DimVar(NS) + T = pct.DTypeVar("T", DT) + + def matmul_add_act( + x: pct.Tensor([B, M, K], T), + y: pct.Tensor([K, N], T), + b: pct.Tensor([B, M, N], T), + b1: pct.Tensor([1], T), + ): + + out = paddle.matmul(x, y) + out = out + b + # return paddle.nn.functional.sigmoid(out) + return paddle.nn.functional.relu(out) + + #return matmul_add_act + + def matmul_add_divide_multipy_add_S1( + x: pct.Tensor([B, M, K], T), + y: pct.Tensor([K, N], T), + bias: pct.Tensor([N], T), + residual: pct.Tensor([B, M, N], T), + mask: pct.Tensor([B, M, N], T), + ): + out = paddle.matmul(x, y) + out = out + bias + # out = out / 1.2 + out = out * mask + return residual + out + + return matmul_add_divide_multipy_add_S1 + + def test_subgraph(self): + foo = self.get_subgraph() + fused_foo = pcc.compile( + foo, ap_path=f"{os.path.dirname(paddle.__file__)}/apy/matmul_pass" + ) + + # ap_outs = fused_foo(self.x, self.y, self.b, self.b1) + # dy_outs = foo(self.x, self.y, self.b, self.b1) + ap_outs = fused_foo(self.x, self.y, self.bias, self.residual, self.mask) + dy_outs = foo(self.x, self.y, self.bias, self.residual, self.mask) + #return + + + # -------- 性能测试部分 -------- + iters = 10 + # warmup + # _ = fused_foo(self.x, self.y, self.b, self.b1) + # _ = foo(self.x, self.y, self.b, self.b1) + + # paddle.device.synchronize() + # start = time.time() + # # for _ in range(iters): + # # _ = fused_foo(self.x, self.y, self.b, self.b1) + # paddle.device.synchronize() + # end = time.time() + # avg_time = (end - start) / iters + # print(f"[Performance] Avg latency per run: {avg_time:.6f} s") + + # profiler (保存到 log_dir) + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU], + on_trace_ready=profiler.export_chrome_tracing("./profiler_log"), + timer_only = False + ) as prof: + for step in range(iters): + # _ = fused_foo(self.x, self.y, self.b, self.b1) + _ = fused_foo(self.x, self.y, self.bias, self.residual, self.mask) + # _ = foo(self.x, self.y, self.b, self.b1) + prof.step() + print("[Profiler] Trace saved to ./profiler_log") + prof.summary(sorted_by=profiler.SortedKeys.GPUTotal, + op_detail=True, + thread_sep=False, + time_unit='us') + + for dy_out, ap_out in zip(dy_outs, ap_outs): + np.testing.assert_allclose(dy_out, ap_out, rtol=5e-2, atol=1e-1) + +if __name__ == "__main__": + unittest.main() From 4d7a04ae4f4e248c5675c34ecf270c5c887eaba4 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Mon, 13 Apr 2026 15:21:08 +0800 Subject: [PATCH 2/6] Add apy. --- .../apy/device/compile_command_util.py | 50 ++ .../apy/matmul/cutlass_matmul.cuh | 290 ++++++++++++ .../cutlass_patch/batched_matrix_coord.h | 37 ++ ...1_epilogue_vectorized_perwarp_variadic.hpp | 435 ++++++++++++++++++ .../thread/linear_combination_variadic.h | 407 ++++++++++++++++ .../apy/matmul/cutlass_patch/trace_device.h | 73 +++ .../apy/matmul/default_config_id.h | 29 ++ backends/iluvatar_gpu/apy/matmul/matmul.h | 39 ++ backends/iluvatar_gpu/apy/matmul/params.h | 177 +++++++ 9 files changed, 1537 insertions(+) create mode 100644 backends/iluvatar_gpu/apy/device/compile_command_util.py create mode 100644 backends/iluvatar_gpu/apy/matmul/cutlass_matmul.cuh create mode 100644 backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h create mode 100644 backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp create mode 100644 backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h create mode 100644 backends/iluvatar_gpu/apy/matmul/cutlass_patch/trace_device.h create mode 100644 backends/iluvatar_gpu/apy/matmul/default_config_id.h create mode 100644 backends/iluvatar_gpu/apy/matmul/matmul.h create mode 100644 backends/iluvatar_gpu/apy/matmul/params.h diff --git a/backends/iluvatar_gpu/apy/device/compile_command_util.py b/backends/iluvatar_gpu/apy/device/compile_command_util.py new file mode 100644 index 0000000000..18741ca805 --- /dev/null +++ b/backends/iluvatar_gpu/apy/device/compile_command_util.py @@ -0,0 +1,50 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ap + + +class CompileCommandGenerator: + def __init__(self): + self.file_ext = "cu" + self.op_type2generate_func = ap.OrderedDict( + [ + ['matmul', self.generate_matmul_compile_command], + ] + ) + + def __call__(self, op_type, tpl_dirname, library_name): + return self.op_type2generate_func[op_type](tpl_dirname, library_name) + + def generate_matmul_compile_command(self, tpl_dirname, library_name): + parent_dir = ap.dirname(ap.dirname(__file__)) + cutlass_dir = f"{parent_dir}/matmul/cutlass" + matmul_source_dir = f"{parent_dir}/matmul" + + compile_cmd = "clang++ -x ivcore -L/usr/local/corex/lib -lcudart --cuda-path=/usr/local/corex -std=c++17 -O3 -fPIC --cuda-gpu-arch=ivcore11 -Xclang=-fcuda-allow-variadic-functions" + compile_cmd = compile_cmd + " -I " + cutlass_dir + "/include" + compile_cmd = compile_cmd + " -I " + cutlass_dir + "/tools/util/include" + compile_cmd = compile_cmd + " -I " + matmul_source_dir + compile_cmd = ( + compile_cmd + + " -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -DCUTLASS_DEBUG_TRACE_LEVEL=0 -DCUTLASS_ILUVATAR" + ) + compile_cmd = ( + compile_cmd + " -DAP_ENABLE_AUTOTUNE=0 -DAP_ENABLE_DEBUG=0" + ) + compile_cmd = ( + compile_cmd + + f" --shared {library_name}.{self.file_ext} -o lib{library_name}.so" + ) + return compile_cmd diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_matmul.cuh b/backends/iluvatar_gpu/apy/matmul/cutlass_matmul.cuh new file mode 100644 index 0000000000..f48822fce2 --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_matmul.cuh @@ -0,0 +1,290 @@ +// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "cute/atom/mma_traits.hpp" +#include "cute/numeric/integral_constant.hpp" +#include "cute/tensor.hpp" + +#include "cutlass/cutlass.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/device_memory.h" + +#include "cutlass/detail/layout.hpp" + +#include "cute/atom/copy_traits_ix11_sme.hpp" +#include "cute/atom/copy_traits_ix11.hpp" +#include "cute/atom/copy_atom.hpp" + +#include "cutlass/arch/config.h" +#include "cutlass/epilogue/dispatch_policy.hpp" + +#include "cutlass_patch/batched_matrix_coord.h" +#include "cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp" +#include "cutlass_patch/epilogue/thread/linear_combination_variadic.h" + +#include "default_config_id.h" +#include "params.h" + +#define CHECK_CUTLASS(status) \ + { \ + cutlass::Status error = status; \ + if (error != cutlass::Status::kSuccess) { \ + std::cerr << "Got cutlass error: " << cutlassGetStatusString(error) \ + << " at: " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + +namespace ap { +using bfloat16 = nv_bfloat16; + +template +using Array = cutlass::Array; + +using MatrixCoord = cutlass::BatchedMatrixCoord; + +// Convert CUDA data type to cutlass data type +template +struct CutlassDataType { + using Type = T; +}; + +template <> +struct CutlassDataType { + using Type = cutlass::half_t; +}; + +template <> +struct CutlassDataType<__nv_bfloat16> { + using Type = cutlass::bfloat16_t; +}; + +// Convert to cutlass layout +template +struct MatrixLayout { + using Type = cutlass::layout::RowMajor; +}; + +template <> +struct MatrixLayout { + using Type = cutlass::layout::ColumnMajor; +}; + +static cutlass::gemm::GemmUniversalMode GetGemmMode(int batch_count) { + return batch_count > 1 ? cutlass::gemm::GemmUniversalMode::kBatched + : cutlass::gemm::GemmUniversalMode::kGemm; +} + +template + class VariadicFunctor, + int AlignA = 64 / sizeof(ElementT), + int AlignB = 64 / sizeof(ElementT), + int ConfigId = 0> +void MatmulAddVariadic( + const GemmEpilogueParams ¶ms, + const typename VariadicFunctor::Arguments &variadic_args) { + + using namespace cute; + + using ElementAccumulator = + typename CutlassDataType::Type; // <- data type of + // accumulator + using ElementComputeEpilogue = + ElementAccumulator; // <- data type of epilogue operations + using ElementA = + typename CutlassDataType::Type; // <- data type of elements in + // input matrix A + using ElementB = + typename CutlassDataType::Type; // <- data type of elements in + // input matrix B + using ElementC = ElementA; + using ElementD = ElementA; + using ElementOutput = + typename CutlassDataType::Type; // <- data type of elements in + // output matrix D + + constexpr int AlignC = AlignB; + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = cutlass::layout::RowMajor; + + using EpilogueThreadOp = cutlass::epilogue::thread::LinearCombinationVariadic< + VariadicFunctor, ElementD, 1, ElementAccumulator, ElementAccumulator>; + + // Epilogue -------------------------------- + // Params + // Tile of C/D + using EpilogueTile = Tile< + Layout,Stride<_1,_64>>, + Layout,Stride<_1,_64>> + >; + + // Layout of smem + using SmemLayout = + ComposedLayout< + Swizzle<1,1,6>, + _0, + Layout< + Shape ,Shape <_16, _2, _4>>, + Stride,Stride< _2,_128,_1024>> + > + >; + + // G2R + using EpiG2RAtom = Copy_Atom, ElementC>; + + // R2S + using EpiR2SAtom = Copy_Atom, ElementD>; + + // S2R + using EpiS2RAtom = EpiR2SAtom; + using TiledCopyS2R = + TiledCopy< + EpiS2RAtom, + Layout< + Shape ,Shape <_2, _2,_2>>, + Stride,Stride<_4,_64,_8>>>, + decltype(product_each(shape(EpilogueTile{}))) + >; + + // R2R + using CopyAtomR2R = Copy_Atom; + + // R2G + using EpiR2GAtom = Copy_Atom, ElementD>; + + // Epilogue Collective + using CollectiveEpilogue = cutlass::epilogue::collective::EpilogueVariadic< + // cutlass::detail::TagToStrideC_t, + cutlass::detail::TagToStrideC_t, + EpilogueThreadOp, + EpilogueTile, + SmemLayout, + EpiG2RAtom, + EpiR2SAtom, + TiledCopyS2R, + CopyAtomR2R, + EpiR2GAtom, + cutlass::epilogue::EpilogueSimtVectorized>; + + using CtaTiler = Shape<_256,_256,_32>; + using TiledMma = + TiledMMA< + MMA_Atom, + Layout>, + Tile< + Layout, Stride<_1,_64,_16>>, + Layout, Stride<_1,_64,_16>>, + _16 + > + >; + + using CopyA_Op = IX11_SME_I_16x512b; + using CopyA_Atom = Copy_Atom, ElementA>; + using CopyA = decltype(make_tiled_copy(CopyA_Atom{0}, Layout>{}, Layout,Stride<_1,_16>>{})); + using SmemLayoutAtomA = IX11::Layout_SME_I_16x512b_Atom; + using SmemCopyA = Copy_Atom, ElementA>; + + using CopyB_Op = IX11_SME_I_16x512b; + using CopyB_Atom = Copy_Atom, ElementB>; + using CopyB = decltype(make_tiled_copy(CopyB_Atom{0}, Layout>{}, Layout,Stride<_1,_16>>{})); + using SmemLayoutAtomB = IX11::Layout_SME_I_16x512b_Atom; + using SmemCopyB = Copy_Atom, ElementB>; + + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + cutlass::gemm::MainloopIx11SmeUnpredicated<2>, + CtaTiler, + ElementA, cutlass::detail::TagToStrideA_t, + ElementB, cutlass::detail::TagToStrideB_t, + TiledMma, + CopyA, SmemLayoutAtomA, SmemCopyA, cute::identity, + CopyB, SmemLayoutAtomB, SmemCopyB, cute::identity + >; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + ProblemShapeType problem_shape{params.m, params.n, params.k, params.batch_count}; + + const ElementA *input = + reinterpret_cast(params.input); + const ElementB *weight = + reinterpret_cast(params.weight); + ElementOutput *output = reinterpret_cast(params.output); + + cutlass::KernelHardwareInfo hw_info; + // TODO + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.device_id = 0; + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + typename EpilogueThreadOp::Params epilogue_op_args; + epilogue_op_args.variadic_args = variadic_args; + + int m = params.m, n = params.n, k = params.k, l = params.batch_count; + + auto stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, l)); + auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, l)); + auto stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(m, n, l)); + + typename Gemm::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_shape, + {input, stride_A, weight, stride_B}, + {epilogue_op_args, output, stride_D}, + hw_info + }; + + Gemm device_gemm; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + cudaStream_t* stream_ptr = reinterpret_cast(params.stream_ptr); + + CHECK_CUTLASS(device_gemm.can_implement(arguments)); + CHECK_CUTLASS(device_gemm.initialize(arguments, workspace.get(), *stream_ptr)); + CHECK_CUTLASS(device_gemm(*stream_ptr)); + +} + +} // namespace ap diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h new file mode 100644 index 0000000000..544670b8ad --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h @@ -0,0 +1,37 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "cutlass/cutlass.h" + +namespace cutlass { + +struct BatchedMatrixCoord { + int batch; + int row; + int column; + bool is_valid; + + CUTLASS_HOST_DEVICE + BatchedMatrixCoord() : batch(0), row(0), column(0), is_valid(false) {} + + CUTLASS_HOST_DEVICE + BatchedMatrixCoord(int b, int r, int c) : batch(b), row(r), column(c), is_valid(true) {} + + CUTLASS_HOST_DEVICE + BatchedMatrixCoord(int b, int r, int c, bool valid) : batch(b), row(r), column(c), is_valid(valid) {} +}; + +}; // namespace cutlass diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp new file mode 100644 index 0000000000..a19167bf3a --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp @@ -0,0 +1,435 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + // class StrideC, + class StrideD, + class ThreadEpilogueOp, + class EpilogueTile, + class SmemLayout, + class CopyAtomG2R_, + class CopyAtomR2S, + class TiledCopyS2R, + class CopyAtomR2R, + class CopyAtomR2G, + class EpilogueScheduleType = EpilogueSimtVectorized, + class Enable = void +> +class EpilogueVariadic { + static_assert(cute::is_same_v || + cute::is_same_v, + "Could not find an epilogue specialization."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Iluvatar ix11/ix30 Epilogue Vectorized +/// 1. Load C(optional), compute D = alpha * Acc + beta * C, +/// then transform to ElementD(8/16-bit typically) +/// 2. Copy D from Register to Smem +/// 3. Load D from Smem to Register +/// 4. Register-to-Register transform +/// 5. Store D from Register to Gmem +/// +/// Support alpha and beta params, bias not supported yet. +template < + // class StrideC_, + class StrideD_, + class ThreadEpilogueOp_, + class EpilogueTile_, + class SmemLayout_, + class CopyAtomG2R_, + class CopyAtomR2S_, + class TiledCopyS2R_, + class CopyAtomR2R_, + class CopyAtomR2G_, + class EpilogueScheduleType_ +> +class EpilogueVariadic< + // StrideC_, + StrideD_, + ThreadEpilogueOp_, + EpilogueTile_, + SmemLayout_, + CopyAtomG2R_, + CopyAtomR2S_, + TiledCopyS2R_, + CopyAtomR2R_, + CopyAtomR2G_, + EpilogueScheduleType_, + cute::enable_if_t< + cute::is_same_v + > + > { +public: + // + // Type Aliases + // + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementD = typename ThreadEpilogueOp::ElementOutput; + using StrideD = StrideD_; + using ElementC = ElementD; // for GemmUniversal + using StrideC = StrideD; + using ElementBias = typename detail::IsThreadEpilogueOpWithBias::type; + using EpilogueTile = EpilogueTile_; + using SmemLayout = SmemLayout_; + using CopyAtomG2R = CopyAtomG2R_; + using CopyAtomR2S = CopyAtomR2S_; + using TiledCopyS2R = TiledCopyS2R_; + using CopyAtomR2R = CopyAtomR2R_; + using CopyAtomR2G = CopyAtomR2G_; + + // using GmemTiledCopyC = void; + using GmemTiledCopyD = CopyAtomR2G; + + static constexpr bool IsEpilogueBiasSupported = detail::IsThreadEpilogueOpWithBias::value; + using StrideBias = cute::conditional_t(), Stride<_1,_0,int64_t>, Stride<_0,_1,int64_t>>; + + // static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + + struct SharedStorage + { + cute::array_aligned> smem_epilogue; + }; + + struct Arguments { + typename ThreadEpilogueOp::Params epilogue_op{}; + // using StrideBias = decltype(thread.dBias); + // ElementC const* ptr_C = nullptr; + // StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + }; + + // Device side epilogue params + template + struct ParamsType { + typename ThreadEpiOp::Params epilogue_op{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + }; + + using Params = ParamsType; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + [[maybe_unused]] ProblemShape const& _, + Arguments const& args, + [[maybe_unused]] void* workspace) { + + return { + args.epilogue_op, + args.ptr_D, + args.dD, + }; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + return true; + } + + CUTLASS_HOST_DEVICE + EpilogueVariadic(Params const& params_) + : params(params_), epilogue_op(params_.epilogue_op) { } + + CUTLASS_DEVICE + bool + is_source_needed() { + return epilogue_op.is_source_needed(); + } + + template< + class ProblemShapeMNKL, + class BlockShapeMNK, + class BlockCoordMNKL, + class FrgEngine, class FrgLayout, + class TiledMma, + class ResidueMNK + > + CUTLASS_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor const& accumulators, // (MMA,MMA_M,MMA_N) + TiledMma tiled_mma, + ResidueMNK residue_mnk, + int thread_idx, + char* smem_buf) { + using namespace cute; + using X = Underscore; + + static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "ThreadBlock tile shape must be static"); + static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); + static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3"); + + static_assert(rank(EpilogueTile{}) == 2, "Rank of EpilogueTile should be 2"); + static_assert(rank(SmemLayout{}) == 2, "Rank of SmemLayout should be 2"); + CUTE_STATIC_ASSERT(size(SmemLayout{}) == size<0>(EpilogueTile{}) * size<1>(EpilogueTile{})); + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + + // Represent the full output tensor + // Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), params.dC); // (m,n,l) + Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), params.dD); // (m,n,l) + // Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_Bias), make_shape(M,N,L), params.dBias); // (m,n,l) + + // Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + // Tensor gBias_mnl = local_tile(mBias_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + // Slice to get the tile this CTA is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; + // Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + // Tensor gBias = gBias_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + + // Construct a tensor in SMEM that we can partition for rearranging data + SharedStorage& storage = *reinterpret_cast(smem_buf); + + // Common part (with sm70 epilogue) end-------------------------------- + // init register transform + // TV -> MN(logical) + auto layout_gd_tv = tiled_mma.thrfrg_C(make_layout(gD.shape())); + + // G2R / R2S ------------------------------------- + // In G2R / R2S tiling, we assemble the final V->MN layout out from given epilogue tile and mma layout + + // epilogue tile of logical C + auto tE_D = flat_divide(make_layout(product_each(gD.shape())), EpilogueTile{}); + + using EpilogueTileSize = decltype(product_each(shape(EpilogueTile{}))); + + Tensor sAcc = make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{}); + + auto r2s_mn_tiler_tv = take<0,2>(right_inverse(layout_gd_tv).compose(tE_D)); // epilogue logical mn->tv + // get tiler of TV from tiler of MN + auto r2s_tv_tiler_shape = product_each(layout_gd_tv.shape()); + auto r2s_tv_zeros = repeat_like(r2s_tv_tiler_shape, Int<0>{}); // T and V + auto r2s_tv_tiler = cute::transform(make_seq{}, [&](auto i) { + auto tiler_origin = filter(composition(make_layout(r2s_tv_tiler_shape, replace(r2s_tv_zeros, Int<1>{})), r2s_mn_tiler_tv)); + auto complemented = complement(tiler_origin, get(r2s_tv_tiler_shape)); + return cute::layout<1>( + // divide complement to make it monotonic + zipped_divide(make_layout(shape(r2s_tv_tiler_shape)), complemented)); + }); + + auto r2s_TV_D_tiler = left_inverse(take<0,2>(tE_D)).compose( // tv -> mn in epilogue tiler + layout_gd_tv.compose( // tv -> mn in C + make_layout(r2s_tv_tiler_shape).compose(r2s_tv_tiler))); // tv_tiler -> tv coord + + // G2R ---------------------------------------- + using TiledCopyG2R = TiledCopy; + auto thread_g2r = TiledCopyG2R::get_slice(thread_idx); + // auto tEgD = flat_divide(gC, EpilogueTile{}); + // auto tGR_gC = thread_g2r.partition_S(tEgD); + auto cC = make_identity_tensor(make_shape(size<0>(gD), size<1>(gD))); // same shape as gC/gD + auto cCt = flat_divide(cC, EpilogueTile{}); + auto tRS_cC = thread_g2r.partition_S(cCt); + + // R2S ---------------------------------------- + using TiledCopyR2S = TiledCopy; + auto tiled_r2s = TiledCopyR2S{}; + + auto tile_RS_R = + cute::layout<1>( // just V + make_layout( + make_shape(typename TiledCopyR2S::TiledNumThr{},typename TiledCopyR2S::TiledNumVal{}),make_stride(_0{},_1{})) + .compose(group<1,3>( // V_copy -> V_fragment + right_inverse(layout_gd_tv) // TV_copy -> TV_fragment + .compose(tiled_r2s.tidfrg_S(tE_D))))) + (_,repeat>(_)); // TV_copy -> MN + + auto thread_r2s = tiled_r2s.get_slice(thread_idx); + auto tErAcc = make_tensor(accumulators.data(), tile_RS_R); + auto tRS_sAcc = thread_r2s.partition_D(sAcc); + + // S2R ---------------------------------------- + auto tiled_s2r = TiledCopyS2R{}; + auto thread_s2r = tiled_s2r.get_slice(thread_idx); + auto tSR_sAcc = thread_s2r.partition_S(sAcc); + auto tSR_rAcc = make_tensor(make_layout(tSR_sAcc.shape())); + + // R2R -------------------------------------------- + auto tiled_r2r = + make_tiled_copy(CopyAtomR2R{},typename CopyAtomR2R::ThrID{},make_layout(typename TiledCopyR2S::TiledNumVal{})); + auto thread_r2r = tiled_r2r.get_slice(thread_idx); + auto tRR_rAcc = make_tensor(tSR_rAcc.shape()); + auto tRR_rSrc = thread_r2r.retile_S(tSR_rAcc); + auto tRR_rDst = thread_r2r.retile_D(tRR_rAcc); + + // R2G ------------------------------------------- + auto r2r_vlayout = filter(cute::layout<1>(tiled_r2r.get_layoutS_TV()).compose(cute::layout<1>(tiled_r2r.get_layoutD_TV()))); + auto r2g_tv_layout = (typename TiledCopyS2R::TiledLayout_TV{}).compose(_,r2r_vlayout); + auto tiled_r2g = TiledCopy{}; + auto thread_r2g = tiled_r2g.get_slice(thread_idx); + + auto tRG_rAcc = thread_r2g.retile_S(tRR_rDst); + auto tEgD = flat_divide(gD, EpilogueTile{}); + auto tRG_gD = thread_r2g.partition_D(tEgD); + + // Repeat the D-partitioning for coordinates and predication + Tensor cD = make_identity_tensor(make_shape(size<0>(gD),size<1>(gD))); // (BLK_M,BLK_N) -> (blk_m,blk_n) + Tensor cDt = flat_divide(cD, EpilogueTile{}); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + Tensor tRG_cD = thread_r2g.partition_D(cDt); + +#if 0 + if (thread_idx == 0 && m_coord == 0 && n_coord == 0) { + print("aC : "); print(accumulators.layout()); print("\n"); + // print("gC : "); print(gC.layout()); print("\n"); + print("gD : "); print(gD.layout()); print("\n"); + // print("gBias : "); print(gBias.layout()); print("\n"); + print("sAcc : "); print(sAcc.layout()); print("\n"); + // print("rAcc : "); print(rAcc.layout()); print("\n"); + print("\n"); + // print("tRS_rAcc : "); print(tRS_rAcc.layout()); print("\n"); + print("tRS_sAcc : "); print(tRS_sAcc.layout()); print("\n"); + print("\n"); + print("tSR_sAcc : "); print(tSR_sAcc.layout()); print("\n"); + print("tSR_rAcc : "); print(tSR_rAcc.layout()); print("\n"); + print("\n"); + print("tRR_rSrc : "); print(tRR_rSrc.layout()); print("\n"); + print("tRR_rDst : "); print(tRR_rDst.layout()); print("\n"); + print("\n"); + print("tRG_rAcc : "); print(tRG_rAcc.layout()); print("\n"); + print("tRG_gD : "); print(tRG_gD.layout()); print("\n"); + print("tRS_cC : "); print(tRS_cC.layout()); print("\n"); + print("cCt : "); print(cCt.layout()); print("\n"); + print("cD : "); print(cD.layout()); print("\n"); + print("tE_D : "); print(tE_D.layout()); print("\n"); + print("sAcc : "); print(sAcc.layout()); print("\n"); + print("tErAcc : "); print(tErAcc.layout()); print("\n"); + } +#endif + CUTLASS_PRAGMA_UNROLL + for (int epi_tile_m = 0; epi_tile_m < size<2>(tEgD).value; ++epi_tile_m) { + CUTLASS_PRAGMA_UNROLL + for (int epi_tile_n = 0; epi_tile_n < size<3>(tEgD).value; ++epi_tile_n) { + Tensor tRS_rAccmn = tErAcc(_,_,_,epi_tile_m,epi_tile_n); // ((2, (2, 2)), 1, 1) + Tensor cC_mn = tRS_cC(_,_,_,epi_tile_m, epi_tile_n); + Tensor tRS_rD = make_tensor_like(tRS_rAccmn); + + int m_cta_coord_base = m_coord * size<0>(gD); + int n_cta_coord_base = n_coord * size<1>(gD); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tRS_rAccmn); ++i) { + auto cta_coord = cC_mn(i); + tRS_rD(i) = epilogue_op(tRS_rAccmn(i), + l_coord, + m_cta_coord_base + get<0>(cta_coord), + n_cta_coord_base + get<1>(cta_coord), + elem_less(cta_coord, take<0,2>(residue_mnk))); + } + + copy(CopyAtomR2S{}, tRS_rD, tRS_sAcc); + copy(TiledCopyS2R{}, tSR_sAcc, tSR_rAcc); + copy(CopyAtomR2R{}, tRR_rSrc, tRR_rDst); + + Tensor tRG_gDmn = tRG_gD(_,_,_,epi_tile_m,epi_tile_n); + Tensor tRG_cDmn = tRG_cD(_,_,_,epi_tile_m,epi_tile_n); + CUTLASS_PRAGMA_UNROLL + for (int atom_i = 0; atom_i < size<0,1>(tRR_rAcc); ++atom_i) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tRR_rAcc); ++m) { + CUTLASS_PRAGMA_UNROLL + + for (int n = 0; n < size<2>(tRR_rAcc); ++n) { + + if (elem_less(tRG_cDmn(make_coord(0,atom_i),m,n), take<0,2>(residue_mnk))) { + copy(CopyAtomR2G{}, tRG_rAcc(make_coord(_,atom_i),m,n), tRG_gDmn(make_coord(_,atom_i),m,n)); + } + } + } + } + } + } + } + +private: + Params params; + ThreadEpilogueOp epilogue_op; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h new file mode 100644 index 0000000000..1acd9cd667 --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h @@ -0,0 +1,407 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/*! \file + \brief Functor performing linear combination operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/scale_type.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" + +#include "cutlass_patch/batched_matrix_coord.h" +#include "cutlass_patch/trace_device.h" + +namespace cutlass { +namespace epilogue { +namespace thread { + +template +struct GenericVariadicTraits { + static constexpr bool IsArgumentsNeeded = false; + struct Arguments {}; +}; + +template +struct GenericVariadicTraits { + static constexpr bool IsArgumentsNeeded = true; + using Arguments = typename VariadicOp::Arguments; +}; + +/// Applies a linear combination operator to an array of elements. +/// +/// D = VariadicOp(alpha * accumulator + beta * source) +/// +template < + template + class VariadicOp, + typename ElementOutput_, ///< Data type used to load and store tensors + int ElementsPerAccess, ///< Number of elements computed per operation. + ///< Usually it is 128/sizeof_bits, + ///< but we use 64 or 32 sometimes when there are + ///< not enough data to store + typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type + typename ElementCompute_ = + ElementOutput_, ///< Data type used to compute linear combination + ScaleType::Kind Scale = + ScaleType::Default, ///< Control Alpha and Beta scaling + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest, + bool IsHeavy = false> +class LinearCombinationVariadic { + public: + using ElementOutput = ElementOutput_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + using ElementC = ElementOutput_; + using ElementD = ElementOutput_; + + using VariadicArguments = + typename GenericVariadicTraits>::Arguments; + + static bool const kIsHeavy = IsHeavy; + static int const kElementsPerAccess = ElementsPerAccess; + static int const kCount = ElementsPerAccess; + static const ScaleType::Kind kScale = Scale; + + using FragmentOutput = Array; + using FragmentAccumulator = Array; + using FragmentSource = Array; + using FragmentCompute = Array; + + static FloatRoundStyle const kRound = Round; + + /// Host-constructable parameters structure + struct Params { + ElementCompute alpha; ///< scales accumulators + ElementCompute beta; ///< scales source tensor + ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if + ///< not null, loads it from memory + ElementCompute const *beta_ptr; ///< pointer to source scalar - if not + ///< null, loads it from memory + VariadicArguments variadic_args; + + CUTLASS_HOST_DEVICE + Params() + : alpha(ElementCompute(1)), + beta(ElementCompute(0)), + alpha_ptr(nullptr), + beta_ptr(nullptr) {} + + CUTLASS_HOST_DEVICE + Params(ElementCompute alpha, + ElementCompute beta, + VariadicArguments variadic_args_ = VariadicArguments{}) + : alpha(alpha), + beta(beta), + alpha_ptr(nullptr), + beta_ptr(nullptr), + variadic_args(variadic_args_) {} + }; + + private: + // + // Data members + // + + Params params_; + bool skip_elementwise_; + + public: + /// Constructs the function object, possibly loading from pointers in host + /// memory + CUTLASS_HOST_DEVICE + LinearCombinationVariadic(Params const ¶ms) { + params_ = params; + params_.alpha = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); + params_.beta = (params.beta_ptr ? *params.beta_ptr : params.beta); + skip_elementwise_ = false; + } + + /// Returns true if source is needed + CUTLASS_HOST_DEVICE + bool is_source_needed() const { + if (Scale == ScaleType::NoBetaScaling) + return params_.beta != ElementCompute(0); + + if (Scale == ScaleType::OnlyAlphaScaling) return false; + + if (Scale == ScaleType::Nothing) return false; + + return params_.beta != ElementCompute(0); + } + + /// Functionally required for serial reduction in the epilogue + CUTLASS_HOST_DEVICE + void set_k_partition(int k_partition, int k_partition_count) { + if (k_partition) { + params_.beta = ElementCompute(1); + } + + if (k_partition != k_partition_count - 1) { + skip_elementwise_ = true; + } + } + + /// Computes linear scaling with source: D = alpha * accumulator + beta * + /// source + CUTLASS_HOST_DEVICE + FragmentOutput operator()(FragmentAccumulator const &accumulator, + FragmentSource const &source, + int batch, + int row_offset, + int column_offset, + bool valid) const { + CUTLASS_TRACE_DEVICE( + "kElementsPerAccess: %d, row_offset: %d, column_offset: %d", + kElementsPerAccess, + row_offset, + column_offset); + + // Convert source to internal compute numeric type + NumericArrayConverter + source_converter; + NumericArrayConverter + accumulator_converter; + + FragmentCompute converted_source = source_converter(source); + FragmentCompute converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + FragmentCompute intermediate; + + multiplies mul_add_source; + multiply_add mul_add_accumulator; + VariadicOp variadic_op; + + if (Scale == ScaleType::NoBetaScaling) { + intermediate = converted_source; + // D = alpha * Accum + X + intermediate = mul_add_accumulator( + params_.alpha, converted_accumulator, intermediate); + } else if (Scale == ScaleType::Nothing) { + intermediate = converted_accumulator; + } else { + // X = beta * C + uniform + intermediate = mul_add_source(params_.beta, converted_source); + // D = alpha * Accum + X + intermediate = mul_add_accumulator( + params_.alpha, converted_accumulator, intermediate); + } + + if constexpr (GenericVariadicTraits< + VariadicOp>::IsArgumentsNeeded) { + if (!skip_elementwise_) { +#if CUTLASS_EPILOGUE_ENABLE_VECTORIZE + intermediate = variadic_op.Compute( + intermediate, + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset, valid)); +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kElementsPerAccess; ++i) { + intermediate[i] = variadic_op( + intermediate[i], + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset + i, valid)); + } +#endif + } + } else { + if (!skip_elementwise_) { +#if CUTLASS_EPILOGUE_ENABLE_VECTORIZE + intermediate = variadic_op.Compute(intermediate); +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kElementsPerAccess; ++i) { + intermediate[i] = variadic_op(intermediate[i]); + } +#endif + } + } + + // Convert to destination numeric type + NumericArrayConverter + destination_converter; + + return destination_converter(intermediate); + } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()(FragmentAccumulator const &accumulator, + int batch, + int row_offset, + int column_offset, + bool valid) const { + CUTLASS_TRACE_DEVICE( + "kElementsPerAccess: %d, row_offset: %d, column_offset: %d", + kElementsPerAccess, + row_offset, + column_offset); + + // Convert source to internal compute numeric type + NumericArrayConverter + accumulator_converter; + + FragmentCompute converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + FragmentCompute intermediate; + + multiplies mul_accumulator; + VariadicOp variadic_op; + + if (Scale == ScaleType::Nothing) { + intermediate = converted_accumulator; + } else { + // D = alpha * Accum + intermediate = mul_accumulator(params_.alpha, converted_accumulator); + } + + if constexpr (GenericVariadicTraits< + VariadicOp>::IsArgumentsNeeded) { + if (!skip_elementwise_) { +#if CUTLASS_EPILOGUE_ENABLE_VECTORIZE + intermediate = variadic_op.Compute( + intermediate, + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset, valid)); +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kElementsPerAccess; ++i) { + intermediate[i] = variadic_op( + intermediate[i], + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset + i, valid)); + } +#endif + } + } else { + if (!skip_elementwise_) { +#if CUTLASS_EPILOGUE_ENABLE_VECTORIZE + intermediate = variadic_op.Compute(intermediate); +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kElementsPerAccess; ++i) { + intermediate[i] = variadic_op(intermediate[i]); + } +#endif + } + } + + // Convert to destination numeric type + NumericArrayConverter + destination_converter; + + return destination_converter(intermediate); + } + + // + // Specializations for scalar (for use with cute::collective::DefaultEpilogue) + // + CUTLASS_HOST_DEVICE + ElementD operator()(ElementAccumulator const accumulator, + ElementC const source, + int batch, + int row_offset, + int column_offset, + bool valid) const { + + // Convert everything to Compute type, do compute, and then store to output type + NumericConverter accumulator_converter; + [[maybe_unused]] NumericConverter source_converter; + NumericConverter destination_converter; + + // Convert to destination numeric type + + ElementCompute converted_accumulator = accumulator_converter(accumulator); + if constexpr (Scale == ScaleType::Nothing) { + return destination_converter(converted_accumulator); + } + + // Perform binary operations + ElementCompute intermediate; + multiplies multiply; + multiply_add madd; + VariadicOp variadic_op; + + if constexpr (Scale == ScaleType::NoBetaScaling) { + intermediate = source_converter(source); + } + else { + intermediate = multiply(params_.beta, source); // X = beta * C + uniform + } + + intermediate = madd(params_.alpha, converted_accumulator, intermediate); // D = alpha * Accum + X + intermediate = variadic_op( + intermediate, + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset, valid)); + return destination_converter(intermediate); + } + + CUTLASS_HOST_DEVICE + ElementD operator()(ElementAccumulator const accumulator, + int batch, + int row_offset, + int column_offset, + bool valid) const { + + // Convert everything to Compute type, do compute, and then store to output type + NumericConverter accumulator_converter; + NumericConverter destination_converter; + ElementCompute converted_accumulator = accumulator_converter(accumulator); + + // Convert to destination numeric type + if constexpr (Scale == ScaleType::Nothing) { + return destination_converter(converted_accumulator); + } + + // Perform binary operations + ElementCompute intermediate; + multiplies multiply; + VariadicOp variadic_op; + + intermediate = multiply(params_.alpha, accumulator); // D = alpha * Accum + intermediate = variadic_op( + intermediate, + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset, valid)); + return destination_converter(intermediate); + } + +}; + +} // namespace thread +} // namespace epilogue +} // namespace cutlass diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/trace_device.h b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/trace_device.h new file mode 100644 index 0000000000..470e17b102 --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/trace_device.h @@ -0,0 +1,73 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if CUTLASS_DEBUG_TRACE_LEVEL + +#ifndef CUTLASS_TRACE_DEVICE +#define CUTLASS_TRACE_DEVICE(format, ...) \ + { \ + if (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && \ + threadIdx.x == 0 && threadIdx.y == 0) { \ + printf("[DEVICE][%s:%d, %s]" format "\n", \ + __FILE__, \ + __LINE__, \ + __FUNCTION__, \ + ##__VA_ARGS__); \ + } \ + } +#endif + +#ifndef CUTLASS_TRACE_DEVICE_TID_DETAIL +#define CUTLASS_TRACE_DEVICE_TID_DETAIL(bidz, bidx, tidx, format, ...) \ + { \ + if (blockIdx.x == bidx && blockIdx.y == 0 && blockIdx.z == bidz && \ + threadIdx.x == tidx && threadIdx.y == 0) { \ + printf("[DEVICE][%s:%d, %s][bid={%d,%d,%d}, tid={%d,%d,%d}]" format \ + "\n", \ + __FILE__, \ + __LINE__, \ + __FUNCTION__, \ + blockIdx.x, \ + blockIdx.y, \ + blockIdx.z, \ + threadIdx.x, \ + threadIdx.y, \ + threadIdx.z, \ + ##__VA_ARGS__); \ + } \ + } +#endif + +#ifndef CUTLASS_TRACE_DEVICE_TID +#define CUTLASS_TRACE_DEVICE_TID(format, ...) \ + { \ + CUTLASS_TRACE_DEVICE_TID_DETAIL(0, 0, 0, format, ##__VA_ARGS__) \ + CUTLASS_TRACE_DEVICE_TID_DETAIL(0, 0, 1, format, ##__VA_ARGS__) \ + CUTLASS_TRACE_DEVICE_TID_DETAIL(0, 1, 0, format, ##__VA_ARGS__) \ + } +#endif + +#else + +#ifndef CUTLASS_TRACE_DEVICE +#define CUTLASS_TRACE_DEVICE(format, ...) +#endif + +#ifndef CUTLASS_TRACE_DEVICE_TID +#define CUTLASS_TRACE_DEVICE_TID(format, ...) +#endif + +#endif diff --git a/backends/iluvatar_gpu/apy/matmul/default_config_id.h b/backends/iluvatar_gpu/apy/matmul/default_config_id.h new file mode 100644 index 0000000000..d160fad04e --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/default_config_id.h @@ -0,0 +1,29 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +//#include "all_tuning_configs.h" // NOLINT + +#pragma once + +namespace ap { + +struct DefaultConfig { + static constexpr int kConfigId = 0; + static constexpr int kSwizzleFactor = 1; + static constexpr bool kBatched = false; +}; + +} // namespace ap diff --git a/backends/iluvatar_gpu/apy/matmul/matmul.h b/backends/iluvatar_gpu/apy/matmul/matmul.h new file mode 100644 index 0000000000..dcd8b5caae --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/matmul.h @@ -0,0 +1,39 @@ +// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include +#include +#include + +#define CHECK_CUDA(func) \ + { \ + cudaError_t err = func; \ + if (err != cudaSuccess) { \ + std::cerr << "[" << __FILE__ << ":" << __LINE__ << ", " << __FUNCTION__ \ + << "] " \ + << "CUDA error(" << err << "), " << cudaGetErrorString(err) \ + << " when call " << #func << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + +#include "cutlass_matmul.cuh" // NOLINT +//#include "math_function.h" // NOLINT +//#include "profile.h" // NOLINT diff --git a/backends/iluvatar_gpu/apy/matmul/params.h b/backends/iluvatar_gpu/apy/matmul/params.h new file mode 100644 index 0000000000..035ceb01a0 --- /dev/null +++ b/backends/iluvatar_gpu/apy/matmul/params.h @@ -0,0 +1,177 @@ +// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#define ASSERT_CHECK(__cond) \ + do { \ + const bool __cond_var = (__cond); \ + if (!__cond_var) { \ + ::std::string __err_msg = ::std::string("`") + #__cond + \ + "` check failed at " + __FILE__ + ":" + \ + ::std::to_string(__LINE__); \ + throw std::runtime_error(__err_msg); \ + } \ + } while (0) + +namespace ap { + +template +struct Alignment { + static constexpr int kValue = + ((Dim % 8) == 0) ? 8 + : (((Dim % 4) == 0) ? 4 : (((Dim % 2) == 0) ? 2 : 1)); +}; + +template +struct Alignment { + static constexpr int kValue = + ((Dim % 4) == 0) ? 4 : (((Dim % 2) == 0) ? 2 : 1); +}; + +struct GemmEpilogueParams { + int batch_count; + int m; + int n; + int k; + + bool transpose_a; + bool transpose_b; + + // Shape related aruguments + struct ShapeArguments { + int64_t batch_stride_A; + int64_t batch_stride_B; + int64_t batch_stride_C; + int64_t batch_stride_D; + int64_t lda; + int64_t ldb; + int64_t ldc_bias; + int64_t ldd; + }; + + ShapeArguments shape_args; + + const void *input; + const void *weight; + const void *bias; + void *output; + + cudaStream_t* stream_ptr; + + std::vector input0_shape; + std::vector input1_shape; + std::vector epilogue_in_ptrs; + std::vector epilogue_out_ptrs; + std::vector> epilogue_in_shapes; + std::vector> epilogue_out_shapes; + + GemmEpilogueParams() {} + GemmEpilogueParams(void* stream_ptr, + const void *input, + const void *weight, + const void *bias, + void *output, + const std::vector &input_shape, + const std::vector &weight_shape, + const std::vector &bias_shape, + bool transpose_a = false, + bool transpose_b = false) + : stream_ptr(reinterpret_cast(stream_ptr)), + input(input), + weight(weight), + bias(bias), + output(output), + transpose_a(transpose_a), + transpose_b(transpose_b) { + ASSERT_CHECK(input_shape.size() >= 2U); + ASSERT_CHECK(weight_shape.size() >= 2U); + + input0_shape = input_shape; + input1_shape = weight_shape; + + batch_count = 1; + for (size_t i = 0; i < input_shape.size() - 2; ++i) { + batch_count *= input_shape[i]; + } + + if (transpose_a) { + m = input_shape[input_shape.size() - 1]; + k = input_shape[input_shape.size() - 2]; + } else { + m = input_shape[input_shape.size() - 2]; + k = input_shape[input_shape.size() - 1]; + } + if (transpose_b) { + ASSERT_CHECK(weight_shape[weight_shape.size() - 1] == k); + n = weight_shape[weight_shape.size() - 2]; + } else { + ASSERT_CHECK(weight_shape[weight_shape.size() - 2] == k); + n = weight_shape[weight_shape.size() - 1]; + } + + if (bias) { + ASSERT_CHECK(bias_shape.size() >= 1U); + ASSERT_CHECK(bias_shape[bias_shape.size() - 1] == n); + } + +#if AP_ENABLE_DEBUG + std::cout << "-- [GemmEpilogueParams] batch_count: " << batch_count + << ", m: " << m << ", n: " << n << ", k: " << k << std::endl; + std::cout << "-- [GemmEpilogueParams] input: " << input << std::endl; + std::cout << "-- [GemmEpilogueParams] weight: " << weight << std::endl; + std::cout << "-- [GemmEpilogueParams] bias: " << bias << std::endl; + std::cout << "-- [GemmEpilogueParams] output: " << output << std::endl; + std::cout << "-- [GemmEpilogueParams] stream_str: " << stream_str << std::endl; +#endif + + shape_args.batch_stride_A = m * k; + shape_args.batch_stride_B = (weight_shape.size() == 2) ? 0 : n * k; + shape_args.batch_stride_D = m * n; + + shape_args.lda = transpose_a ? m : k; + shape_args.ldb = transpose_b ? k : n; + shape_args.ldd = n; + + bool is_C_bias = bias_shape.size() == 1UL; + + /// Only available in RRR format + shape_args.batch_stride_C = (!bias || is_C_bias) ? 0 : m * n; + shape_args.ldc_bias = (!bias || is_C_bias) ? 0 : n; + } + + void SetEpilogues(const std::vector &in_ptrs, + const std::vector &out_ptrs) { + epilogue_in_ptrs = in_ptrs; + epilogue_out_ptrs = out_ptrs; + } + + void SetEpilogueAndShapes( + const std::vector &in_ptrs, + const std::vector> &in_shapes, + const std::vector &out_ptrs, + const std::vector> &out_shapes) { + ASSERT_CHECK(in_ptrs.size() == in_shapes.size()); + epilogue_in_ptrs = in_ptrs; + epilogue_in_shapes = in_shapes; + ASSERT_CHECK(out_ptrs.size() == out_shapes.size()); + epilogue_out_ptrs = out_ptrs; + epilogue_out_shapes = out_shapes; + } +}; + +} // namespace ap From cdd2a1854f7aa2e0899c1bbfcee1f5df2f57fd81 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 14 Apr 2026 10:28:33 +0800 Subject: [PATCH 3/6] Update cmake. --- backends/iluvatar_gpu/CMakeLists.txt | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt index 8e0379fdf9..27bf8d03d8 100644 --- a/backends/iluvatar_gpu/CMakeLists.txt +++ b/backends/iluvatar_gpu/CMakeLists.txt @@ -41,8 +41,6 @@ if(WITH_FLAGCX) add_definitions("-DPADDLE_WITH_FLAGCX") include(external/flagcx) endif() - -add_definitions("-DPADDLE_WITH_CINN") set(PLUGIN_VERSION ${PADDLE_VERSION}) set(PROTO_FILE "${PADDLE_SOURCE_DIR}/paddle/phi/core/external_error.proto") @@ -817,19 +815,19 @@ list( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/sparse_attention_kernel.cu) file( - GLOB_RECURSE CC_SRCS - RELATIVE ${CMAKE_SOURCE_DIR} - # 增加ap依赖项 - #${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/ap_variadic_kernel.cc - #${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/kernel_dispatch_helper.cc - #${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/ap_infer_meta_helper.cc - #${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/kernel_define_helper.cc + GLOB + AP_CC_SRCS ${PADDLE_SOURCE_DIR}/paddle/ap/src/axpr/*.cc ${PADDLE_SOURCE_DIR}/paddle/ap/src/fs/*.cc ${PADDLE_SOURCE_DIR}/paddle/ap/src/code_module/*.cc ${PADDLE_SOURCE_DIR}/paddle/ap/src/code_gen/*.cc ${PADDLE_SOURCE_DIR}/paddle/ap/src/kernel_dispatch/*.cc - ${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/*.cc + ${PADDLE_SOURCE_DIR}/paddle/ap/src/paddle/phi/*.cc) + +file( + GLOB_RECURSE CC_SRCS + RELATIVE ${CMAKE_SOURCE_DIR} + ${AP_CC_SRCS} runtime/runtime.cc runtime/iluvatar_context.cc common/*.cc From fb78dbb580d74aef416ca1ab945d08e8e7f3a73b Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 14 Apr 2026 10:43:32 +0800 Subject: [PATCH 4/6] Format codes. --- .../apy/device/compile_command_util.py | 6 +- .../cutlass_patch/batched_matrix_coord.h | 6 +- ...1_epilogue_vectorized_perwarp_variadic.hpp | 400 ++++++++++-------- .../thread/linear_combination_variadic.h | 44 +- backends/iluvatar_gpu/apy/matmul/matmul.h | 8 +- backends/iluvatar_gpu/apy/matmul/params.h | 9 +- backends/iluvatar_gpu/test_ap/set_env_ap.sh | 14 + .../test_ap/test_matmul_epilogue.py | 42 +- 8 files changed, 299 insertions(+), 230 deletions(-) diff --git a/backends/iluvatar_gpu/apy/device/compile_command_util.py b/backends/iluvatar_gpu/apy/device/compile_command_util.py index 18741ca805..e0037f84ea 100644 --- a/backends/iluvatar_gpu/apy/device/compile_command_util.py +++ b/backends/iluvatar_gpu/apy/device/compile_command_util.py @@ -20,7 +20,7 @@ def __init__(self): self.file_ext = "cu" self.op_type2generate_func = ap.OrderedDict( [ - ['matmul', self.generate_matmul_compile_command], + ["matmul", self.generate_matmul_compile_command], ] ) @@ -40,9 +40,7 @@ def generate_matmul_compile_command(self, tpl_dirname, library_name): compile_cmd + " -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -DCUTLASS_DEBUG_TRACE_LEVEL=0 -DCUTLASS_ILUVATAR" ) - compile_cmd = ( - compile_cmd + " -DAP_ENABLE_AUTOTUNE=0 -DAP_ENABLE_DEBUG=0" - ) + compile_cmd = compile_cmd + " -DAP_ENABLE_AUTOTUNE=0 -DAP_ENABLE_DEBUG=0" compile_cmd = ( compile_cmd + f" --shared {library_name}.{self.file_ext} -o lib{library_name}.so" diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h index 544670b8ad..11c08461eb 100644 --- a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/batched_matrix_coord.h @@ -28,10 +28,12 @@ struct BatchedMatrixCoord { BatchedMatrixCoord() : batch(0), row(0), column(0), is_valid(false) {} CUTLASS_HOST_DEVICE - BatchedMatrixCoord(int b, int r, int c) : batch(b), row(r), column(c), is_valid(true) {} + BatchedMatrixCoord(int b, int r, int c) + : batch(b), row(r), column(c), is_valid(true) {} CUTLASS_HOST_DEVICE - BatchedMatrixCoord(int b, int r, int c, bool valid) : batch(b), row(r), column(c), is_valid(valid) {} + BatchedMatrixCoord(int b, int r, int c, bool valid) + : batch(b), row(r), column(c), is_valid(valid) {} }; }; // namespace cutlass diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp index a19167bf3a..e714983847 100644 --- a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp @@ -1,12 +1,12 @@ /*************************************************************************************************** - * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation @@ -18,14 +18,15 @@ * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ /*! \file @@ -34,9 +35,8 @@ #pragma once -#include "cutlass/cutlass.h" - #include "cute/tensor.hpp" +#include "cutlass/cutlass.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -46,28 +46,28 @@ namespace collective { ///////////////////////////////////////////////////////////////////////////////////////////////// template < - // class StrideC, - class StrideD, - class ThreadEpilogueOp, - class EpilogueTile, - class SmemLayout, - class CopyAtomG2R_, - class CopyAtomR2S, - class TiledCopyS2R, - class CopyAtomR2R, - class CopyAtomR2G, - class EpilogueScheduleType = EpilogueSimtVectorized, - class Enable = void -> + // class StrideC, + class StrideD, + class ThreadEpilogueOp, + class EpilogueTile, + class SmemLayout, + class CopyAtomG2R_, + class CopyAtomR2S, + class TiledCopyS2R, + class CopyAtomR2R, + class CopyAtomR2G, + class EpilogueScheduleType = EpilogueSimtVectorized, + class Enable = void> class EpilogueVariadic { - static_assert(cute::is_same_v || - cute::is_same_v, - "Could not find an epilogue specialization."); + static_assert( + cute::is_same_v || + cute::is_same_v, + "Could not find an epilogue specialization."); }; ///////////////////////////////////////////////////////////////////////////////////////////////// /// Iluvatar ix11/ix30 Epilogue Vectorized -/// 1. Load C(optional), compute D = alpha * Acc + beta * C, +/// 1. Load C(optional), compute D = alpha * Acc + beta * C, /// then transform to ElementD(8/16-bit typically) /// 2. Copy D from Register to Smem /// 3. Load D from Smem to Register @@ -76,35 +76,32 @@ class EpilogueVariadic { /// /// Support alpha and beta params, bias not supported yet. template < - // class StrideC_, - class StrideD_, - class ThreadEpilogueOp_, - class EpilogueTile_, - class SmemLayout_, - class CopyAtomG2R_, - class CopyAtomR2S_, - class TiledCopyS2R_, - class CopyAtomR2R_, - class CopyAtomR2G_, - class EpilogueScheduleType_ -> + // class StrideC_, + class StrideD_, + class ThreadEpilogueOp_, + class EpilogueTile_, + class SmemLayout_, + class CopyAtomG2R_, + class CopyAtomR2S_, + class TiledCopyS2R_, + class CopyAtomR2R_, + class CopyAtomR2G_, + class EpilogueScheduleType_> class EpilogueVariadic< - // StrideC_, - StrideD_, - ThreadEpilogueOp_, - EpilogueTile_, - SmemLayout_, - CopyAtomG2R_, - CopyAtomR2S_, - TiledCopyS2R_, - CopyAtomR2R_, - CopyAtomR2G_, - EpilogueScheduleType_, - cute::enable_if_t< - cute::is_same_v - > - > { -public: + // StrideC_, + StrideD_, + ThreadEpilogueOp_, + EpilogueTile_, + SmemLayout_, + CopyAtomG2R_, + CopyAtomR2S_, + TiledCopyS2R_, + CopyAtomR2R_, + CopyAtomR2G_, + EpilogueScheduleType_, + cute::enable_if_t< + cute::is_same_v>> { + public: // // Type Aliases // @@ -116,28 +113,33 @@ class EpilogueVariadic< using ElementOutput = typename ThreadEpilogueOp::ElementOutput; using ElementD = typename ThreadEpilogueOp::ElementOutput; using StrideD = StrideD_; - using ElementC = ElementD; // for GemmUniversal + using ElementC = ElementD; // for GemmUniversal using StrideC = StrideD; - using ElementBias = typename detail::IsThreadEpilogueOpWithBias::type; + using ElementBias = + typename detail::IsThreadEpilogueOpWithBias::type; using EpilogueTile = EpilogueTile_; - using SmemLayout = SmemLayout_; + using SmemLayout = SmemLayout_; using CopyAtomG2R = CopyAtomG2R_; - using CopyAtomR2S = CopyAtomR2S_; + using CopyAtomR2S = CopyAtomR2S_; using TiledCopyS2R = TiledCopyS2R_; - using CopyAtomR2R = CopyAtomR2R_; - using CopyAtomR2G = CopyAtomR2G_; + using CopyAtomR2R = CopyAtomR2R_; + using CopyAtomR2G = CopyAtomR2G_; // using GmemTiledCopyC = void; using GmemTiledCopyD = CopyAtomR2G; - static constexpr bool IsEpilogueBiasSupported = detail::IsThreadEpilogueOpWithBias::value; - using StrideBias = cute::conditional_t(), Stride<_1,_0,int64_t>, Stride<_0,_1,int64_t>>; + static constexpr bool IsEpilogueBiasSupported = + detail::IsThreadEpilogueOpWithBias::value; + using StrideBias = cute::conditional_t(), + Stride<_1, _0, int64_t>, + Stride<_0, _1, int64_t>>; - // static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); - static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + // static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, + // L]"); + static_assert(cute::rank(StrideD{}) == 3, + "StrideCD must be rank-3: [M, N, L]"); - struct SharedStorage - { + struct SharedStorage { cute::array_aligned> smem_epilogue; }; @@ -151,7 +153,7 @@ class EpilogueVariadic< }; // Device side epilogue params - template + template struct ParamsType { typename ThreadEpiOp::Params epilogue_op{}; ElementD* ptr_D = nullptr; @@ -165,12 +167,10 @@ class EpilogueVariadic< // template - static constexpr Params - to_underlying_arguments( + static constexpr Params to_underlying_arguments( [[maybe_unused]] ProblemShape const& _, Arguments const& args, [[maybe_unused]] void* workspace) { - return { args.epilogue_op, args.ptr_D, @@ -179,65 +179,67 @@ class EpilogueVariadic< } template - static size_t - get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + static size_t get_workspace_size(ProblemShape const& problem_shape, + Arguments const& args) { return 0; } template - static cutlass::Status - initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, - CudaHostAdapter* cuda_adapter = nullptr) { + static cutlass::Status initialize_workspace( + ProblemShape const& problem_shape, + Arguments const& args, + void* workspace, + cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { return cutlass::Status::kSuccess; } template - static bool - can_implement( - [[maybe_unused]] ProblemShape const& problem_shape, - [[maybe_unused]] Arguments const& args) { + static bool can_implement([[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { return true; } CUTLASS_HOST_DEVICE EpilogueVariadic(Params const& params_) - : params(params_), epilogue_op(params_.epilogue_op) { } + : params(params_), epilogue_op(params_.epilogue_op) {} CUTLASS_DEVICE - bool - is_source_needed() { - return epilogue_op.is_source_needed(); - } - - template< - class ProblemShapeMNKL, - class BlockShapeMNK, - class BlockCoordMNKL, - class FrgEngine, class FrgLayout, - class TiledMma, - class ResidueMNK - > - CUTLASS_DEVICE void - operator()( - ProblemShapeMNKL problem_shape_mnkl, - BlockShapeMNK blk_shape_MNK, - BlockCoordMNKL blk_coord_mnkl, - cute::Tensor const& accumulators, // (MMA,MMA_M,MMA_N) - TiledMma tiled_mma, - ResidueMNK residue_mnk, - int thread_idx, - char* smem_buf) { + bool is_source_needed() { return epilogue_op.is_source_needed(); } + + template + CUTLASS_DEVICE void operator()(ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor const& + accumulators, // (MMA,MMA_M,MMA_N) + TiledMma tiled_mma, + ResidueMNK residue_mnk, + int thread_idx, + char* smem_buf) { using namespace cute; using X = Underscore; - static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); - static_assert(is_static::value, "ThreadBlock tile shape must be static"); - static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); - static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3"); - - static_assert(rank(EpilogueTile{}) == 2, "Rank of EpilogueTile should be 2"); + static_assert(cute::rank(ProblemShapeMNKL{}) == 4, + "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, + "ThreadBlock tile shape must be static"); + static_assert(cute::rank(BlockShapeMNK{}) == 3, + "BlockShapeMNK must be rank 3"); + static_assert(cute::rank(BlockCoordMNKL{}) == 4, + "BlockCoordMNKL must be rank 3"); + + static_assert(rank(EpilogueTile{}) == 2, + "Rank of EpilogueTile should be 2"); static_assert(rank(SmemLayout{}) == 2, "Rank of SmemLayout should be 2"); - CUTE_STATIC_ASSERT(size(SmemLayout{}) == size<0>(EpilogueTile{}) * size<1>(EpilogueTile{})); + CUTE_STATIC_ASSERT(size(SmemLayout{}) == + size<0>(EpilogueTile{}) * size<1>(EpilogueTile{})); // Separate out problem shape for convenience auto M = get<0>(problem_shape_mnkl); @@ -245,19 +247,28 @@ class EpilogueVariadic< auto L = get<3>(problem_shape_mnkl); // Represent the full output tensor - // Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), params.dC); // (m,n,l) - Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), params.dD); // (m,n,l) - // Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_Bias), make_shape(M,N,L), params.dBias); // (m,n,l) - - // Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) - Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) - // Tensor gBias_mnl = local_tile(mBias_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + // Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), + // make_shape(M,N,L), params.dC); // (m,n,l) + Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), + make_shape(M, N, L), + params.dD); // (m,n,l) + // Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_Bias), + // make_shape(M,N,L), params.dBias); // (m,n,l) + + // Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), + // Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gD_mnl = local_tile(mD_mnl, + blk_shape_MNK, + make_coord(_, _, _), + Step<_1, _1, X>{}); // (BLK_M,BLK_N,m,n,l) + // Tensor gBias_mnl = local_tile(mBias_mnl, blk_shape_MNK, + // make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) // Slice to get the tile this CTA is responsible for auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; - // Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) - Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) - // Tensor gBias = gBias_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + // Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_, _, m_coord, n_coord, l_coord); // (BLK_M,BLK_N) + // Tensor gBias = gBias_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) // Construct a tensor in SMEM that we can partition for rearranging data SharedStorage& storage = *reinterpret_cast(smem_buf); @@ -268,52 +279,68 @@ class EpilogueVariadic< auto layout_gd_tv = tiled_mma.thrfrg_C(make_layout(gD.shape())); // G2R / R2S ------------------------------------- - // In G2R / R2S tiling, we assemble the final V->MN layout out from given epilogue tile and mma layout + // In G2R / R2S tiling, we assemble the final V->MN layout out from given + // epilogue tile and mma layout // epilogue tile of logical C - auto tE_D = flat_divide(make_layout(product_each(gD.shape())), EpilogueTile{}); - + auto tE_D = + flat_divide(make_layout(product_each(gD.shape())), EpilogueTile{}); + using EpilogueTileSize = decltype(product_each(shape(EpilogueTile{}))); - Tensor sAcc = make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{}); + Tensor sAcc = + make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{}); - auto r2s_mn_tiler_tv = take<0,2>(right_inverse(layout_gd_tv).compose(tE_D)); // epilogue logical mn->tv + auto r2s_mn_tiler_tv = take<0, 2>( + right_inverse(layout_gd_tv).compose(tE_D)); // epilogue logical mn->tv // get tiler of TV from tiler of MN auto r2s_tv_tiler_shape = product_each(layout_gd_tv.shape()); - auto r2s_tv_zeros = repeat_like(r2s_tv_tiler_shape, Int<0>{}); // T and V - auto r2s_tv_tiler = cute::transform(make_seq{}, [&](auto i) { - auto tiler_origin = filter(composition(make_layout(r2s_tv_tiler_shape, replace(r2s_tv_zeros, Int<1>{})), r2s_mn_tiler_tv)); - auto complemented = complement(tiler_origin, get(r2s_tv_tiler_shape)); - return cute::layout<1>( - // divide complement to make it monotonic - zipped_divide(make_layout(shape(r2s_tv_tiler_shape)), complemented)); - }); - - auto r2s_TV_D_tiler = left_inverse(take<0,2>(tE_D)).compose( // tv -> mn in epilogue tiler - layout_gd_tv.compose( // tv -> mn in C - make_layout(r2s_tv_tiler_shape).compose(r2s_tv_tiler))); // tv_tiler -> tv coord + auto r2s_tv_zeros = repeat_like(r2s_tv_tiler_shape, Int<0>{}); // T and V + auto r2s_tv_tiler = + cute::transform(make_seq{}, [&](auto i) { + auto tiler_origin = filter( + composition(make_layout(r2s_tv_tiler_shape, + replace(r2s_tv_zeros, Int<1>{})), + r2s_mn_tiler_tv)); + auto complemented = + complement(tiler_origin, get(r2s_tv_tiler_shape)); + return cute::layout<1>( + // divide complement to make it monotonic + zipped_divide(make_layout(shape(r2s_tv_tiler_shape)), + complemented)); + }); + + auto r2s_TV_D_tiler = + left_inverse(take<0, 2>(tE_D)) + .compose( // tv -> mn in epilogue tiler + layout_gd_tv.compose( // tv -> mn in C + make_layout(r2s_tv_tiler_shape) + .compose(r2s_tv_tiler))); // tv_tiler -> tv coord // G2R ---------------------------------------- - using TiledCopyG2R = TiledCopy; + using TiledCopyG2R = + TiledCopy; auto thread_g2r = TiledCopyG2R::get_slice(thread_idx); // auto tEgD = flat_divide(gC, EpilogueTile{}); // auto tGR_gC = thread_g2r.partition_S(tEgD); - auto cC = make_identity_tensor(make_shape(size<0>(gD), size<1>(gD))); // same shape as gC/gD - auto cCt = flat_divide(cC, EpilogueTile{}); + auto cC = make_identity_tensor( + make_shape(size<0>(gD), size<1>(gD))); // same shape as gC/gD + auto cCt = flat_divide(cC, EpilogueTile{}); auto tRS_cC = thread_g2r.partition_S(cCt); - + // R2S ---------------------------------------- - using TiledCopyR2S = TiledCopy; + using TiledCopyR2S = + TiledCopy; auto tiled_r2s = TiledCopyR2S{}; - auto tile_RS_R = - cute::layout<1>( // just V - make_layout( - make_shape(typename TiledCopyR2S::TiledNumThr{},typename TiledCopyR2S::TiledNumVal{}),make_stride(_0{},_1{})) - .compose(group<1,3>( // V_copy -> V_fragment - right_inverse(layout_gd_tv) // TV_copy -> TV_fragment - .compose(tiled_r2s.tidfrg_S(tE_D))))) - (_,repeat>(_)); // TV_copy -> MN + auto tile_RS_R = cute::layout<1>( // just V + make_layout(make_shape(typename TiledCopyR2S::TiledNumThr{}, + typename TiledCopyR2S::TiledNumVal{}), + make_stride(_0{}, _1{})) + .compose(group<1, 3>( // V_copy -> V_fragment + right_inverse(layout_gd_tv) // TV_copy -> TV_fragment + .compose(tiled_r2s.tidfrg_S(tE_D)))))( + _, repeat>(_)); // TV_copy -> MN auto thread_r2s = tiled_r2s.get_slice(thread_idx); auto tErAcc = make_tensor(accumulators.data(), tile_RS_R); @@ -326,17 +353,23 @@ class EpilogueVariadic< auto tSR_rAcc = make_tensor(make_layout(tSR_sAcc.shape())); // R2R -------------------------------------------- - auto tiled_r2r = - make_tiled_copy(CopyAtomR2R{},typename CopyAtomR2R::ThrID{},make_layout(typename TiledCopyR2S::TiledNumVal{})); + auto tiled_r2r = + make_tiled_copy(CopyAtomR2R{}, + typename CopyAtomR2R::ThrID{}, + make_layout(typename TiledCopyR2S::TiledNumVal{})); auto thread_r2r = tiled_r2r.get_slice(thread_idx); auto tRR_rAcc = make_tensor(tSR_rAcc.shape()); auto tRR_rSrc = thread_r2r.retile_S(tSR_rAcc); auto tRR_rDst = thread_r2r.retile_D(tRR_rAcc); // R2G ------------------------------------------- - auto r2r_vlayout = filter(cute::layout<1>(tiled_r2r.get_layoutS_TV()).compose(cute::layout<1>(tiled_r2r.get_layoutD_TV()))); - auto r2g_tv_layout = (typename TiledCopyS2R::TiledLayout_TV{}).compose(_,r2r_vlayout); - auto tiled_r2g = TiledCopy{}; + auto r2r_vlayout = + filter(cute::layout<1>(tiled_r2r.get_layoutS_TV()) + .compose(cute::layout<1>(tiled_r2r.get_layoutD_TV()))); + auto r2g_tv_layout = + (typename TiledCopyS2R::TiledLayout_TV{}).compose(_, r2r_vlayout); + auto tiled_r2g = + TiledCopy{}; auto thread_r2g = tiled_r2g.get_slice(thread_idx); auto tRG_rAcc = thread_r2g.retile_S(tRR_rDst); @@ -344,8 +377,10 @@ class EpilogueVariadic< auto tRG_gD = thread_r2g.partition_D(tEgD); // Repeat the D-partitioning for coordinates and predication - Tensor cD = make_identity_tensor(make_shape(size<0>(gD),size<1>(gD))); // (BLK_M,BLK_N) -> (blk_m,blk_n) - Tensor cDt = flat_divide(cD, EpilogueTile{}); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + Tensor cD = make_identity_tensor(make_shape( + size<0>(gD), size<1>(gD))); // (BLK_M,BLK_N) -> (blk_m,blk_n) + Tensor cDt = + flat_divide(cD, EpilogueTile{}); // (SMEM_M,SMEM_N,TILE_M,TILE_N) Tensor tRG_cD = thread_r2g.partition_D(cDt); #if 0 @@ -380,38 +415,42 @@ class EpilogueVariadic< for (int epi_tile_m = 0; epi_tile_m < size<2>(tEgD).value; ++epi_tile_m) { CUTLASS_PRAGMA_UNROLL for (int epi_tile_n = 0; epi_tile_n < size<3>(tEgD).value; ++epi_tile_n) { - Tensor tRS_rAccmn = tErAcc(_,_,_,epi_tile_m,epi_tile_n); // ((2, (2, 2)), 1, 1) - Tensor cC_mn = tRS_cC(_,_,_,epi_tile_m, epi_tile_n); + Tensor tRS_rAccmn = + tErAcc(_, _, _, epi_tile_m, epi_tile_n); // ((2, (2, 2)), 1, 1) + Tensor cC_mn = tRS_cC(_, _, _, epi_tile_m, epi_tile_n); Tensor tRS_rD = make_tensor_like(tRS_rAccmn); - + int m_cta_coord_base = m_coord * size<0>(gD); int n_cta_coord_base = n_coord * size<1>(gD); CUTLASS_PRAGMA_UNROLL for (int i = 0; i < size(tRS_rAccmn); ++i) { auto cta_coord = cC_mn(i); - tRS_rD(i) = epilogue_op(tRS_rAccmn(i), - l_coord, - m_cta_coord_base + get<0>(cta_coord), - n_cta_coord_base + get<1>(cta_coord), - elem_less(cta_coord, take<0,2>(residue_mnk))); + tRS_rD(i) = + epilogue_op(tRS_rAccmn(i), + l_coord, + m_cta_coord_base + get<0>(cta_coord), + n_cta_coord_base + get<1>(cta_coord), + elem_less(cta_coord, take<0, 2>(residue_mnk))); } copy(CopyAtomR2S{}, tRS_rD, tRS_sAcc); copy(TiledCopyS2R{}, tSR_sAcc, tSR_rAcc); copy(CopyAtomR2R{}, tRR_rSrc, tRR_rDst); - Tensor tRG_gDmn = tRG_gD(_,_,_,epi_tile_m,epi_tile_n); - Tensor tRG_cDmn = tRG_cD(_,_,_,epi_tile_m,epi_tile_n); + Tensor tRG_gDmn = tRG_gD(_, _, _, epi_tile_m, epi_tile_n); + Tensor tRG_cDmn = tRG_cD(_, _, _, epi_tile_m, epi_tile_n); CUTLASS_PRAGMA_UNROLL - for (int atom_i = 0; atom_i < size<0,1>(tRR_rAcc); ++atom_i) { + for (int atom_i = 0; atom_i < size<0, 1>(tRR_rAcc); ++atom_i) { CUTLASS_PRAGMA_UNROLL for (int m = 0; m < size<1>(tRR_rAcc); ++m) { CUTLASS_PRAGMA_UNROLL for (int n = 0; n < size<2>(tRR_rAcc); ++n) { - - if (elem_less(tRG_cDmn(make_coord(0,atom_i),m,n), take<0,2>(residue_mnk))) { - copy(CopyAtomR2G{}, tRG_rAcc(make_coord(_,atom_i),m,n), tRG_gDmn(make_coord(_,atom_i),m,n)); + if (elem_less(tRG_cDmn(make_coord(0, atom_i), m, n), + take<0, 2>(residue_mnk))) { + copy(CopyAtomR2G{}, + tRG_rAcc(make_coord(_, atom_i), m, n), + tRG_gDmn(make_coord(_, atom_i), m, n)); } } } @@ -420,16 +459,15 @@ class EpilogueVariadic< } } -private: + private: Params params; ThreadEpilogueOp epilogue_op; }; - ///////////////////////////////////////////////////////////////////////////////////////////////// -} // namespace collective -} // namespace epilogue -} // namespace cutlass +} // namespace collective +} // namespace epilogue +} // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h index 1acd9cd667..ffda0b076b 100644 --- a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/thread/linear_combination_variadic.h @@ -24,7 +24,6 @@ #include "cutlass/functional.h" #include "cutlass/numeric_conversion.h" #include "cutlass/numeric_types.h" - #include "cutlass_patch/batched_matrix_coord.h" #include "cutlass_patch/trace_device.h" @@ -330,16 +329,18 @@ class LinearCombinationVariadic { // Specializations for scalar (for use with cute::collective::DefaultEpilogue) // CUTLASS_HOST_DEVICE - ElementD operator()(ElementAccumulator const accumulator, + ElementD operator()(ElementAccumulator const accumulator, ElementC const source, int batch, int row_offset, int column_offset, bool valid) const { - - // Convert everything to Compute type, do compute, and then store to output type - NumericConverter accumulator_converter; - [[maybe_unused]] NumericConverter source_converter; + // Convert everything to Compute type, do compute, and then store to output + // type + NumericConverter + accumulator_converter; + [[maybe_unused]] NumericConverter + source_converter; NumericConverter destination_converter; // Convert to destination numeric type @@ -357,16 +358,17 @@ class LinearCombinationVariadic { if constexpr (Scale == ScaleType::NoBetaScaling) { intermediate = source_converter(source); - } - else { - intermediate = multiply(params_.beta, source); // X = beta * C + uniform + } else { + intermediate = multiply(params_.beta, source); // X = beta * C + uniform } - intermediate = madd(params_.alpha, converted_accumulator, intermediate); // D = alpha * Accum + X + intermediate = madd(params_.alpha, + converted_accumulator, + intermediate); // D = alpha * Accum + X intermediate = variadic_op( - intermediate, - params_.variadic_args, - BatchedMatrixCoord(batch, row_offset, column_offset, valid)); + intermediate, + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset, valid)); return destination_converter(intermediate); } @@ -376,9 +378,10 @@ class LinearCombinationVariadic { int row_offset, int column_offset, bool valid) const { - - // Convert everything to Compute type, do compute, and then store to output type - NumericConverter accumulator_converter; + // Convert everything to Compute type, do compute, and then store to output + // type + NumericConverter + accumulator_converter; NumericConverter destination_converter; ElementCompute converted_accumulator = accumulator_converter(accumulator); @@ -392,14 +395,13 @@ class LinearCombinationVariadic { multiplies multiply; VariadicOp variadic_op; - intermediate = multiply(params_.alpha, accumulator); // D = alpha * Accum + intermediate = multiply(params_.alpha, accumulator); // D = alpha * Accum intermediate = variadic_op( - intermediate, - params_.variadic_args, - BatchedMatrixCoord(batch, row_offset, column_offset, valid)); + intermediate, + params_.variadic_args, + BatchedMatrixCoord(batch, row_offset, column_offset, valid)); return destination_converter(intermediate); } - }; } // namespace thread diff --git a/backends/iluvatar_gpu/apy/matmul/matmul.h b/backends/iluvatar_gpu/apy/matmul/matmul.h index dcd8b5caae..d2794859f0 100644 --- a/backends/iluvatar_gpu/apy/matmul/matmul.h +++ b/backends/iluvatar_gpu/apy/matmul/matmul.h @@ -14,14 +14,14 @@ #pragma once -#include -#include -#include - #include #include #include +#include +#include +#include + #define CHECK_CUDA(func) \ { \ cudaError_t err = func; \ diff --git a/backends/iluvatar_gpu/apy/matmul/params.h b/backends/iluvatar_gpu/apy/matmul/params.h index 035ceb01a0..6be8a242b6 100644 --- a/backends/iluvatar_gpu/apy/matmul/params.h +++ b/backends/iluvatar_gpu/apy/matmul/params.h @@ -71,7 +71,7 @@ struct GemmEpilogueParams { const void *bias; void *output; - cudaStream_t* stream_ptr; + cudaStream_t *stream_ptr; std::vector input0_shape; std::vector input1_shape; @@ -81,7 +81,7 @@ struct GemmEpilogueParams { std::vector> epilogue_out_shapes; GemmEpilogueParams() {} - GemmEpilogueParams(void* stream_ptr, + GemmEpilogueParams(void *stream_ptr, const void *input, const void *weight, const void *bias, @@ -91,7 +91,7 @@ struct GemmEpilogueParams { const std::vector &bias_shape, bool transpose_a = false, bool transpose_b = false) - : stream_ptr(reinterpret_cast(stream_ptr)), + : stream_ptr(reinterpret_cast(stream_ptr)), input(input), weight(weight), bias(bias), @@ -136,7 +136,8 @@ struct GemmEpilogueParams { std::cout << "-- [GemmEpilogueParams] weight: " << weight << std::endl; std::cout << "-- [GemmEpilogueParams] bias: " << bias << std::endl; std::cout << "-- [GemmEpilogueParams] output: " << output << std::endl; - std::cout << "-- [GemmEpilogueParams] stream_str: " << stream_str << std::endl; + std::cout << "-- [GemmEpilogueParams] stream_str: " << stream_str + << std::endl; #endif shape_args.batch_stride_A = m * k; diff --git a/backends/iluvatar_gpu/test_ap/set_env_ap.sh b/backends/iluvatar_gpu/test_ap/set_env_ap.sh index fd7430883f..ac011cb74a 100644 --- a/backends/iluvatar_gpu/test_ap/set_env_ap.sh +++ b/backends/iluvatar_gpu/test_ap/set_env_ap.sh @@ -1,5 +1,19 @@ #!/bin/bash +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + export FLAGS_prim_enable_dynamic=true export FLAGS_prim_all=true diff --git a/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py b/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py index 7bc3bbd3ce..63ea7bc6f9 100644 --- a/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py +++ b/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py @@ -1,7 +1,19 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os -import subprocess import unittest -import time import numpy as np @@ -13,7 +25,7 @@ # os.environ["AP_WORKSPACE_DIR"] = "/tmp/paddle/ap" -DT = 'float16' +DT = "float16" # BS = 4 # MS = 65536 # NS = 32 @@ -39,6 +51,7 @@ NS = 192 KS = 768 + class TestMatmulEpilogue(unittest.TestCase): def setUp(self): dtype = DT @@ -70,7 +83,6 @@ def setUp(self): self.mask = paddle.randn(mask_shape, dtype=dtype) self.mask.stop_gradient = True - def get_subgraph(self): B = pct.DimVar(BS) M = pct.DimVar(MS) @@ -90,7 +102,7 @@ def matmul_add_act( # return paddle.nn.functional.sigmoid(out) return paddle.nn.functional.relu(out) - #return matmul_add_act + # return matmul_add_act def matmul_add_divide_multipy_add_S1( x: pct.Tensor([B, M, K], T), @@ -104,7 +116,7 @@ def matmul_add_divide_multipy_add_S1( # out = out / 1.2 out = out * mask return residual + out - + return matmul_add_divide_multipy_add_S1 def test_subgraph(self): @@ -117,8 +129,7 @@ def test_subgraph(self): # dy_outs = foo(self.x, self.y, self.b, self.b1) ap_outs = fused_foo(self.x, self.y, self.bias, self.residual, self.mask) dy_outs = foo(self.x, self.y, self.bias, self.residual, self.mask) - #return - + # return # -------- 性能测试部分 -------- iters = 10 @@ -139,7 +150,7 @@ def test_subgraph(self): with profiler.Profiler( targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU], on_trace_ready=profiler.export_chrome_tracing("./profiler_log"), - timer_only = False + timer_only=False, ) as prof: for step in range(iters): # _ = fused_foo(self.x, self.y, self.b, self.b1) @@ -147,13 +158,16 @@ def test_subgraph(self): # _ = foo(self.x, self.y, self.b, self.b1) prof.step() print("[Profiler] Trace saved to ./profiler_log") - prof.summary(sorted_by=profiler.SortedKeys.GPUTotal, - op_detail=True, - thread_sep=False, - time_unit='us') - + prof.summary( + sorted_by=profiler.SortedKeys.GPUTotal, + op_detail=True, + thread_sep=False, + time_unit="us", + ) + for dy_out, ap_out in zip(dy_outs, ap_outs): np.testing.assert_allclose(dy_out, ap_out, rtol=5e-2, atol=1e-1) + if __name__ == "__main__": unittest.main() From 4ff9c62231ffd7806b0a0cc276b16e0c04089444 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 14 Apr 2026 13:25:04 +0800 Subject: [PATCH 5/6] Move test to the unittest directory. --- ...1_epilogue_vectorized_perwarp_variadic.hpp | 32 +---- backends/iluvatar_gpu/test_ap/set_env_ap.sh | 41 ------ .../test_ap_matmul_epilogue_iluvatar.py} | 117 ++++++++---------- 3 files changed, 51 insertions(+), 139 deletions(-) delete mode 100644 backends/iluvatar_gpu/test_ap/set_env_ap.sh rename backends/iluvatar_gpu/{test_ap/test_matmul_epilogue.py => tests/unittests/test_ap_matmul_epilogue_iluvatar.py} (59%) diff --git a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp index e714983847..269399222e 100644 --- a/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp +++ b/backends/iluvatar_gpu/apy/matmul/cutlass_patch/epilogue/collective/ix11_epilogue_vectorized_perwarp_variadic.hpp @@ -201,7 +201,7 @@ class EpilogueVariadic< } CUTLASS_HOST_DEVICE - EpilogueVariadic(Params const& params_) + explicit EpilogueVariadic(Params const& params_) : params(params_), epilogue_op(params_.epilogue_op) {} CUTLASS_DEVICE @@ -223,7 +223,7 @@ class EpilogueVariadic< ResidueMNK residue_mnk, int thread_idx, char* smem_buf) { - using namespace cute; + using namespace cute; // NOLINT using X = Underscore; static_assert(cute::rank(ProblemShapeMNKL{}) == 4, @@ -383,34 +383,6 @@ class EpilogueVariadic< flat_divide(cD, EpilogueTile{}); // (SMEM_M,SMEM_N,TILE_M,TILE_N) Tensor tRG_cD = thread_r2g.partition_D(cDt); -#if 0 - if (thread_idx == 0 && m_coord == 0 && n_coord == 0) { - print("aC : "); print(accumulators.layout()); print("\n"); - // print("gC : "); print(gC.layout()); print("\n"); - print("gD : "); print(gD.layout()); print("\n"); - // print("gBias : "); print(gBias.layout()); print("\n"); - print("sAcc : "); print(sAcc.layout()); print("\n"); - // print("rAcc : "); print(rAcc.layout()); print("\n"); - print("\n"); - // print("tRS_rAcc : "); print(tRS_rAcc.layout()); print("\n"); - print("tRS_sAcc : "); print(tRS_sAcc.layout()); print("\n"); - print("\n"); - print("tSR_sAcc : "); print(tSR_sAcc.layout()); print("\n"); - print("tSR_rAcc : "); print(tSR_rAcc.layout()); print("\n"); - print("\n"); - print("tRR_rSrc : "); print(tRR_rSrc.layout()); print("\n"); - print("tRR_rDst : "); print(tRR_rDst.layout()); print("\n"); - print("\n"); - print("tRG_rAcc : "); print(tRG_rAcc.layout()); print("\n"); - print("tRG_gD : "); print(tRG_gD.layout()); print("\n"); - print("tRS_cC : "); print(tRS_cC.layout()); print("\n"); - print("cCt : "); print(cCt.layout()); print("\n"); - print("cD : "); print(cD.layout()); print("\n"); - print("tE_D : "); print(tE_D.layout()); print("\n"); - print("sAcc : "); print(sAcc.layout()); print("\n"); - print("tErAcc : "); print(tErAcc.layout()); print("\n"); - } -#endif CUTLASS_PRAGMA_UNROLL for (int epi_tile_m = 0; epi_tile_m < size<2>(tEgD).value; ++epi_tile_m) { CUTLASS_PRAGMA_UNROLL diff --git a/backends/iluvatar_gpu/test_ap/set_env_ap.sh b/backends/iluvatar_gpu/test_ap/set_env_ap.sh deleted file mode 100644 index ac011cb74a..0000000000 --- a/backends/iluvatar_gpu/test_ap/set_env_ap.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -export FLAGS_prim_enable_dynamic=true -export FLAGS_prim_all=true - -# CINN related FLAG -export FLAGS_use_cinn=false -export FLAGS_group_schedule_tiling_first=true -# PIR mode -export FLAGS_enable_pir_api=true - -# print Program IR -export FLAGS_print_ir=true - -# debug log -export GLOG_v=0 -export GLOG_vmodule=ap_generic_drr_pass=6 - -export CUDA_VISIBLE_DEVICES=11 -export FLAGS_enable_ap=1 - -PADDLE_ROOT="${PADDLE_ROOT:-/path/to/your/paddle/build}" -export PYTHONPATH="${PADDLE_ROOT}/python:$PYTHONPATH" -export AP_WORKSPACE_DIR="/tmp/ap_workspace" -export AP_PATH="${PADDLE_ROOT}/python/paddle/apy/sys:${PADDLE_ROOT}/python/paddle/apy/matmul_pass:$AP_PATH" - -python test_matmul_epilogue.py 2>&1 | tee output.log diff --git a/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py b/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py similarity index 59% rename from backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py rename to backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py index 63ea7bc6f9..1b49c13a31 100644 --- a/backends/iluvatar_gpu/test_ap/test_matmul_epilogue.py +++ b/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py @@ -14,38 +14,25 @@ import os import unittest - import numpy as np +from pathlib import Path import paddle import paddle.incubate.cc as pcc import paddle.incubate.cc.typing as pct - import paddle.profiler as profiler -# os.environ["AP_WORKSPACE_DIR"] = "/tmp/paddle/ap" + +os.environ["AP_WORKSPACE_DIR"] = "/tmp/paddle/ap_workspace" + + +def GetPirProgram(fused_func, tensor_args): + dtypes = tuple(tensor.dtype for tensor in tensor_args) + func = fused_func.func_overload_ctx.dtypes2func.get(dtypes, None) + return str(func.infer_program.forward_program) + DT = "float16" -# BS = 4 -# MS = 65536 -# NS = 32 -# KS = 128 -# BS = 1 -# MS = 128 -# NS = 64 -# KS = 128 -# BS = 1 -# MS = 64 -# NS = 768 -# KS = 768 -# BS = 4 -# MS = 64 -# NS = 3072 -# KS = 768 -# BS = 4 -# MS = 128 -# NS = 32 -# KS = 128 BS = 4 MS = 784 NS = 192 @@ -67,10 +54,6 @@ def setUp(self): self.b = paddle.randn(b_shape, dtype=dtype) self.b.stop_gradient = True - b1_shape = [1] - self.b1 = paddle.randn(b1_shape, dtype=dtype) - self.b1.stop_gradient = True - bias_shape = [NS] self.bias = paddle.randn(bias_shape, dtype=dtype) self.bias.stop_gradient = True @@ -83,7 +66,7 @@ def setUp(self): self.mask = paddle.randn(mask_shape, dtype=dtype) self.mask.stop_gradient = True - def get_subgraph(self): + def get_matmul_add_act(self): B = pct.DimVar(BS) M = pct.DimVar(MS) K = pct.DimVar(KS) @@ -94,17 +77,22 @@ def matmul_add_act( x: pct.Tensor([B, M, K], T), y: pct.Tensor([K, N], T), b: pct.Tensor([B, M, N], T), - b1: pct.Tensor([1], T), ): out = paddle.matmul(x, y) out = out + b - # return paddle.nn.functional.sigmoid(out) return paddle.nn.functional.relu(out) - # return matmul_add_act + return matmul_add_act + + def get_matmul_add_divide_multipy_add(self): + B = pct.DimVar(BS) + M = pct.DimVar(MS) + K = pct.DimVar(KS) + N = pct.DimVar(NS) + T = pct.DTypeVar("T", DT) - def matmul_add_divide_multipy_add_S1( + def matmul_add_divide_multipy_add( x: pct.Tensor([B, M, K], T), y: pct.Tensor([K, N], T), bias: pct.Tensor([N], T), @@ -117,54 +105,47 @@ def matmul_add_divide_multipy_add_S1( out = out * mask return residual + out - return matmul_add_divide_multipy_add_S1 + return matmul_add_divide_multipy_add - def test_subgraph(self): - foo = self.get_subgraph() - fused_foo = pcc.compile( - foo, ap_path=f"{os.path.dirname(paddle.__file__)}/apy/matmul_pass" - ) + def check_if_ap_variadic_exist(self, fused_foo, foo_args): + generated_pir_program = GetPirProgram(fused_foo, foo_args) + assert ( + "pd_op.ap_variadic" in generated_pir_program + ), "AP fusion failed, none pd_op.ap_variadic found in the pir_program." - # ap_outs = fused_foo(self.x, self.y, self.b, self.b1) - # dy_outs = foo(self.x, self.y, self.b, self.b1) - ap_outs = fused_foo(self.x, self.y, self.bias, self.residual, self.mask) - dy_outs = foo(self.x, self.y, self.bias, self.residual, self.mask) - # return + def check_by_profiler(self, fused_foo, foo_args): + paddle.device.synchronize() - # -------- 性能测试部分 -------- iters = 10 - # warmup - # _ = fused_foo(self.x, self.y, self.b, self.b1) - # _ = foo(self.x, self.y, self.b, self.b1) - - # paddle.device.synchronize() - # start = time.time() - # # for _ in range(iters): - # # _ = fused_foo(self.x, self.y, self.b, self.b1) - # paddle.device.synchronize() - # end = time.time() - # avg_time = (end - start) / iters - # print(f"[Performance] Avg latency per run: {avg_time:.6f} s") - - # profiler (保存到 log_dir) with profiler.Profiler( targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU], on_trace_ready=profiler.export_chrome_tracing("./profiler_log"), timer_only=False, ) as prof: - for step in range(iters): - # _ = fused_foo(self.x, self.y, self.b, self.b1) - _ = fused_foo(self.x, self.y, self.bias, self.residual, self.mask) - # _ = foo(self.x, self.y, self.b, self.b1) + for _ in range(iters): + _ = fused_foo(*foo_args) prof.step() - print("[Profiler] Trace saved to ./profiler_log") - prof.summary( - sorted_by=profiler.SortedKeys.GPUTotal, - op_detail=True, - thread_sep=False, - time_unit="us", + prof.summary() + + def test_subgraph(self): + foo = self.get_matmul_add_act() + foo_args = (self.x, self.y, self.b) + + # foo = self.get_matmul_add_divide_multipy_add() + # foo_args = (self.x, self.y, self.bias, self.residual, self.mask) + + iluvatar_gpu_dir = Path(__file__).resolve().parent.parent.parent + fused_foo = pcc.compile( + foo, + ap_path=f"{iluvatar_gpu_dir}/apy/device", + backend_device="custom_device", ) + self.check_if_ap_variadic_exist(fused_foo, foo_args) + self.check_by_profiler(fused_foo, foo_args) + + ap_outs = fused_foo(*foo_args) + dy_outs = foo(*foo_args) for dy_out, ap_out in zip(dy_outs, ap_outs): np.testing.assert_allclose(dy_out, ap_out, rtol=5e-2, atol=1e-1) From 8156bf3268c48d02da2368af5bd2715392d4d327 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Wed, 15 Apr 2026 10:32:41 +0800 Subject: [PATCH 6/6] Update profiler. --- .../tests/unittests/test_ap_matmul_epilogue_iluvatar.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py b/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py index 1b49c13a31..ccb41f116a 100644 --- a/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py +++ b/backends/iluvatar_gpu/tests/unittests/test_ap_matmul_epilogue_iluvatar.py @@ -125,7 +125,12 @@ def check_by_profiler(self, fused_foo, foo_args): for _ in range(iters): _ = fused_foo(*foo_args) prof.step() - prof.summary() + prof.summary( + sorted_by=profiler.SortedKeys.GPUTotal, + op_detail=True, + thread_sep=False, + time_unit="us", + ) def test_subgraph(self): foo = self.get_matmul_add_act()