From 331e0c4227b74c699bfb84013e5a85d48a42965f Mon Sep 17 00:00:00 2001 From: zhangstevenunity <128771452+zhangstevenunity@users.noreply.github.com> Date: Wed, 27 May 2026 17:58:46 +0800 Subject: [PATCH] Drain MTE before tnotify --- docs/PTO_IR_manual.md | 1 + lib/PTO/Transforms/PTOToEmitC.cpp | 11 ++++ test/lit/pto/issue711_tnotify_mte_drain.pto | 57 +++++++++++++++++++++ 3 files changed, 69 insertions(+) create mode 100644 test/lit/pto/issue711_tnotify_mte_drain.pto diff --git a/docs/PTO_IR_manual.md b/docs/PTO_IR_manual.md index a32de72f7..e3d22352b 100644 --- a/docs/PTO_IR_manual.md +++ b/docs/PTO_IR_manual.md @@ -9370,6 +9370,7 @@ pto.comm.tget(%dst, %src, buf(%ping, %pong) : !pto.partition_tensor_view<128xf32 - `signal` must be a GM-shaped value with element type `i32`. - `value` / `cmpValue` must be signless integer scalars. +- `pto.comm.tnotify` lowering emits `pipe_barrier(PIPE_ALL)` before `pto::comm::TNOTIFY(...)` so prior MTE-side loads and stores are drained before the signal write. 
**Examples:** diff --git a/lib/PTO/Transforms/PTOToEmitC.cpp b/lib/PTO/Transforms/PTOToEmitC.cpp index b3f0c6bbd..0dea25612 100644 --- a/lib/PTO/Transforms/PTOToEmitC.cpp +++ b/lib/PTO/Transforms/PTOToEmitC.cpp @@ -6387,6 +6387,15 @@ static std::string notifyOpTok(pto::NotifyOp op) { return "pto::comm::NotifyOp::Set"; } +/* Emits a pipe_barrier call at `loc` through `rewriter`. The callee string is pipe_barrier and its single argument is the opaque attribute PIPE_ALL, so the generated code reads pipe_barrier(PIPE_ALL). The call is created with no result types and no SSA operands; the second ArrayAttr is left default-constructed (presumably the template-argument list of the emitted call op - TODO confirm against the op builder). */ static void emitPipeBarrierAll(ConversionPatternRewriter &rewriter, + Location loc) { + auto *ctx = rewriter.getContext(); + auto args = + rewriter.getArrayAttr({emitc::OpaqueAttr::get(ctx, "PIPE_ALL")}); + rewriter.create(loc, TypeRange{}, "pipe_barrier", args, + ArrayAttr{}, ValueRange{}); +} + static std::string waitCmpTok(pto::WaitCmp cmp) { switch (cmp) { case pto::WaitCmp::EQ: @@ -6641,6 +6650,8 @@ struct PTOSignalCommToEmitC : public OpConversionPattern { rewriter, op.getLoc(), notifyTy, notifyOpTok(op.getNotifyOp())); SmallVector operands{*signalGT, peelUnrealized(adaptor.getValue()), notifyOp}; + // TNOTIFY writes the signal on the scalar pipe; drain prior MTE work first. + emitPipeBarrierAll(rewriter, op.getLoc()); rewriter.create(op.getLoc(), TypeRange{}, callee, ArrayAttr{}, ArrayAttr{}, operands); rewriter.eraseOp(op); diff --git a/test/lit/pto/issue711_tnotify_mte_drain.pto b/test/lit/pto/issue711_tnotify_mte_drain.pto new file mode 100644 index 000000000..edb4d40b8 --- /dev/null +++ b/test/lit/pto/issue711_tnotify_mte_drain.pto @@ -0,0 +1,57 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// See LICENSE in the root of the software repository for the full text of the License. + +// RUN: ptoas --pto-arch=a3 %s -o - 2>&1 | FileCheck %s + +module { + func.func @tnotify_drains_mte_before_signal( + %src_ptr: !pto.ptr, + %dst_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1_i32 = arith.constant 1 : i32 + + %tile = pto.alloc_tile : + !pto.tile_buf + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view + %src = pto.partition_view %src_view, + offsets = [%c0, %c0], sizes = [%c1, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<1x32xf32> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view + %dst = pto.partition_view %dst_view, + offsets = [%c0, %c0], sizes = [%c1, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<1x32xf32> + + pto.tload ins(%src : !pto.partition_tensor_view<1x32xf32>) + outs(%tile : !pto.tile_buf) + pto.tstore ins(%tile : !pto.tile_buf) + outs(%dst : !pto.partition_tensor_view<1x32xf32>) + + %signal_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %signal = pto.partition_view %signal_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.tnotify(%signal, %c1_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } +} + +// CHECK-LABEL: AICORE void tnotify_drains_mte_before_signal( +// CHECK: TLOAD( +// CHECK: TSTORE( +// CHECK: pipe_barrier(PIPE_ALL); +// CHECK-NEXT: pto::comm::TNOTIFY(