Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/PTO_IR_manual.md
Original file line number Diff line number Diff line change
Expand Up @@ -9370,6 +9370,7 @@ pto.comm.tget(%dst, %src, buf(%ping, %pong) : !pto.partition_tensor_view<128xf32

- `signal` must be a GM-shaped value with element type `i32`.
- `value` / `cmpValue` must be signless integer scalars.
- The lowering of `pto.comm.tnotify` emits `pipe_barrier(PIPE_ALL)` immediately before `pto::comm::TNOTIFY(...)`, so that prior MTE-side loads and stores are drained before the signal is written.

**Examples:**

Expand Down
11 changes: 11 additions & 0 deletions lib/PTO/Transforms/PTOToEmitC.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6387,6 +6387,15 @@ static std::string notifyOpTok(pto::NotifyOp op) {
return "pto::comm::NotifyOp::Set";
}

/// Inserts an opaque EmitC call to `pipe_barrier` at the rewriter's current
/// insertion point, passing the single opaque token `PIPE_ALL` as its
/// attribute argument.
///
/// \param rewriter  Rewriter used to build the call and its attributes.
/// \param loc       Source location attached to the created call op.
///
/// The created call has no result types and no SSA operands.
static void emitPipeBarrierAll(ConversionPatternRewriter &rewriter,
                               Location loc) {
  // `PIPE_ALL` is emitted verbatim as an opaque attribute token.
  auto pipeAllTok = emitc::OpaqueAttr::get(rewriter.getContext(), "PIPE_ALL");
  auto barrierArgs = rewriter.getArrayAttr({pipeAllTok});

  // No results and no value operands; the fifth argument is left as a null
  // ArrayAttr (presumably the template-argument list - confirm against the
  // emitc::CallOpaqueOp builder).
  rewriter.create<emitc::CallOpaqueOp>(loc, TypeRange{}, "pipe_barrier",
                                       barrierArgs, ArrayAttr{}, ValueRange{});
}

static std::string waitCmpTok(pto::WaitCmp cmp) {
switch (cmp) {
case pto::WaitCmp::EQ:
Expand Down Expand Up @@ -6641,6 +6650,8 @@ struct PTOSignalCommToEmitC : public OpConversionPattern<SignalOp> {
rewriter, op.getLoc(), notifyTy, notifyOpTok(op.getNotifyOp()));
SmallVector<Value> operands{*signalGT, peelUnrealized(adaptor.getValue()),
notifyOp};
// TNOTIFY writes the signal on the scalar pipe; drain prior MTE work first.
emitPipeBarrierAll(rewriter, op.getLoc());
rewriter.create<emitc::CallOpaqueOp>(op.getLoc(), TypeRange{}, callee,
ArrayAttr{}, ArrayAttr{}, operands);
rewriter.eraseOp(op);
Expand Down
57 changes: 57 additions & 0 deletions test/lit/pto/issue711_tnotify_mte_drain.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Copyright (c) 2026 Huawei Technologies Co., Ltd.
// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
// CANN Open Software License Agreement Version 2.0 (the "License").
// Please refer to the License for details. You may not use this file except in compliance with the License.
// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
// See LICENSE in the root of the software repository for the full text of the License.

// RUN: ptoas --pto-arch=a3 %s -o - 2>&1 | FileCheck %s

module {
// Regression kernel for the tnotify lowering: it performs a tile load and a
// tile store through partitioned tensor views and then issues
// pto.comm.tnotify. The FileCheck directives at the end of this file expect
// the emitted barrier call to sit on the line directly before the emitted
// TNOTIFY call, after the load and the store.
func.func @tnotify_drains_mte_before_signal(
%src_ptr: !pto.ptr<f32>,
%dst_ptr: !pto.ptr<f32>,
%signal_ptr: !pto.ptr<i32>)
attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
// Index constants used for view shapes, strides, offsets and sizes, plus the
// i32 value passed to the notify.
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c1_i32 = arith.constant 1 : i32

// 1x32 f32 tile buffer (loc=vec) used as the destination of the load and the
// source of the store below.
%tile = pto.alloc_tile :
!pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>

// Source: a [1, 32] view over %src_ptr with strides [32, 1], partitioned at
// offset [0, 0] into a static 1x32 window.
%src_view = pto.make_tensor_view %src_ptr,
shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view<?x?xf32>
%src = pto.partition_view %src_view,
offsets = [%c0, %c0], sizes = [%c1, %c32]
: !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x32xf32>

// Destination: same shape, strides and partitioning as the source, but over
// %dst_ptr.
%dst_view = pto.make_tensor_view %dst_ptr,
shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view<?x?xf32>
%dst = pto.partition_view %dst_view,
offsets = [%c0, %c0], sizes = [%c1, %c32]
: !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x32xf32>

// Load %src into the tile and store the tile to %dst so that data movement
// precedes the notify (presumably the work the barrier is meant to drain).
pto.tload ins(%src : !pto.partition_tensor_view<1x32xf32>)
outs(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tstore ins(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
outs(%dst : !pto.partition_tensor_view<1x32xf32>)

// Signal: a single-element i32 view over %signal_ptr, used as the notify's
// signal operand with notify op `set` and value %c1_i32.
%signal_view = pto.make_tensor_view %signal_ptr,
shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
%signal = pto.partition_view %signal_view,
offsets = [%c0], sizes = [%c1]
: !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
pto.comm.tnotify(%signal, %c1_i32 : !pto.partition_tensor_view<1xi32>, i32)
{notifyOp = #pto<notify_op set>}
return
}
}

// CHECK-LABEL: AICORE void tnotify_drains_mte_before_signal(
// CHECK: TLOAD(
// CHECK: TSTORE(
// CHECK: pipe_barrier(PIPE_ALL);
// CHECK-NEXT: pto::comm::TNOTIFY(
Loading