hw-native-sys · zhangstevenunity · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/include/PTO/Transforms/GraphSyncSolver/SyncSolverIRTranslator.h b/include/PTO/Transforms/GraphSyncSolver/SyncSolverIRTranslator.h
@@ -89,6 +89,8 @@ class IRTranslator {
   std::unique_ptr<OperationBase> getPipeInterfaceOp(pto::OpPipeInterface op,
                                                     OperationBase *parentOp);
 
+  template <typename OP> void appendP2PCommOps(OP op, Scope *parentOp); // TPUT/TGET -> one MTE2 and one MTE3 RWOperation appended to parentOp->body
+
   std::unique_ptr<OperationBase> getTensorExtractOp(tensor::ExtractOp extractOp,
                                                     OperationBase *parentOp);
 

diff --git a/include/PTO/Transforms/InsertSync/PTOIRTranslator.h b/include/PTO/Transforms/InsertSync/PTOIRTranslator.h
@@ -88,6 +88,13 @@ class PTOIRTranslator {
 
   // --- 核心：处理计算/搬运指令 (生成 Compound 节点) ---
   void UpdatePTOOpInfo(Operation *op);
+  void UpdateP2PCommOpInfo(pto::TPutOp op); // TPUT -> MTE2 + MTE3 compound nodes
+  void UpdateP2PCommOpInfo(pto::TGetOp op); // TGET -> MTE2 + MTE3 compound nodes
+  void AddPTOOpInfo(Operation *op, PipelineType pipe, ValueRange defs,
+                    ValueRange uses); // resolves defs/uses, then calls AddCompoundOpInfo
+  void AddCompoundOpInfo(Operation *op, PipelineType pipe,
+                         SmallVector<const BaseMemInfo *> defVec,
+                         SmallVector<const BaseMemInfo *> useVec); // builds the CompoundInstanceElement for op on pipe
 
   // --- 辅助函数 ---
 

diff --git a/lib/PTO/Transforms/GraphSyncSolver/SyncSolver.cpp b/lib/PTO/Transforms/GraphSyncSolver/SyncSolver.cpp
@@ -2322,6 +2322,12 @@ void Solver::processConflict(Occurrence *occ1, Occurrence *occ2,
   }
 }
 
+// Returns true when rwOp1 and rwOp2 are two distinct RWOperation entries that
+// wrap the same pto TPUT/TGET operation. A TPUT/TGET is translated into one
+// MTE2 entry and one MTE3 entry sharing the op, and processOrders() uses this
+// predicate to skip conflict processing between such a pair.
+// NOTE(review): only the underlying op is compared, so this presumably also
+// matches the MTE3 phase of one execution against the MTE2 phase of a later
+// execution of the same op (e.g. inside a loop) -- confirm that scratch reuse
+// across calls is protected elsewhere.
+static bool isInternalP2PCommPhasePair(RWOperation *rwOp1,
+                                       RWOperation *rwOp2) {
+  // An entry paired with itself is not a phase pair.
+  if (rwOp1 == rwOp2)
+    return false;
+
+  // Both entries must carry the same non-null underlying operation.
+  auto *sharedOp = rwOp1->op;
+  if (!sharedOp || sharedOp != rwOp2->op)
+    return false;
+
+  return isa<pto::TPutOp, pto::TGetOp>(sharedOp);
+}
+
 // Main processing loop that iterates processingOrders and attempts to
 // discover and record conflicts.
 void Solver::processOrders() {
@@ -2334,6 +2340,7 @@ void Solver::processOrders() {
     }
     if (checkImpossibleOccPair(occ1, occ2) || checkAlreadySynced(occ1, occ2) ||
         skipMMad1DecomposedLoopOpt(occ1, occ2) ||
+        isInternalP2PCommPhasePair(rwOp1, rwOp2) ||
         checkSkipParallelLoop(occ1, occ2) ||
         checkSkipCrossCorePair(occ1, occ2)) {
       continue;

diff --git a/lib/PTO/Transforms/GraphSyncSolver/SyncSolverIRTranslator.cpp b/lib/PTO/Transforms/GraphSyncSolver/SyncSolverIRTranslator.cpp
@@ -192,6 +192,25 @@ IRTranslator::getPipeInterfaceOp(pto::OpPipeInterface op,
       pipeWrite, reads, writes);
 }
 
+// Models a synchronous TPUT/TGET as two RWOperation entries appended to
+// parentOp->body, both wrapping the same underlying operation:
+//   - MTE2 phase: reads the source and writes the scratch buffers;
+//   - MTE3 phase: reads the scratch buffers and writes the destination.
+// The single call hides both phases; keeping them separate attributes scratch
+// writes to MTE2 and scratch reads to MTE3.
+template <typename OP>
+void IRTranslator::appendP2PCommOps(OP commOp, Scope *parentOp) {
+  auto *rawOp = commOp.getOperation();
+
+  // Ping is always recorded; pong only when the op carries one.
+  llvm::SmallVector<Value> stagingBufs;
+  stagingBufs.push_back(commOp.getPing());
+  if (Value pongBuf = commOp.getPong())
+    stagingBufs.push_back(pongBuf);
+
+  // Staging phase (MTE2): source -> scratch.
+  auto stagePhase = std::make_unique<RWOperation>(
+      rawOp, parentOp, TCoreType::CUBE_OR_VECTOR, pto::PIPE::PIPE_MTE2,
+      pto::PIPE::PIPE_MTE2, getMemoryOps({commOp.getSrc()}),
+      getMemoryOps(stagingBufs));
+  parentOp->body.push_back(std::move(stagePhase));
+
+  // Commit phase (MTE3): scratch -> destination.
+  auto commitPhase = std::make_unique<RWOperation>(
+      rawOp, parentOp, TCoreType::CUBE_OR_VECTOR, pto::PIPE::PIPE_MTE3,
+      pto::PIPE::PIPE_MTE3, getMemoryOps(stagingBufs),
+      getMemoryOps({commOp.getDst()}));
+  parentOp->body.push_back(std::move(commitPhase));
+}
+
 std::unique_ptr<OperationBase>
 IRTranslator::getTensorExtractOp(tensor::ExtractOp extractOp,
                                  OperationBase *parentOp) {
@@ -309,7 +328,11 @@ std::unique_ptr<Scope> IRTranslator::funcIrBuilder(Region &region,
         continue;
       }
 
-      if (auto pipeOp = dyn_cast<pto::OpPipeInterface>(op)) {
+      if (auto tputOp = dyn_cast<pto::TPutOp>(op)) {
+        appendP2PCommOps(tputOp, parScope);
+      } else if (auto tgetOp = dyn_cast<pto::TGetOp>(op)) {
+        appendP2PCommOps(tgetOp, parScope);
+      } else if (auto pipeOp = dyn_cast<pto::OpPipeInterface>(op)) {
         if (auto rw = getPipeInterfaceOp(pipeOp, parScope))
           parScope->body.push_back(std::move(rw));
       } else if (auto storeOp = dyn_cast<memref::StoreOp>(op)) {

diff --git a/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp b/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp
@@ -360,6 +360,10 @@ void PTOIRTranslator::RecursionIR(Region *region) {
       return WalkResult::skip();
     } else if (auto yieldOp = dyn_cast<scf::YieldOp>(op)) {
       UpdateYieldOpInfo(yieldOp);
+    } else if (auto tputOp = dyn_cast<pto::TPutOp>(op)) {
+      UpdateP2PCommOpInfo(tputOp);
+    } else if (auto tgetOp = dyn_cast<pto::TGetOp>(op)) {
+      UpdateP2PCommOpInfo(tgetOp);
     } else if (isa<pto::OpPipeInterface>(op)) {
       // --- Case D: 带有 OpPipeInterface 的计算/搬运指令 ---
       UpdatePTOOpInfo(op);
@@ -555,17 +559,59 @@ void PTOIRTranslator::UpdatePTOOpInfo(Operation *op) {
                             << " has Pipe but no MemoryEffects interface.\n");
   }
 
-  // 3. 构建 Compound Node
+  AddCompoundOpInfo(op, pipe, std::move(defVec), std::move(useVec));
+}
+
+// ============================================================================
+// 6. Model compound p2p communication ops
+// ============================================================================
+// TPUT/TGET hide a TLOAD-to-staging and TSTORE-from-staging sequence inside the
+// PTO-ISA helper. Model those pipe effects at the call boundary for auto-sync.
+// Registers a TPUT as two compound nodes on the same operation:
+//   - PIPE_MTE2: defines the ping/pong staging buffers, uses the source;
+//   - PIPE_MTE3: defines the destination, uses the staging buffers.
+void PTOIRTranslator::UpdateP2PCommOpInfo(pto::TPutOp op) {
+  Operation *rawOp = op.getOperation();
+  Value srcVal = op.getSrc();
+  Value dstVal = op.getDst();
+
+  // Ping is always recorded; pong only when the op carries one.
+  SmallVector<Value, 2> stagingBufs;
+  stagingBufs.push_back(op.getPing());
+  if (Value pongBuf = op.getPong())
+    stagingBufs.push_back(pongBuf);
+
+  AddPTOOpInfo(rawOp, PipelineType::PIPE_MTE2, /*defs=*/stagingBufs,
+               /*uses=*/ValueRange{srcVal});
+  AddPTOOpInfo(rawOp, PipelineType::PIPE_MTE3, /*defs=*/ValueRange{dstVal},
+               /*uses=*/stagingBufs);
+}
+
+// Registers a TGET as two compound nodes on the same operation:
+//   - PIPE_MTE2: defines the ping/pong staging buffers, uses the source;
+//   - PIPE_MTE3: defines the destination, uses the staging buffers.
+void PTOIRTranslator::UpdateP2PCommOpInfo(pto::TGetOp op) {
+  Operation *rawOp = op.getOperation();
+  Value srcVal = op.getSrc();
+  Value dstVal = op.getDst();
+
+  // Ping is always recorded; pong only when the op carries one.
+  SmallVector<Value, 2> stagingBufs;
+  stagingBufs.push_back(op.getPing());
+  if (Value pongBuf = op.getPong())
+    stagingBufs.push_back(pongBuf);
+
+  AddPTOOpInfo(rawOp, PipelineType::PIPE_MTE2, /*defs=*/stagingBufs,
+               /*uses=*/ValueRange{srcVal});
+  AddPTOOpInfo(rawOp, PipelineType::PIPE_MTE3, /*defs=*/ValueRange{dstVal},
+               /*uses=*/stagingBufs);
+}
+
+// Records one compound node for `op` on `pipe`. The `defs` and `uses` values
+// are first resolved to BaseMemInfo entries through UpdateDefUseVec and then
+// handed to AddCompoundOpInfo. Nothing is recorded when `pipe` is
+// PIPE_UNASSIGNED.
+void PTOIRTranslator::AddPTOOpInfo(Operation *op, PipelineType pipe,
+                                   ValueRange defs, ValueRange uses) {
+  if (pipe != pto::PipelineType::PIPE_UNASSIGNED) {
+    SmallVector<const BaseMemInfo *> definedMem;
+    UpdateDefUseVec(defs, definedMem);
+
+    SmallVector<const BaseMemInfo *> usedMem;
+    UpdateDefUseVec(uses, usedMem);
+
+    AddCompoundOpInfo(op, pipe, std::move(definedMem), std::move(usedMem));
+  }
+}
+
+void PTOIRTranslator::AddCompoundOpInfo(
+    Operation *op, PipelineType pipe, SmallVector<const BaseMemInfo *> defVec,
+    SmallVector<const BaseMemInfo *> useVec) {
   auto compoundElement = std::make_unique<CompoundInstanceElement>(
-      index, defVec, useVec, pipe, op->getName());
+      index, std::move(defVec), std::move(useVec), pipe, op->getName());
   compoundElement->elementOp = op;
 
-  // 4. 设置 Core Type (用于区分 Cube/Vector 资源)
-  // Matmul (M) 和 L1->L0 搬运 (MTE1) 通常涉及 Cube 资源
-  if (pipe == pto::PipelineType::PIPE_M || pipe == pto::PipelineType::PIPE_MTE1) {
+  if (pipe == pto::PipelineType::PIPE_M ||
+      pipe == pto::PipelineType::PIPE_MTE1) {
     compoundElement->compoundCoreType = pto::TCoreType::CUBE;
   } else {
-    // MTE2, MTE3, Vector 归类为 Vector Core (或者对应 MTE 资源)
     compoundElement->compoundCoreType = pto::TCoreType::VECTOR;
   }
 
@@ -574,7 +620,7 @@ void PTOIRTranslator::UpdatePTOOpInfo(Operation *op) {
 }
 
 // ============================================================================
-// 6. [P0 修改] 获取 Op 的 Pipeline 类型
+// 7. [P0 修改] 获取 Op 的 Pipeline 类型
 // ============================================================================
 pto::PipelineType PTOIRTranslator::getOpPipeline(Operation *op) {
   // 1. 优先尝试通过接口获取
@@ -589,7 +635,7 @@ pto::PipelineType PTOIRTranslator::getOpPipeline(Operation *op) {
 }
 
 // ============================================================================
-// 7. 控制流处理 (SCF Support)
+// 8. 控制流处理 (SCF Support)
 // ============================================================================
 
 void PTOIRTranslator::UpdateForOpInfo(scf::ForOp forOp) {
@@ -719,7 +765,7 @@ void PTOIRTranslator::UpdateYieldOpInfo(scf::YieldOp yieldOp) {
 }
 
 // ============================================================================
-// 8. 辅助函数
+// 9. 辅助函数
 // ============================================================================
 void PTOIRTranslator::UpdateAliasBufferInfo(Value result, Value source) {
   if (!result || !source) return;
@@ -940,7 +986,7 @@ void PTOIRTranslator::UpdateDefUseVec(ValueRange values, SmallVector<const BaseM
 }
 
 // ============================================================================
-// 9. 调试与打印支持
+// 10. 调试与打印支持
 // ============================================================================
 
 std::string PTOIRTranslator::getPipelineName(pto::PipelineType pipe) {

diff --git a/test/lit/pto/issue706_comm_p2p_insert_sync.pto b/test/lit/pto/issue706_comm_p2p_insert_sync.pto
@@ -0,0 +1,122 @@
+// RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync %s | FileCheck %s
+
+module attributes {pto.target_arch = "a2a3"} {
+  func.func @issue706_tput_waits_for_prior_tstore(
+      %input: !pto.ptr<f32>, %local_src: !pto.ptr<f32>,
+      %remote_dst: !pto.ptr<f32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0_i64 = arith.constant 0 : i64
+    %c4096_i64 = arith.constant 4096 : i64
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c64 = arith.constant 64 : index
+    // 1-D, 64-element f32 tensor views over the three pointer arguments.
+    %input_view = pto.make_tensor_view %input, shape = [%c64],
+                  strides = [%c1] {layout = #pto.layout<nd>}
+                : !pto.tensor_view<64xf32>
+    %src_view = pto.make_tensor_view %local_src, shape = [%c64],
+                strides = [%c1] {layout = #pto.layout<nd>}
+              : !pto.tensor_view<64xf32>
+    %dst_view = pto.make_tensor_view %remote_dst, shape = [%c64],
+                strides = [%c1] {layout = #pto.layout<nd>}
+              : !pto.tensor_view<64xf32>
+    // %src_store_part and %src_tput_part are identical partitions of %src_view.
+    %input_part = pto.partition_view %input_view, offsets = [%c0],
+                  sizes = [%c64]
+                : !pto.tensor_view<64xf32>
+                  -> !pto.partition_tensor_view<64xf32>
+    %src_store_part = pto.partition_view %src_view, offsets = [%c0],
+                      sizes = [%c64]
+                    : !pto.tensor_view<64xf32>
+                      -> !pto.partition_tensor_view<64xf32>
+    %src_tput_part = pto.partition_view %src_view, offsets = [%c0],
+                     sizes = [%c64]
+                   : !pto.tensor_view<64xf32>
+                     -> !pto.partition_tensor_view<64xf32>
+    %dst_part = pto.partition_view %dst_view, offsets = [%c0],
+                sizes = [%c64]
+              : !pto.tensor_view<64xf32>
+                -> !pto.partition_tensor_view<64xf32>
+    // Two vec tiles at addr 0 and 4096: %tile carries the data, %stage is tput's buf.
+    %tile = pto.alloc_tile addr = %c0_i64
+          : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %stage = pto.alloc_tile addr = %c4096_i64
+           : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    // tstore writes the %src_view region that tput is then given; the CHECK lines expect a PIPE_MTE3 -> PIPE_MTE2 set_flag right after TSTORE and its wait_flag right before TPUT.
+    pto.tload ins(%input_part : !pto.partition_tensor_view<64xf32>)
+              outs(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tstore ins(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+               outs(%src_store_part : !pto.partition_tensor_view<64xf32>)
+    pto.comm.tput(%dst_part, %src_tput_part, buf(%stage)
+                  : !pto.partition_tensor_view<64xf32>,
+                    !pto.partition_tensor_view<64xf32>,
+                    !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+        {atomicType = #pto<atomic_type atomic_none>}
+    return
+  }
+
+  func.func @issue706_tget_orders_later_tload(
+      %local_dst: !pto.ptr<f32>, %remote_src: !pto.ptr<f32>,
+      %sink: !pto.ptr<f32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0_i64 = arith.constant 0 : i64
+    %c4096_i64 = arith.constant 4096 : i64
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c64 = arith.constant 64 : index
+    // 1-D, 64-element f32 tensor views over the three pointer arguments.
+    %dst_view = pto.make_tensor_view %local_dst, shape = [%c64],
+                strides = [%c1] {layout = #pto.layout<nd>}
+              : !pto.tensor_view<64xf32>
+    %src_view = pto.make_tensor_view %remote_src, shape = [%c64],
+                strides = [%c1] {layout = #pto.layout<nd>}
+              : !pto.tensor_view<64xf32>
+    %sink_view = pto.make_tensor_view %sink, shape = [%c64],
+                 strides = [%c1] {layout = #pto.layout<nd>}
+               : !pto.tensor_view<64xf32>
+    // %dst_tget_part and %dst_load_part are identical partitions of %dst_view.
+    %dst_tget_part = pto.partition_view %dst_view, offsets = [%c0],
+                     sizes = [%c64]
+                   : !pto.tensor_view<64xf32>
+                     -> !pto.partition_tensor_view<64xf32>
+    %dst_load_part = pto.partition_view %dst_view, offsets = [%c0],
+                     sizes = [%c64]
+                   : !pto.tensor_view<64xf32>
+                     -> !pto.partition_tensor_view<64xf32>
+    %src_part = pto.partition_view %src_view, offsets = [%c0],
+                sizes = [%c64]
+              : !pto.tensor_view<64xf32>
+                -> !pto.partition_tensor_view<64xf32>
+    %sink_part = pto.partition_view %sink_view, offsets = [%c0],
+                 sizes = [%c64]
+               : !pto.tensor_view<64xf32>
+                 -> !pto.partition_tensor_view<64xf32>
+    // Two vec tiles at addr 0 and 4096: %stage is tget's buf, %tile is the tload target.
+    %stage = pto.alloc_tile addr = %c0_i64
+           : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %tile = pto.alloc_tile addr = %c4096_i64
+          : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    // tget is given the %dst_view region that the next tload reads; the CHECK lines expect a PIPE_MTE3 -> PIPE_MTE2 set_flag/wait_flag pair between TGET and TLOAD.
+    pto.comm.tget(%dst_tget_part, %src_part, buf(%stage)
+                  : !pto.partition_tensor_view<64xf32>,
+                    !pto.partition_tensor_view<64xf32>,
+                    !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tload ins(%dst_load_part : !pto.partition_tensor_view<64xf32>)
+              outs(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tstore ins(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+               outs(%sink_part : !pto.partition_tensor_view<64xf32>)
+    return
+  }
+}
+
+// CHECK-LABEL: AICORE void issue706_tput_waits_for_prior_tstore(
+// CHECK: TSTORE(
+// CHECK-NEXT: set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID[[TPUT_ID:[0-9]+]]);
+// CHECK: wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID[[TPUT_ID]]);
+// CHECK-NEXT: pto::comm::TPUT(
+
+// CHECK-LABEL: AICORE void issue706_tget_orders_later_tload(
+// CHECK: pto::comm::TGET(
+// CHECK-NEXT: set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID[[TGET_ID:[0-9]+]]);
+// CHECK-NEXT: wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID[[TGET_ID]]);
+// CHECK-NEXT: TLOAD(