Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ class IRTranslator {
std::unique_ptr<OperationBase> getPipeInterfaceOp(pto::OpPipeInterface op,
OperationBase *parentOp);

// Appends two RWOperation entries for a point-to-point comm op (TPUT/TGET) to
// `parentOp->body`: a PIPE_MTE2 entry that reads the op's source and writes its
// ping/pong staging buffers, followed by a PIPE_MTE3 entry that reads the
// staging buffers and writes the op's destination.
template <typename OP> void appendP2PCommOps(OP op, Scope *parentOp);

std::unique_ptr<OperationBase> getTensorExtractOp(tensor::ExtractOp extractOp,
OperationBase *parentOp);

Expand Down
7 changes: 7 additions & 0 deletions include/PTO/Transforms/InsertSync/PTOIRTranslator.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,13 @@ class PTOIRTranslator {

// --- Core: handle compute / data-movement instructions (emits Compound nodes) ---
void UpdatePTOOpInfo(Operation *op);
// Records a pto::TPutOp as two compound entries on the same operation:
// a PIPE_MTE2 entry (defs = ping/pong staging buffers, uses = src) followed by
// a PIPE_MTE3 entry (defs = dst, uses = ping/pong staging buffers).
void UpdateP2PCommOpInfo(pto::TPutOp op);
// Same two-entry PIPE_MTE2 / PIPE_MTE3 modelling, applied to a pto::TGetOp.
void UpdateP2PCommOpInfo(pto::TGetOp op);
// Resolves `defs` and `uses` through UpdateDefUseVec and forwards the results
// to AddCompoundOpInfo. Does nothing when `pipe` is PIPE_UNASSIGNED.
void AddPTOOpInfo(Operation *op, PipelineType pipe, ValueRange defs,
ValueRange uses);
// Creates the CompoundInstanceElement for `op` on `pipe` from the already
// resolved def/use memory infos.
void AddCompoundOpInfo(Operation *op, PipelineType pipe,
SmallVector<const BaseMemInfo *> defVec,
SmallVector<const BaseMemInfo *> useVec);

// --- 辅助函数 ---

Expand Down
7 changes: 7 additions & 0 deletions lib/PTO/Transforms/GraphSyncSolver/SyncSolver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2322,6 +2322,12 @@ void Solver::processConflict(Occurrence *occ1, Occurrence *occ2,
}
}

// Returns true when `rwOp1` and `rwOp2` are two distinct RWOperation entries
// that wrap the same non-null operation and that operation is a pto::TPutOp or
// pto::TGetOp. The caller uses this to skip conflict processing for such a
// pair.
static bool isInternalP2PCommPhasePair(RWOperation *rwOp1,
                                       RWOperation *rwOp2) {
  // The very same entry is never a "pair".
  if (rwOp1 == rwOp2)
    return false;

  // Both entries must refer to one and the same underlying operation.
  auto *sharedOp = rwOp1->op;
  if (!sharedOp || sharedOp != rwOp2->op)
    return false;

  return isa<pto::TPutOp, pto::TGetOp>(sharedOp);
}

// Main processing loop that iterates processingOrders and attempts to
// discover and record conflicts.
void Solver::processOrders() {
Expand All @@ -2334,6 +2340,7 @@ void Solver::processOrders() {
}
if (checkImpossibleOccPair(occ1, occ2) || checkAlreadySynced(occ1, occ2) ||
skipMMad1DecomposedLoopOpt(occ1, occ2) ||
isInternalP2PCommPhasePair(rwOp1, rwOp2) ||
checkSkipParallelLoop(occ1, occ2) ||
checkSkipCrossCorePair(occ1, occ2)) {
continue;
Expand Down
25 changes: 24 additions & 1 deletion lib/PTO/Transforms/GraphSyncSolver/SyncSolverIRTranslator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,25 @@ IRTranslator::getPipeInterfaceOp(pto::OpPipeInterface op,
pipeWrite, reads, writes);
}

template <typename OP>
void IRTranslator::appendP2PCommOps(OP commOp, Scope *parentOp) {
llvm::SmallVector<Value> scratch{commOp.getPing()};
if (Value pong = commOp.getPong())
scratch.push_back(pong);

// Synchronous TPUT/TGET hide MTE2 staging and MTE3 commit inside one call.
// Keep the phases separate so scratch writes are attributed to MTE2 and
// scratch reads are attributed to MTE3.
parentOp->body.push_back(std::make_unique<RWOperation>(
commOp.getOperation(), parentOp, TCoreType::CUBE_OR_VECTOR,
pto::PIPE::PIPE_MTE2, pto::PIPE::PIPE_MTE2,
getMemoryOps({commOp.getSrc()}), getMemoryOps(scratch)));
parentOp->body.push_back(std::make_unique<RWOperation>(
commOp.getOperation(), parentOp, TCoreType::CUBE_OR_VECTOR,
pto::PIPE::PIPE_MTE3, pto::PIPE::PIPE_MTE3, getMemoryOps(scratch),
getMemoryOps({commOp.getDst()})));
}

std::unique_ptr<OperationBase>
IRTranslator::getTensorExtractOp(tensor::ExtractOp extractOp,
OperationBase *parentOp) {
Expand Down Expand Up @@ -309,7 +328,11 @@ std::unique_ptr<Scope> IRTranslator::funcIrBuilder(Region &region,
continue;
}

if (auto pipeOp = dyn_cast<pto::OpPipeInterface>(op)) {
if (auto tputOp = dyn_cast<pto::TPutOp>(op)) {
appendP2PCommOps(tputOp, parScope);
} else if (auto tgetOp = dyn_cast<pto::TGetOp>(op)) {
appendP2PCommOps(tgetOp, parScope);
} else if (auto pipeOp = dyn_cast<pto::OpPipeInterface>(op)) {
if (auto rw = getPipeInterfaceOp(pipeOp, parScope))
parScope->body.push_back(std::move(rw));
} else if (auto storeOp = dyn_cast<memref::StoreOp>(op)) {
Expand Down
66 changes: 56 additions & 10 deletions lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,10 @@ void PTOIRTranslator::RecursionIR(Region *region) {
return WalkResult::skip();
} else if (auto yieldOp = dyn_cast<scf::YieldOp>(op)) {
UpdateYieldOpInfo(yieldOp);
} else if (auto tputOp = dyn_cast<pto::TPutOp>(op)) {
UpdateP2PCommOpInfo(tputOp);
} else if (auto tgetOp = dyn_cast<pto::TGetOp>(op)) {
UpdateP2PCommOpInfo(tgetOp);
} else if (isa<pto::OpPipeInterface>(op)) {
// --- Case D: 带有 OpPipeInterface 的计算/搬运指令 ---
UpdatePTOOpInfo(op);
Expand Down Expand Up @@ -555,17 +559,59 @@ void PTOIRTranslator::UpdatePTOOpInfo(Operation *op) {
<< " has Pipe but no MemoryEffects interface.\n");
}

// 3. 构建 Compound Node
AddCompoundOpInfo(op, pipe, std::move(defVec), std::move(useVec));
}

// ============================================================================
// 6. Model compound p2p communication ops
// ============================================================================
// TPUT/TGET hide a TLOAD-to-staging and TSTORE-from-staging sequence inside the
// PTO-ISA helper. Model those pipe effects at the call boundary for auto-sync.
// Records a TPUT as two pipe-level entries on the same operation:
//   - PIPE_MTE2: defs = ping/pong staging buffers, uses = src
//   - PIPE_MTE3: defs = dst,                       uses = staging buffers
// NOTE(review): both entries share the same operation; confirm the downstream
// sync analysis copes with a def/use pair that lives inside a single op.
void PTOIRTranslator::UpdateP2PCommOpInfo(pto::TPutOp op) {
  Operation *commOp = op.getOperation();

  // Ping is always present; pong is appended only when the op provides it.
  SmallVector<Value, 2> stagingBufs;
  stagingBufs.push_back(op.getPing());
  Value pong = op.getPong();
  if (pong)
    stagingBufs.push_back(pong);

  // The single-value ranges are built inline so that their backing storage
  // lives for the whole call.
  AddPTOOpInfo(commOp, PipelineType::PIPE_MTE2, /*defs=*/stagingBufs,
               /*uses=*/ValueRange{op.getSrc()});
  AddPTOOpInfo(commOp, PipelineType::PIPE_MTE3,
               /*defs=*/ValueRange{op.getDst()}, /*uses=*/stagingBufs);
}

// Records a TGET as two pipe-level entries on the same operation:
//   - PIPE_MTE2: defs = ping/pong staging buffers, uses = src
//   - PIPE_MTE3: defs = dst,                       uses = staging buffers
void PTOIRTranslator::UpdateP2PCommOpInfo(pto::TGetOp op) {
  Operation *commOp = op.getOperation();

  // Ping is always present; pong is appended only when the op provides it.
  SmallVector<Value, 2> stagingBufs;
  stagingBufs.push_back(op.getPing());
  Value pong = op.getPong();
  if (pong)
    stagingBufs.push_back(pong);

  // The single-value ranges are built inline so that their backing storage
  // lives for the whole call.
  AddPTOOpInfo(commOp, PipelineType::PIPE_MTE2, /*defs=*/stagingBufs,
               /*uses=*/ValueRange{op.getSrc()});
  AddPTOOpInfo(commOp, PipelineType::PIPE_MTE3,
               /*defs=*/ValueRange{op.getDst()}, /*uses=*/stagingBufs);
}

// Resolves the given def/use values through UpdateDefUseVec and hands the
// results to AddCompoundOpInfo for `op` on `pipe`.
// Nothing is recorded when `pipe` is PIPE_UNASSIGNED.
void PTOIRTranslator::AddPTOOpInfo(Operation *op, PipelineType pipe,
                                   ValueRange defs, ValueRange uses) {
  if (pipe != pto::PipelineType::PIPE_UNASSIGNED) {
    // Defs are resolved before uses, in that order.
    SmallVector<const BaseMemInfo *> defInfos;
    UpdateDefUseVec(defs, defInfos);

    SmallVector<const BaseMemInfo *> useInfos;
    UpdateDefUseVec(uses, useInfos);

    AddCompoundOpInfo(op, pipe, std::move(defInfos), std::move(useInfos));
  }
}

void PTOIRTranslator::AddCompoundOpInfo(
Operation *op, PipelineType pipe, SmallVector<const BaseMemInfo *> defVec,
SmallVector<const BaseMemInfo *> useVec) {
auto compoundElement = std::make_unique<CompoundInstanceElement>(
index, defVec, useVec, pipe, op->getName());
index, std::move(defVec), std::move(useVec), pipe, op->getName());
compoundElement->elementOp = op;

// 4. 设置 Core Type (用于区分 Cube/Vector 资源)
// Matmul (M) 和 L1->L0 搬运 (MTE1) 通常涉及 Cube 资源
if (pipe == pto::PipelineType::PIPE_M || pipe == pto::PipelineType::PIPE_MTE1) {
if (pipe == pto::PipelineType::PIPE_M ||
pipe == pto::PipelineType::PIPE_MTE1) {
compoundElement->compoundCoreType = pto::TCoreType::CUBE;
} else {
// MTE2, MTE3, Vector 归类为 Vector Core (或者对应 MTE 资源)
compoundElement->compoundCoreType = pto::TCoreType::VECTOR;
}

Expand All @@ -574,7 +620,7 @@ void PTOIRTranslator::UpdatePTOOpInfo(Operation *op) {
}

// ============================================================================
// 6. [P0 修改] 获取 Op 的 Pipeline 类型
// 7. [P0 修改] 获取 Op 的 Pipeline 类型
// ============================================================================
pto::PipelineType PTOIRTranslator::getOpPipeline(Operation *op) {
// 1. 优先尝试通过接口获取
Expand All @@ -589,7 +635,7 @@ pto::PipelineType PTOIRTranslator::getOpPipeline(Operation *op) {
}

// ============================================================================
// 7. 控制流处理 (SCF Support)
// 8. 控制流处理 (SCF Support)
// ============================================================================

void PTOIRTranslator::UpdateForOpInfo(scf::ForOp forOp) {
Expand Down Expand Up @@ -719,7 +765,7 @@ void PTOIRTranslator::UpdateYieldOpInfo(scf::YieldOp yieldOp) {
}

// ============================================================================
// 8. 辅助函数
// 9. 辅助函数
// ============================================================================
void PTOIRTranslator::UpdateAliasBufferInfo(Value result, Value source) {
if (!result || !source) return;
Expand Down Expand Up @@ -940,7 +986,7 @@ void PTOIRTranslator::UpdateDefUseVec(ValueRange values, SmallVector<const BaseM
}

// ============================================================================
// 9. 调试与打印支持
// 10. 调试与打印支持
// ============================================================================

std::string PTOIRTranslator::getPipelineName(pto::PipelineType pipe) {
Expand Down
122 changes: 122 additions & 0 deletions test/lit/pto/issue706_comm_p2p_insert_sync.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
// RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync %s | FileCheck %s

module attributes {pto.target_arch = "a2a3"} {
// Issue 706 regression, TPUT side.
// A tload/tstore pair first moves %input through %tile into %src_store_part, a
// partition of %src_view. pto.comm.tput is then given %src_tput_part, a second
// partition of the same view with identical offsets and sizes, together with
// %stage as its staging tile. The FileCheck expectations at the end of this
// file require a PIPE_MTE3 -> PIPE_MTE2 set_flag/wait_flag pair between the
// emitted TSTORE and TPUT.
func.func @issue706_tput_waits_for_prior_tstore(
%input: !pto.ptr<f32>, %local_src: !pto.ptr<f32>,
%remote_dst: !pto.ptr<f32>)
attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
%c0_i64 = arith.constant 0 : i64
%c4096_i64 = arith.constant 4096 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index

// One 64-element f32 view per pointer argument.
%input_view = pto.make_tensor_view %input, shape = [%c64],
strides = [%c1] {layout = #pto.layout<nd>}
: !pto.tensor_view<64xf32>
%src_view = pto.make_tensor_view %local_src, shape = [%c64],
strides = [%c1] {layout = #pto.layout<nd>}
: !pto.tensor_view<64xf32>
%dst_view = pto.make_tensor_view %remote_dst, shape = [%c64],
strides = [%c1] {layout = #pto.layout<nd>}
: !pto.tensor_view<64xf32>

// %src_store_part and %src_tput_part are distinct SSA values over the same
// range of %src_view.
%input_part = pto.partition_view %input_view, offsets = [%c0],
sizes = [%c64]
: !pto.tensor_view<64xf32>
-> !pto.partition_tensor_view<64xf32>
%src_store_part = pto.partition_view %src_view, offsets = [%c0],
sizes = [%c64]
: !pto.tensor_view<64xf32>
-> !pto.partition_tensor_view<64xf32>
%src_tput_part = pto.partition_view %src_view, offsets = [%c0],
sizes = [%c64]
: !pto.tensor_view<64xf32>
-> !pto.partition_tensor_view<64xf32>
%dst_part = pto.partition_view %dst_view, offsets = [%c0],
sizes = [%c64]
: !pto.tensor_view<64xf32>
-> !pto.partition_tensor_view<64xf32>

// Data tile at address 0 and staging tile at address 4096.
%tile = pto.alloc_tile addr = %c0_i64
: !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%stage = pto.alloc_tile addr = %c4096_i64
: !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>

pto.tload ins(%input_part : !pto.partition_tensor_view<64xf32>)
outs(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tstore ins(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
outs(%src_store_part : !pto.partition_tensor_view<64xf32>)
// The comm op under test; it follows the tstore into the same view.
pto.comm.tput(%dst_part, %src_tput_part, buf(%stage)
: !pto.partition_tensor_view<64xf32>,
!pto.partition_tensor_view<64xf32>,
!pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
{atomicType = #pto<atomic_type atomic_none>}
return
}

// Issue 706 regression, TGET side.
// pto.comm.tget is given %dst_tget_part, a partition of %dst_view, with %stage
// as its staging tile. The tload that follows reads %dst_load_part, a second
// partition of the same view with identical offsets and sizes. The FileCheck
// expectations at the end of this file require a PIPE_MTE3 -> PIPE_MTE2
// set_flag/wait_flag pair directly between the emitted TGET and TLOAD.
func.func @issue706_tget_orders_later_tload(
%local_dst: !pto.ptr<f32>, %remote_src: !pto.ptr<f32>,
%sink: !pto.ptr<f32>)
attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
%c0_i64 = arith.constant 0 : i64
%c4096_i64 = arith.constant 4096 : i64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index

// One 64-element f32 view per pointer argument.
%dst_view = pto.make_tensor_view %local_dst, shape = [%c64],
strides = [%c1] {layout = #pto.layout<nd>}
: !pto.tensor_view<64xf32>
%src_view = pto.make_tensor_view %remote_src, shape = [%c64],
strides = [%c1] {layout = #pto.layout<nd>}
: !pto.tensor_view<64xf32>
%sink_view = pto.make_tensor_view %sink, shape = [%c64],
strides = [%c1] {layout = #pto.layout<nd>}
: !pto.tensor_view<64xf32>

// %dst_tget_part and %dst_load_part are distinct SSA values over the same
// range of %dst_view.
%dst_tget_part = pto.partition_view %dst_view, offsets = [%c0],
sizes = [%c64]
: !pto.tensor_view<64xf32>
-> !pto.partition_tensor_view<64xf32>
%dst_load_part = pto.partition_view %dst_view, offsets = [%c0],
sizes = [%c64]
: !pto.tensor_view<64xf32>
-> !pto.partition_tensor_view<64xf32>
%src_part = pto.partition_view %src_view, offsets = [%c0],
sizes = [%c64]
: !pto.tensor_view<64xf32>
-> !pto.partition_tensor_view<64xf32>
%sink_part = pto.partition_view %sink_view, offsets = [%c0],
sizes = [%c64]
: !pto.tensor_view<64xf32>
-> !pto.partition_tensor_view<64xf32>

// Staging tile at address 0 and data tile at address 4096.
%stage = pto.alloc_tile addr = %c0_i64
: !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%tile = pto.alloc_tile addr = %c4096_i64
: !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>

// The comm op under test; the tload below reads from the same view.
pto.comm.tget(%dst_tget_part, %src_part, buf(%stage)
: !pto.partition_tensor_view<64xf32>,
!pto.partition_tensor_view<64xf32>,
!pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tload ins(%dst_load_part : !pto.partition_tensor_view<64xf32>)
outs(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tstore ins(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=64, v_row=1, v_col=64, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
outs(%sink_part : !pto.partition_tensor_view<64xf32>)
return
}
}

// CHECK-LABEL: AICORE void issue706_tput_waits_for_prior_tstore(
// CHECK: TSTORE(
// CHECK-NEXT: set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID[[TPUT_ID:[0-9]+]]);
// CHECK: wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID[[TPUT_ID]]);
// CHECK-NEXT: pto::comm::TPUT(

// CHECK-LABEL: AICORE void issue706_tget_orders_later_tload(
// CHECK: pto::comm::TGET(
// CHECK-NEXT: set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID[[TGET_ID:[0-9]+]]);
// CHECK-NEXT: wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID[[TGET_ID]]);
// CHECK-NEXT: TLOAD(
Loading
Loading