Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,13 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
}
pipe_barrier(PIPE_ALL);

// Reentrant barrier cleanup: TWAIT has consumed this invocation's notifications.
// Zero the local signal slots so that the next invocation's wait cannot be
// satisfied by stale values (>= 1) left over from this invocation.
for (int i = 0; i < nranks; ++i) {
signal_base[i] = 0;
}
pipe_barrier(PIPE_ALL);

// ------------------------------------------------------------------
// Phase 3: gather — read each rank's scratch slot and write it into
// the corresponding slice of the output tensor.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,13 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
}
pipe_barrier(PIPE_ALL);

// Reentrant barrier cleanup: TWAIT has consumed this invocation's notifications.
// Zero the local signal slots so that the next invocation's wait cannot be
// satisfied by stale values (>= 1) left over from this invocation.
for (int i = 0; i < nranks; ++i) {
signal_base[i] = 0;
}
pipe_barrier(PIPE_ALL);

// ------------------------------------------------------------------
// Phase 3: compute — sum every rank's scratch slot into accTile.
// Start from my local scratch (no remote pointer needed), then add
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,13 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
}
pipe_barrier(PIPE_ALL);

// Reentrant barrier cleanup: TWAIT has consumed this invocation's notifications.
// Zero the local signal slots so that the next invocation's wait cannot be
// satisfied by stale values (>= 1) left over from this invocation.
for (int i = 0; i < nranks; ++i) {
signal_base[i] = 0;
}
pipe_barrier(PIPE_ALL);

// ------------------------------------------------------------------
// Phase 3: reduce — sum chunk my_rank from every rank's scratch into
// accTile. Start with my own copy, then add all peers via
Expand Down