diff --git a/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp b/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp index 20668e22b..f76df2e69 100644 --- a/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp +++ b/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp @@ -119,6 +119,13 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in } pipe_barrier(PIPE_ALL); + // Reentrant barrier cleanup: TWAIT has consumed this invocation's notifications. + // Reset local signal slots so the next invocation cannot pass on stale >=1 values. + for (int i = 0; i < nranks; ++i) { + signal_base[i] = 0; + } + pipe_barrier(PIPE_ALL); + // ------------------------------------------------------------------ // Phase 3: gather — read each rank's scratch slot and write it into // the corresponding slice of the output tensor. diff --git a/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp b/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp index 245161649..27ef935a5 100644 --- a/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp +++ b/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp @@ -126,6 +126,13 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in } pipe_barrier(PIPE_ALL); + // Reentrant barrier cleanup: TWAIT has consumed this invocation's notifications. + // Reset local signal slots so the next invocation cannot pass on stale >=1 values. + for (int i = 0; i < nranks; ++i) { + signal_base[i] = 0; + } + pipe_barrier(PIPE_ALL); + // ------------------------------------------------------------------ // Phase 3: compute — sum every rank's scratch slot into accTile. // Start from my local scratch (no remote pointer needed), then add diff --git a/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp b/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp index 7e49e67f4..b8b8a241f 100644 --- a/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp +++ b/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp @@ -127,6 +127,13 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in } pipe_barrier(PIPE_ALL); + // Reentrant barrier cleanup: TWAIT has consumed this invocation's notifications. + // Reset local signal slots so the next invocation cannot pass on stale >=1 values. + for (int i = 0; i < nranks; ++i) { + signal_base[i] = 0; + } + pipe_barrier(PIPE_ALL); + // ------------------------------------------------------------------ // Phase 3: reduce — sum chunk my_rank from every rank's scratch into // accTile. Start with my own copy, then add all peers via