From 60db4df7354461382a7afb0dfd9372ec2223ea2f Mon Sep 17 00:00:00 2001 From: georgebisbas Date: Tue, 26 May 2026 10:25:16 +0200 Subject: [PATCH 1/2] Fix: reset signal slots after Phase 2 barrier for reentrant collective kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TNOTIFY(AtomicAdd, 1) + TWAIT(GE, 1) barrier pattern leaves each signal slot permanently at 1 after the first kernel invocation. A second call on the same HCCL window (e.g. inside a training loop) would find signal_base[peer] >= 1 already, causing every TWAIT to return immediately without actually synchronising — a silent correctness bug. Fix: after all TWAIT calls complete and pipe_barrier(PIPE_ALL) drains in-flight MTE2 reads, zero signal_base[0..nranks-1] in the local window and flush with a second pipe_barrier(PIPE_ALL). Safety of the reset: signal_base[i] in OUR window is where peer i writes its arrival notification. TWAIT(signal_base[i], GE 1) returning guarantees that peer i's AtomicAdd DMA has already committed to our local memory; no further write to signal_base[i] can arrive from the current invocation. We are the exclusive writer of our own signal_base, so zeroing it here races with nothing. Applied uniformly to all three collective kernels: - allreduce_distributed - allgather_distributed (new, added in parent commit) - reduce_scatter_distributed (new, added in parent commit) Addresses the HIGH-priority barrier comment from the Gemini review on https://github.com/hw-native-sys/simpler/pull/842 --- .../kernels/aiv/allgather_kernel.cpp | 19 +++++++++++++++++++ .../kernels/aiv/allreduce_kernel.cpp | 19 +++++++++++++++++++ .../kernels/aiv/reduce_scatter_kernel.cpp | 19 +++++++++++++++++++ 3 files changed, 57 insertions(+) diff --git a/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp b/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp index 20668e22b..23e76fe77 100644 --- a/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp +++ b/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp @@ -119,6 +119,25 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in } pipe_barrier(PIPE_ALL); + // Reset signal slots — makes the barrier reentrant across multiple + // kernel invocations on the same HCCL window (e.g. a training loop). + // + // Safety argument: + // signal_base[0..nranks-1] lives in OUR local window. Peers write + // to these slots via TNOTIFY (AtomicAdd). The TWAIT loop above + // returns only after each peer's write has been committed to our + // local memory — no further DMA to signal_base[i] can arrive from + // the current invocation once TWAIT(i) returns. We are the sole + // writer of our own signal_base; resetting to zero here (drained by + // the pipe_barrier below) leaves the scratch region clean so that + // the next invocation finds the expected initial state of zero + // rather than the stale value of 1, which would cause TWAIT to + // pass spuriously and skip synchronisation entirely. + for (int i = 0; i < nranks; ++i) { + signal_base[i] = 0; + } + pipe_barrier(PIPE_ALL); + // ------------------------------------------------------------------ // Phase 3: gather — read each rank's scratch slot and write it into // the corresponding slice of the output tensor. diff --git a/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp b/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp index 245161649..b064b2780 100644 --- a/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp +++ b/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp @@ -126,6 +126,25 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in } pipe_barrier(PIPE_ALL); + // Reset signal slots — makes the barrier reentrant across multiple + // kernel invocations on the same HCCL window (e.g. a training loop). + // + // Safety argument: + // signal_base[0..nranks-1] lives in OUR local window. Peers write + // to these slots via TNOTIFY (AtomicAdd). The TWAIT loop above + // returns only after each peer's write has been committed to our + // local memory — no further DMA to signal_base[i] can arrive from + // the current invocation once TWAIT(i) returns. We are the sole + // writer of our own signal_base; resetting to zero here (drained by + // the pipe_barrier below) leaves the scratch region clean so that + // the next invocation finds the expected initial state of zero + // rather than the stale value of 1, which would cause TWAIT to + // pass spuriously and skip synchronisation entirely. + for (int i = 0; i < nranks; ++i) { + signal_base[i] = 0; + } + pipe_barrier(PIPE_ALL); + // ------------------------------------------------------------------ // Phase 3: compute — sum every rank's scratch slot into accTile. // Start from my local scratch (no remote pointer needed), then add diff --git a/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp b/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp index 7e49e67f4..2ddacfd89 100644 --- a/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp +++ b/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp @@ -127,6 +127,25 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in } pipe_barrier(PIPE_ALL); + // Reset signal slots — makes the barrier reentrant across multiple + // kernel invocations on the same HCCL window (e.g. a training loop). + // + // Safety argument: + // signal_base[0..nranks-1] lives in OUR local window. Peers write + // to these slots via TNOTIFY (AtomicAdd). The TWAIT loop above + // returns only after each peer's write has been committed to our + // local memory — no further DMA to signal_base[i] can arrive from + // the current invocation once TWAIT(i) returns. We are the sole + // writer of our own signal_base; resetting to zero here (drained by + // the pipe_barrier below) leaves the scratch region clean so that + // the next invocation finds the expected initial state of zero + // rather than the stale value of 1, which would cause TWAIT to + // pass spuriously and skip synchronisation entirely. + for (int i = 0; i < nranks; ++i) { + signal_base[i] = 0; + } + pipe_barrier(PIPE_ALL); + // ------------------------------------------------------------------ // Phase 3: reduce — sum chunk my_rank from every rank's scratch into // accTile. Start with my own copy, then add all peers via From 9ae0fc877d94cc86b7c51f7e65f64719e3ed5194 Mon Sep 17 00:00:00 2001 From: georgebisbas Date: Tue, 26 May 2026 11:17:10 +0200 Subject: [PATCH 2/2] Chore: condense barrier-reset rationale comments to 2 lines Keep kernel source concise while preserving full safety rationale in prior commit message on this branch (60db4df7). Behavior is unchanged. --- .../kernels/aiv/allgather_kernel.cpp | 16 ++-------------- .../kernels/aiv/allreduce_kernel.cpp | 16 ++-------------- .../kernels/aiv/reduce_scatter_kernel.cpp | 16 ++-------------- 3 files changed, 6 insertions(+), 42 deletions(-) diff --git a/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp b/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp index 23e76fe77..f76df2e69 100644 --- a/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp +++ b/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp @@ -119,20 +119,8 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in } pipe_barrier(PIPE_ALL); - // Reset signal slots — makes the barrier reentrant across multiple - // kernel invocations on the same HCCL window (e.g. a training loop). - // - // Safety argument: - // signal_base[0..nranks-1] lives in OUR local window. Peers write - // to these slots via TNOTIFY (AtomicAdd). The TWAIT loop above - // returns only after each peer's write has been committed to our - // local memory — no further DMA to signal_base[i] can arrive from - // the current invocation once TWAIT(i) returns. We are the sole - // writer of our own signal_base; resetting to zero here (drained by - // the pipe_barrier below) leaves the scratch region clean so that - // the next invocation finds the expected initial state of zero - // rather than the stale value of 1, which would cause TWAIT to - // pass spuriously and skip synchronisation entirely. + // Reentrant barrier cleanup: TWAIT has consumed this invocation's notifications. + // Reset local signal slots so the next invocation cannot pass on stale >=1 values. for (int i = 0; i < nranks; ++i) { signal_base[i] = 0; } diff --git a/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp b/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp index b064b2780..27ef935a5 100644 --- a/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp +++ b/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp @@ -126,20 +126,8 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in } pipe_barrier(PIPE_ALL); - // Reset signal slots — makes the barrier reentrant across multiple - // kernel invocations on the same HCCL window (e.g. a training loop). - // - // Safety argument: - // signal_base[0..nranks-1] lives in OUR local window. Peers write - // to these slots via TNOTIFY (AtomicAdd). The TWAIT loop above - // returns only after each peer's write has been committed to our - // local memory — no further DMA to signal_base[i] can arrive from - // the current invocation once TWAIT(i) returns. We are the sole - // writer of our own signal_base; resetting to zero here (drained by - // the pipe_barrier below) leaves the scratch region clean so that - // the next invocation finds the expected initial state of zero - // rather than the stale value of 1, which would cause TWAIT to - // pass spuriously and skip synchronisation entirely. + // Reentrant barrier cleanup: TWAIT has consumed this invocation's notifications. + // Reset local signal slots so the next invocation cannot pass on stale >=1 values. for (int i = 0; i < nranks; ++i) { signal_base[i] = 0; } diff --git a/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp b/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp index 2ddacfd89..b8b8a241f 100644 --- a/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp +++ b/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp @@ -127,20 +127,8 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in } pipe_barrier(PIPE_ALL); - // Reset signal slots — makes the barrier reentrant across multiple - // kernel invocations on the same HCCL window (e.g. a training loop). - // - // Safety argument: - // signal_base[0..nranks-1] lives in OUR local window. Peers write - // to these slots via TNOTIFY (AtomicAdd). The TWAIT loop above - // returns only after each peer's write has been committed to our - // local memory — no further DMA to signal_base[i] can arrive from - // the current invocation once TWAIT(i) returns. We are the sole - // writer of our own signal_base; resetting to zero here (drained by - // the pipe_barrier below) leaves the scratch region clean so that - // the next invocation finds the expected initial state of zero - // rather than the stale value of 1, which would cause TWAIT to - // pass spuriously and skip synchronisation entirely. + // Reentrant barrier cleanup: TWAIT has consumed this invocation's notifications. + // Reset local signal slots so the next invocation cannot pass on stale >=1 values. for (int i = 0; i < nranks; ++i) { signal_base[i] = 0; }