From 60db4df7354461382a7afb0dfd9372ec2223ea2f Mon Sep 17 00:00:00 2001
From: georgebisbas <georgios.bismpas@h-partners.com>
Date: Tue, 26 May 2026 10:25:16 +0200
Subject: [PATCH 1/2] Fix: reset signal slots after Phase 2 barrier for
 reentrant collective kernels
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The TNOTIFY(AtomicAdd, 1) + TWAIT(GE, 1) barrier pattern leaves each
signal slot permanently at 1 after the first kernel invocation.  A second
call on the same HCCL window (e.g. inside a training loop) would find
signal_base[peer] >= 1 already, causing every TWAIT to return immediately
without actually synchronising — a silent correctness bug.

Fix: after all TWAIT calls complete and pipe_barrier(PIPE_ALL) drains
in-flight MTE2 reads, zero signal_base[0..nranks-1] in the local window
and flush with a second pipe_barrier(PIPE_ALL).

Safety of the reset:
  signal_base[i] in OUR window is where peer i writes its arrival
  notification.  TWAIT(signal_base[i], GE 1) returning guarantees that
  peer i's AtomicAdd DMA has already committed to our local memory; no
  further write to signal_base[i] can arrive from the current invocation.
  We are the exclusive writer of our own signal_base, so zeroing it here
  races with nothing.

Applied uniformly to all three collective kernels:
  - allreduce_distributed
  - allgather_distributed   (new, added in parent commit)
  - reduce_scatter_distributed  (new, added in parent commit)

Addresses the HIGH-priority barrier comment from the Gemini review on
https://github.com/hw-native-sys/simpler/pull/842
---
 .../kernels/aiv/allgather_kernel.cpp          | 19 +++++++++++++++++++
 .../kernels/aiv/allreduce_kernel.cpp          | 19 +++++++++++++++++++
 .../kernels/aiv/reduce_scatter_kernel.cpp     | 19 +++++++++++++++++++
 3 files changed, 57 insertions(+)

diff --git a/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp b/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp
index 20668e22b..23e76fe77 100644
--- a/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp
+++ b/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp
@@ -119,6 +119,25 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
     }
     pipe_barrier(PIPE_ALL);
 
+    // Reset signal slots — makes the barrier reentrant across multiple
+    // kernel invocations on the same HCCL window (e.g. a training loop).
+    //
+    // Safety argument:
+    //   signal_base[0..nranks-1] lives in OUR local window.  Peers write
+    //   to these slots via TNOTIFY (AtomicAdd).  The TWAIT loop above
+    //   returns only after each peer's write has been committed to our
+    //   local memory — no further DMA to signal_base[i] can arrive from
+    //   the current invocation once TWAIT(i) returns.  We are the sole
+    //   writer of our own signal_base; resetting to zero here (drained by
+    //   the pipe_barrier below) leaves the scratch region clean so that
+    //   the next invocation finds the expected initial state of zero
+    //   rather than the stale value of 1, which would cause TWAIT to
+    //   pass spuriously and skip synchronisation entirely.
+    for (int i = 0; i < nranks; ++i) {
+        signal_base[i] = 0;
+    }
+    pipe_barrier(PIPE_ALL);
+
     // ------------------------------------------------------------------
     // Phase 3: gather — read each rank's scratch slot and write it into
     // the corresponding slice of the output tensor.
diff --git a/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp b/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp
index 245161649..b064b2780 100644
--- a/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp
+++ b/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp
@@ -126,6 +126,25 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
     }
     pipe_barrier(PIPE_ALL);
 
+    // Reset signal slots — makes the barrier reentrant across multiple
+    // kernel invocations on the same HCCL window (e.g. a training loop).
+    //
+    // Safety argument:
+    //   signal_base[0..nranks-1] lives in OUR local window.  Peers write
+    //   to these slots via TNOTIFY (AtomicAdd).  The TWAIT loop above
+    //   returns only after each peer's write has been committed to our
+    //   local memory — no further DMA to signal_base[i] can arrive from
+    //   the current invocation once TWAIT(i) returns.  We are the sole
+    //   writer of our own signal_base; resetting to zero here (drained by
+    //   the pipe_barrier below) leaves the scratch region clean so that
+    //   the next invocation finds the expected initial state of zero
+    //   rather than the stale value of 1, which would cause TWAIT to
+    //   pass spuriously and skip synchronisation entirely.
+    for (int i = 0; i < nranks; ++i) {
+        signal_base[i] = 0;
+    }
+    pipe_barrier(PIPE_ALL);
+
     // ------------------------------------------------------------------
     // Phase 3: compute — sum every rank's scratch slot into accTile.
     // Start from my local scratch (no remote pointer needed), then add
diff --git a/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp b/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp
index 7e49e67f4..2ddacfd89 100644
--- a/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp
+++ b/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp
@@ -127,6 +127,25 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
     }
     pipe_barrier(PIPE_ALL);
 
+    // Reset signal slots — makes the barrier reentrant across multiple
+    // kernel invocations on the same HCCL window (e.g. a training loop).
+    //
+    // Safety argument:
+    //   signal_base[0..nranks-1] lives in OUR local window.  Peers write
+    //   to these slots via TNOTIFY (AtomicAdd).  The TWAIT loop above
+    //   returns only after each peer's write has been committed to our
+    //   local memory — no further DMA to signal_base[i] can arrive from
+    //   the current invocation once TWAIT(i) returns.  We are the sole
+    //   writer of our own signal_base; resetting to zero here (drained by
+    //   the pipe_barrier below) leaves the scratch region clean so that
+    //   the next invocation finds the expected initial state of zero
+    //   rather than the stale value of 1, which would cause TWAIT to
+    //   pass spuriously and skip synchronisation entirely.
+    for (int i = 0; i < nranks; ++i) {
+        signal_base[i] = 0;
+    }
+    pipe_barrier(PIPE_ALL);
+
     // ------------------------------------------------------------------
     // Phase 3: reduce — sum chunk my_rank from every rank's scratch into
     // accTile.  Start with my own copy, then add all peers via

From 9ae0fc877d94cc86b7c51f7e65f64719e3ed5194 Mon Sep 17 00:00:00 2001
From: georgebisbas <georgios.bismpas@h-partners.com>
Date: Tue, 26 May 2026 11:17:10 +0200
Subject: [PATCH 2/2] Chore: condense barrier-reset rationale comments to 2
 lines

Keep kernel source concise while preserving full safety rationale in prior
commit message on this branch (60db4df7). Behavior is unchanged.
---
 .../kernels/aiv/allgather_kernel.cpp             | 16 ++--------------
 .../kernels/aiv/allreduce_kernel.cpp             | 16 ++--------------
 .../kernels/aiv/reduce_scatter_kernel.cpp        | 16 ++--------------
 3 files changed, 6 insertions(+), 42 deletions(-)

diff --git a/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp b/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp
index 23e76fe77..f76df2e69 100644
--- a/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp
+++ b/examples/workers/l3/allgather_distributed/kernels/aiv/allgather_kernel.cpp
@@ -119,20 +119,8 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
     }
     pipe_barrier(PIPE_ALL);
 
-    // Reset signal slots — makes the barrier reentrant across multiple
-    // kernel invocations on the same HCCL window (e.g. a training loop).
-    //
-    // Safety argument:
-    //   signal_base[0..nranks-1] lives in OUR local window.  Peers write
-    //   to these slots via TNOTIFY (AtomicAdd).  The TWAIT loop above
-    //   returns only after each peer's write has been committed to our
-    //   local memory — no further DMA to signal_base[i] can arrive from
-    //   the current invocation once TWAIT(i) returns.  We are the sole
-    //   writer of our own signal_base; resetting to zero here (drained by
-    //   the pipe_barrier below) leaves the scratch region clean so that
-    //   the next invocation finds the expected initial state of zero
-    //   rather than the stale value of 1, which would cause TWAIT to
-    //   pass spuriously and skip synchronisation entirely.
+    // Reentrant barrier cleanup: TWAIT has consumed this invocation's notifications.
+    // Reset local signal slots so the next invocation cannot pass on stale >=1 values.
     for (int i = 0; i < nranks; ++i) {
         signal_base[i] = 0;
     }
diff --git a/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp b/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp
index b064b2780..27ef935a5 100644
--- a/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp
+++ b/examples/workers/l3/allreduce_distributed/kernels/aiv/allreduce_kernel.cpp
@@ -126,20 +126,8 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
     }
     pipe_barrier(PIPE_ALL);
 
-    // Reset signal slots — makes the barrier reentrant across multiple
-    // kernel invocations on the same HCCL window (e.g. a training loop).
-    //
-    // Safety argument:
-    //   signal_base[0..nranks-1] lives in OUR local window.  Peers write
-    //   to these slots via TNOTIFY (AtomicAdd).  The TWAIT loop above
-    //   returns only after each peer's write has been committed to our
-    //   local memory — no further DMA to signal_base[i] can arrive from
-    //   the current invocation once TWAIT(i) returns.  We are the sole
-    //   writer of our own signal_base; resetting to zero here (drained by
-    //   the pipe_barrier below) leaves the scratch region clean so that
-    //   the next invocation finds the expected initial state of zero
-    //   rather than the stale value of 1, which would cause TWAIT to
-    //   pass spuriously and skip synchronisation entirely.
+    // Reentrant barrier cleanup: TWAIT has consumed this invocation's notifications.
+    // Reset local signal slots so the next invocation cannot pass on stale >=1 values.
     for (int i = 0; i < nranks; ++i) {
         signal_base[i] = 0;
     }
diff --git a/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp b/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp
index 2ddacfd89..b8b8a241f 100644
--- a/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp
+++ b/examples/workers/l3/reduce_scatter_distributed/kernels/aiv/reduce_scatter_kernel.cpp
@@ -127,20 +127,8 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
     }
     pipe_barrier(PIPE_ALL);
 
-    // Reset signal slots — makes the barrier reentrant across multiple
-    // kernel invocations on the same HCCL window (e.g. a training loop).
-    //
-    // Safety argument:
-    //   signal_base[0..nranks-1] lives in OUR local window.  Peers write
-    //   to these slots via TNOTIFY (AtomicAdd).  The TWAIT loop above
-    //   returns only after each peer's write has been committed to our
-    //   local memory — no further DMA to signal_base[i] can arrive from
-    //   the current invocation once TWAIT(i) returns.  We are the sole
-    //   writer of our own signal_base; resetting to zero here (drained by
-    //   the pipe_barrier below) leaves the scratch region clean so that
-    //   the next invocation finds the expected initial state of zero
-    //   rather than the stale value of 1, which would cause TWAIT to
-    //   pass spuriously and skip synchronisation entirely.
+    // Reentrant barrier cleanup: TWAIT has consumed this invocation's notifications.
+    // Reset local signal slots so the next invocation cannot pass on stale >=1 values.
     for (int i = 0; i < nranks; ++i) {
         signal_base[i] = 0;
     }