From 411912e766d0625750534c7f5bdce2dabffa2b85 Mon Sep 17 00:00:00 2001
From: zhangqi-chen <wshchzhq@gmail.com>
Date: Fri, 22 May 2026 09:58:09 +0800
Subject: [PATCH 1/2] Update: ep_dispatch_combine idx channel uses INT32
 TROWSUM compaction

The idx stage-out channel previously fell back to a scalar GM copy of
column 0 because INT32 TROWSUM hung on a2a3 hardware. Now that pto-isa
supports INT32 TROWSUM, switch the idx channel to the same
TLOAD + TROWSUM + TSTORE compaction already used for the FP32 weight
channel, restoring symmetry between the two channels.

Refs hw-native-sys/pto-isa#119
---
 .../kernels/aiv/dispatch.cpp                  | 37 +++++++++++--------
 .../workers/l3/ep_dispatch_combine/main.py    |  2 +-
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/examples/workers/l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp b/examples/workers/l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp
index 547a759b6..1ee637655 100644
--- a/examples/workers/l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp
+++ b/examples/workers/l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp
@@ -33,7 +33,8 @@
  *                                                      recovers column-0 values
  *                                                      because columns [1, W_PAD)
  *                                                      are zero by design);
- *                            recv_idx_out is [L, R]  (scalar copy of column 0)
+ *                            recv_idx_out is [L, R]  (TROWSUM over [L,R,IDX_PAD]
+ *                                                      wide window, INT32)
  *
  * Design notes:
  *   - All cross-rank GM writes go through tile primitives (TPUT). No AIV
@@ -44,9 +45,8 @@
  *   - Weight uses TROWSUM along the W_PAD axis to compact the wide window
  *     [L, R, W_PAD] → [L, R] FP32: sum-of-row recovers slot [0] because the
  *     other lanes are zero. One TLOAD + TROWSUM + TSTORE per expert.
- *   - Idx uses scalar GM copy of column 0 to compact [L, R, IDX_PAD] →
- *     [L, R] INT32. INT32 TROWSUM exists in pto-isa but hangs on a2a3 in
- *     this configuration; the L*R = 128 scalar stores are negligible.
+ *   - Idx uses the same TROWSUM compaction along the IDX_PAD axis to compact
+ *     [L, R, IDX_PAD] → [L, R] INT32. One TLOAD + TROWSUM + TSTORE per expert.
  */
 
 #ifndef __gm__
@@ -527,18 +527,25 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
         wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1);
     }
 
-    // Stage out idx: scalar copy of column 0 from the wide window.
-    //
-    // ⚠ The same TROWSUM compaction used above for the FP32 weight channel
-    // does NOT work reliably for INT32 on a2a3: pto-isa declares INT32
-    // TROWSUM support, but with the same [R, IDX_PAD] / Layout::DN setup
-    // the kernel hangs on hardware. Until that path is stabilized, fall
-    // back to a scalar copy here. Volume is small (L*R = 128 INT32 stores)
-    // so the perf cost is negligible.
+    // Stage out idx: same TROWSUM compaction as the weight channel, on the
+    // INT32 [R, IDX_PAD] wide window. sum-along-PAD recovers slot [0] because
+    // columns [1, IDX_PAD) are zero by design.
     for (int e = 0; e < L; ++e) {
-        for (int slot = 0; slot < R; ++slot) {
-            recv_idx_out[e * R + slot] = recv_idx_local[(e * R + slot) * IDX_PAD];
-        }
+        __gm__ int32_t *idx_win = recv_idx_local + e * R * IDX_PAD;
+        __gm__ int32_t *idx_out = recv_idx_out + e * R;
+        IWideG idx_win_g(idx_win);
+        ISumG idx_out_g(idx_out);
+        TLOAD(idx_wide_tile, idx_win_g);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+        pipe_barrier(PIPE_V);
+        TROWSUM(idx_sum_tile, idx_wide_tile, idx_tmp_tile);
+        pipe_barrier(PIPE_V);
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+        TSTORE(idx_out_g, idx_sum_tile);
+        set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1);
+        wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1);
     }
     pipe_barrier(PIPE_ALL);
 }
diff --git a/examples/workers/l3/ep_dispatch_combine/main.py b/examples/workers/l3/ep_dispatch_combine/main.py
index 54f674d25..9f5d5ed25 100644
--- a/examples/workers/l3/ep_dispatch_combine/main.py
+++ b/examples/workers/l3/ep_dispatch_combine/main.py
@@ -43,7 +43,7 @@
     [weight, 0, 0, …, 0]; receiver writes recv_w[loc_e][slot, :W_PAD]
     and the kernel TROWSUM-compacts to a [L, R] FP32 host output.
   - Idx uses the same minimum-tile rationale: 1xIDX_PAD=8 INT32 per
-    route, actual r=t*TOPK+k at slot [0]; compacted via scalar copy to
+    route, actual r=t*TOPK+k at slot [0]; TROWSUM-compacted to
     [L, R] INT32 host output. Combine reads it to address
     routed_y_buf[t, k, :] without a host-built origin_map.
   - ``recv_count_out`` is [L, 1] INT32 emitted by dispatch's prefix_sum

From 44bfbff8caa12245fe440f84beac4fdbd123ceb5 Mon Sep 17 00:00:00 2001
From: zhangqi-chen <wshchzhq@gmail.com>
Date: Thu, 28 May 2026 09:59:21 +0800
Subject: [PATCH 2/2] Fix: ep_dispatch_combine UB allocation overruns 192 KB

The stage_out idx_* tiles were assigned to 0x30000 / 0x40000 / 0x41000.
The first starts exactly at the 192 KB UB ceiling (0x30000 = 192 KB), so
it already lies outside UB, and the other two start 64 KB and 68 KB past
it. Alias idx_wide / idx_sum / idx_tmp onto the weight loop's slots
(0x10000 / 0x20000 / 0x21000) and add a pipe_barrier(PIPE_ALL) between
the two loops so the trailing weight TSTORE drains before idx TLOAD
reuses the same UB.
---
 .../l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp   | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/workers/l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp b/examples/workers/l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp
index 1ee637655..8a9e13a8f 100644
--- a/examples/workers/l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp
+++ b/examples/workers/l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp
@@ -484,9 +484,9 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
     TASSIGN(w_wide_tile, 0x10000);
     TASSIGN(w_sum_tile, 0x20000);
     TASSIGN(w_tmp_tile, 0x21000);
-    TASSIGN(idx_wide_tile, 0x30000);
-    TASSIGN(idx_sum_tile, 0x40000);
-    TASSIGN(idx_tmp_tile, 0x41000);
+    TASSIGN(idx_wide_tile, 0x10000);
+    TASSIGN(idx_sum_tile, 0x20000);
+    TASSIGN(idx_tmp_tile, 0x21000);
 
     // Stage out x: per-row 1xD copies.
     for (int e = 0; e < L; ++e) {
@@ -527,6 +527,11 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in
         wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1);
     }
 
+    // Drain the weight loop's last TSTORE before reusing the same UB slots
+    // for idx_*. Without this fence, the idx TLOAD could overwrite UB while
+    // the trailing w TSTORE is still in flight on MTE3.
+    pipe_barrier(PIPE_ALL);
+
     // Stage out idx: same TROWSUM compaction as the weight channel, on the
     // INT32 [R, IDX_PAD] wide window. sum-along-PAD recovers slot [0] because
     // columns [1, IDX_PAD) are zero by design.