From 411912e766d0625750534c7f5bdce2dabffa2b85 Mon Sep 17 00:00:00 2001 From: zhangqi-chen Date: Fri, 22 May 2026 09:58:09 +0800 Subject: [PATCH 1/2] Update: ep_dispatch_combine idx channel uses INT32 TROWSUM compaction The idx stage-out channel previously fell back to a scalar GM copy of column 0 because INT32 TROWSUM hung on a2a3 hardware. pto-isa now supports INT32 TROWSUM, so switch the idx channel to the same TLOAD + TROWSUM + TSTORE compaction already used for the FP32 weight channel, restoring symmetry between the two channels. Refs hw-native-sys/pto-isa#119 --- .../kernels/aiv/dispatch.cpp | 37 +++++++++++-------- .../workers/l3/ep_dispatch_combine/main.py | 2 +- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/examples/workers/l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp b/examples/workers/l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp index 547a759b6..1ee637655 100644 --- a/examples/workers/l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp +++ b/examples/workers/l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp @@ -33,7 +33,8 @@ * recovers column-0 values * because columns [1, W_PAD) * are zero by design); - * recv_idx_out is [L, R] (scalar copy of column 0) + * recv_idx_out is [L, R] (TROWSUM over [L,R,IDX_PAD] + * wide window, INT32) * * Design notes: * - All cross-rank GM writes go through tile primitives (TPUT). No AIV @@ -44,9 +45,8 @@ * - Weight uses TROWSUM along the W_PAD axis to compact the wide window * [L, R, W_PAD] → [L, R] FP32: sum-of-row recovers slot [0] because the * other lanes are zero. One TLOAD + TROWSUM + TSTORE per expert. - * - Idx uses scalar GM copy of column 0 to compact [L, R, IDX_PAD] → - * [L, R] INT32. INT32 TROWSUM exists in pto-isa but hangs on a2a3 in - * this configuration; the L*R = 128 scalar stores are negligible. + * - Idx uses the same TROWSUM compaction along the IDX_PAD axis to compact + * [L, R, IDX_PAD] → [L, R] INT32. One TLOAD + TROWSUM + TSTORE per expert. */ #ifndef __gm__ @@ -527,18 +527,25 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); } - // Stage out idx: scalar copy of column 0 from the wide window. - // - // ⚠ The same TROWSUM compaction used above for the FP32 weight channel - // does NOT work reliably for INT32 on a2a3: pto-isa declares INT32 - // TROWSUM support, but with the same [R, IDX_PAD] / Layout::DN setup - // the kernel hangs on hardware. Until that path is stabilized, fall - // back to a scalar copy here. Volume is small (L*R = 128 INT32 stores) - // so the perf cost is negligible. + // Stage out idx: same TROWSUM compaction as the weight channel, on the + // INT32 [R, IDX_PAD] wide window. sum-along-PAD recovers slot [0] because + // columns [1, IDX_PAD) are zero by design. for (int e = 0; e < L; ++e) { - for (int slot = 0; slot < R; ++slot) { - recv_idx_out[e * R + slot] = recv_idx_local[(e * R + slot) * IDX_PAD]; - } + __gm__ int32_t *idx_win = recv_idx_local + e * R * IDX_PAD; + __gm__ int32_t *idx_out = recv_idx_out + e * R; + IWideG idx_win_g(idx_win); + ISumG idx_out_g(idx_out); + TLOAD(idx_wide_tile, idx_win_g); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_V); + TROWSUM(idx_sum_tile, idx_wide_tile, idx_tmp_tile); + pipe_barrier(PIPE_V); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(idx_out_g, idx_sum_tile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); } pipe_barrier(PIPE_ALL); } diff --git a/examples/workers/l3/ep_dispatch_combine/main.py b/examples/workers/l3/ep_dispatch_combine/main.py index 54f674d25..9f5d5ed25 100644 --- a/examples/workers/l3/ep_dispatch_combine/main.py +++ b/examples/workers/l3/ep_dispatch_combine/main.py @@ -43,7 +43,7 @@ [weight, 0, 0, …, 0]; receiver writes recv_w[loc_e][slot, :W_PAD] and the kernel TROWSUM-compacts to a [L, R] FP32 host output. - Idx uses the same minimum-tile rationale: 1xIDX_PAD=8 INT32 per - route, actual r=t*TOPK+k at slot [0]; compacted via scalar copy to + route, actual r=t*TOPK+k at slot [0]; TROWSUM-compacted to [L, R] INT32 host output. Combine reads it to address routed_y_buf[t, k, :] without a host-built origin_map. - ``recv_count_out`` is [L, 1] INT32 emitted by dispatch's prefix_sum From 44bfbff8caa12245fe440f84beac4fdbd123ceb5 Mon Sep 17 00:00:00 2001 From: zhangqi-chen Date: Thu, 28 May 2026 09:59:21 +0800 Subject: [PATCH 2/2] Fix: ep_dispatch_combine UB allocation overruns 192 KB The stage_out idx_* tiles were assigned to 0x30000 / 0x40000 / 0x41000, the first of which sits exactly at the 192 KB UB ceiling and the latter two overshoot by 64-68 KB. Alias idx_wide / idx_sum / idx_tmp onto the weight loop's slots (0x10000 / 0x20000 / 0x21000) and add a pipe_barrier(PIPE_ALL) between the two loops so the trailing weight TSTORE drains before idx TLOAD reuses the same UB. --- .../l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/workers/l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp b/examples/workers/l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp index 1ee637655..8a9e13a8f 100644 --- a/examples/workers/l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp +++ b/examples/workers/l3/ep_dispatch_combine/kernels/aiv/dispatch.cpp @@ -484,9 +484,9 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in TASSIGN(w_wide_tile, 0x10000); TASSIGN(w_sum_tile, 0x20000); TASSIGN(w_tmp_tile, 0x21000); - TASSIGN(idx_wide_tile, 0x30000); - TASSIGN(idx_sum_tile, 0x40000); - TASSIGN(idx_tmp_tile, 0x41000); + TASSIGN(idx_wide_tile, 0x10000); + TASSIGN(idx_sum_tile, 0x20000); + TASSIGN(idx_tmp_tile, 0x21000); // Stage out x: per-row 1xD copies. for (int e = 0; e < L; ++e) { @@ -527,6 +527,11 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); } + // Drain the weight loop's last TSTORE before reusing the same UB slots + // for idx_*. Without this fence, the idx TLOAD could overwrite UB while + // the trailing w TSTORE is still in flight on MTE3. + pipe_barrier(PIPE_ALL); + // Stage out idx: same TROWSUM compaction as the weight channel, on the // INT32 [R, IDX_PAD] wide window. sum-along-PAD recovers slot [0] because // columns [1, IDX_PAD) are zero by design.