From 7d0764d924c5b758ab16ab386d29f5bda76373af Mon Sep 17 00:00:00 2001
From: ChaoZheng109 <zhengchao47@huawei.com>
Date: Wed, 13 May 2026 10:20:00 +0800
Subject: [PATCH 1/2] =?UTF-8?q?Fix:=20a5=20AICore=20SIMT=20launch=20?=
 =?UTF-8?q?=E2=80=94=20set=20localMemorySize=20+=20inject=20SIMT=20TLVs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three coupled changes that together unblock the
rtKernelLaunchWithHandleV2 path on a5 and add CI coverage for it:

1. cfg.localMemorySize was left at 0 in launch_aicore_kernel, so runtime
   allocated no AICore local memory and SIMT execution failed. Add a
   pair of constants in platform_config.h —
   PLATFORM_AICORE_SHARE_MEM_SIZE (8 KB) and
   PLATFORM_AICORE_LOCAL_MEMORY_SIZE (216 KB) — and pass the latter
   through rtTaskCfgInfo_t::localMemorySize. The pair sums to exactly
   RT_SIMT_REMAIN_UB_SIZE (224 KB = 256 KB UB − 32 KB dcache); runtime's
   check is strict > so equality is accepted. Section 2 below consumes
   PLATFORM_AICORE_SHARE_MEM_SIZE as the advertised TLV value.

2. Runtime reads two TLV records (COMPILER_ALLOC_UB_SIZE / type=7 and
   AIV_TYPE_FLAG / type=12) from the kernel ELF's \`.ascend.meta.<func>\`
   section to populate Kernel::shareMemSize_ and Kernel::kernelVfType_.
   bisheng only emits these when it can statically infer SIMT use; our
   SU-dispatcher entry can't be tagged automatically. Inject a
   hand-written meta record for the AIV variant (ub_size=8 KB via
   PLATFORM_AICORE_SHARE_MEM_SIZE, aiv_type=SIMD_SIMT_MIX_VF — the
   dispatcher routes task .o files containing both SIMD and SIMT vector
   kernels, so MIX_VF avoids runtime's per-type restrictions) and
   disable bisheng's auto-emission with
   \`-mllvm -cce-dyn-kernel-stack-size=false\` so the runtime parser,
   which keys kernelInfoMap by section name and overwrites instead of
   merging, doesn't shadow our values with NO_VF / shareMemSize=0.

3. Add tests/st/a5/tensormap_and_ringbuffer/simt_basic/ — a minimal
   element-scatter ST that exercises the SIMT launch path end-to-end.
   The kernel is distilled from the ptoas-generated mscatter reference
   and keeps the pieces real hardware actually requires:

   - per-data 3-tile alias pattern (TLOAD binds one tile, MSCATTER
     reads from another aliased to the same UB address; the single-tile
     form silently dropped the scatter on a5 hw)
   - set_mask_norm / set_vector_mask SIMT mask init at entry
   - MTE2 → V flag/wait before MSCATTER (the ptoas default MTE2 → MTE3
     also silently dropped the scatter on hw)

   Indices use torch.arange() so the golden reduces to \`out == src\`,
   keeping the test a strict bring-up signal: a regression in any of
   the launch-path layers fixed above (TLV injection, localMemorySize
   budget, sync) flips it red while keeping false-positives from the
   scatter algorithm itself out of the way. A follow-up case with
   torch.randperm indices can be added once the ptoas dispatcher's
   per-element vs row-mode behaviour on hw is confirmed.

   The orchestration wraps rt_submit_aiv_task in PTO2_SCOPE() so the
   submit flushes through the task ringbuffer before the entry returns.
---
 .../platform/include/common/platform_config.h |  12 ++
 src/a5/platform/onboard/aicore/CMakeLists.txt |   1 +
 src/a5/platform/onboard/aicore/kernel.cpp     |  17 +++
 src/a5/platform/onboard/aicore/simt_meta.h    |  80 ++++++++++++
 .../platform/onboard/host/device_runner.cpp   |   1 +
 .../kernels/aiv/kernel_simt_scatter.cpp       | 115 ++++++++++++++++++
 .../kernels/orchestration/simt_basic_orch.cpp |  53 ++++++++
 .../simt_basic/test_simt_basic.py             |  82 +++++++++++++
 8 files changed, 361 insertions(+)
 create mode 100644 src/a5/platform/onboard/aicore/simt_meta.h
 create mode 100644 tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/aiv/kernel_simt_scatter.cpp
 create mode 100644 tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/orchestration/simt_basic_orch.cpp
 create mode 100644 tests/st/a5/tensormap_and_ringbuffer/simt_basic/test_simt_basic.py
diff --git a/src/a5/platform/include/common/platform_config.h b/src/a5/platform/include/common/platform_config.h
index 9e9a4faab..899823428 100644
--- a/src/a5/platform/include/common/platform_config.h
+++ b/src/a5/platform/include/common/platform_config.h
@@ -77,6 +77,18 @@ constexpr int PLATFORM_MAX_AIV_PER_THREAD = PLATFORM_MAX_BLOCKDIM * PLATFORM_AIV
 
 constexpr int PLATFORM_MAX_CORES_PER_THREAD = PLATFORM_MAX_AIC_PER_THREAD + PLATFORM_MAX_AIV_PER_THREAD;  // 108
 
+// AICore UB reservation for the legacy SIMT launch path.
+//
+// rtKernelLaunchWithHandleV2 + rtRegisterAllKernel checks
+// kernel->ShareMemSize_() + cfg.localMemorySize against
+// RT_SIMT_REMAIN_UB_SIZE (224 KB = 256 KB UB − 32 KB dcache). The kernel
+// advertises PLATFORM_AICORE_SHARE_MEM_SIZE via the SIMT TLV record
+// injected in onboard/aicore/kernel.cpp; the host passes
+// PLATFORM_AICORE_LOCAL_MEMORY_SIZE through cfg.localMemorySize. They sum
+// to exactly 224 KB; runtime's check is strict >, so equality is accepted.
+constexpr uint32_t PLATFORM_AICORE_SHARE_MEM_SIZE = 8 * 1024;       // 8 KB
+constexpr uint32_t PLATFORM_AICORE_LOCAL_MEMORY_SIZE = 216 * 1024;  // 216 KB
+
 // =============================================================================
 // Performance Profiling Configuration
 // =============================================================================
diff --git a/src/a5/platform/onboard/aicore/CMakeLists.txt b/src/a5/platform/onboard/aicore/CMakeLists.txt
index 770b26530..e2abae61d 100644
--- a/src/a5/platform/onboard/aicore/CMakeLists.txt
+++ b/src/a5/platform/onboard/aicore/CMakeLists.txt
@@ -57,6 +57,7 @@ set(AICORE_FLAGS
     -mllvm -cce-aicore-record-overflow=false \
     -mllvm -cce-aicore-addr-transform \
     -mllvm -cce-aicore-dcci-insert-for-scalar=false \
+    -mllvm -cce-dyn-kernel-stack-size=false \
     ${CMAKE_CUSTOM_INCLUDE_DIR_FLAGS}"
 )
 separate_arguments(AICORE_FLAGS)
diff --git a/src/a5/platform/onboard/aicore/kernel.cpp b/src/a5/platform/onboard/aicore/kernel.cpp
index 6789b66b0..88a2ca2c3 100644
--- a/src/a5/platform/onboard/aicore/kernel.cpp
+++ b/src/a5/platform/onboard/aicore/kernel.cpp
@@ -18,6 +18,7 @@
 #include "common/l2_perf_profiling.h"
 #include "common/platform_config.h"
 #include "common/pmu_profiling.h"
+#include "simt_meta.h"
 
 class Runtime;
 
@@ -67,6 +68,22 @@ __attribute__((weak)) __aicore__ uint64_t get_aicore_pmu_reg_base() { return s_a
 
 extern __aicore__ void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type);
 
+// Derive the section name from the same KERNEL_ENTRY macro that mangles the
+// entry symbol, so the meta section name cannot drift if the suffix scheme
+// changes. STRINGIFY needs two levels to expand the macro before stringizing.
+#define SIMPLER_STRINGIFY_(x) #x
+#define SIMPLER_STRINGIFY(x) SIMPLER_STRINGIFY_(x)
+#define KERNEL_META_SECTION(func) ".ascend.meta." SIMPLER_STRINGIFY(KERNEL_ENTRY(func))
+
+#ifdef __DAV_VEC__
+static const FuncLevelMeta func_simt_section __attribute__((used, section(KERNEL_META_SECTION(aicore_kernel)))) = {
+    {{static_cast<unsigned short>(FuncMetaType::F_TYPE_COMPILER_ALLOC_UB_SIZE), sizeof(unsigned int)},
+     PLATFORM_AICORE_SHARE_MEM_SIZE},
+    {{static_cast<unsigned short>(FuncMetaType::F_TYPE_AIV_TYPE_FLAG), sizeof(unsigned int)},
+     static_cast<unsigned int>(AIVType::AIV_TYPE_SIMD_SIMT_MIX_VF)},
+};
+#endif
+
 /**
  * Kernel entry point with control loop
  *
diff --git a/src/a5/platform/onboard/aicore/simt_meta.h b/src/a5/platform/onboard/aicore/simt_meta.h
new file mode 100644
index 000000000..72d5b6d13
--- /dev/null
+++ b/src/a5/platform/onboard/aicore/simt_meta.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file simt_meta.h
+ * @brief SIMT metadata TLV records for AICore kernel ELF (onboard / a5)
+ *
+ * The legacy launch path (rtKernelLaunchWithHandleV2 + rtRegisterAllKernel)
+ * requires the kernel ELF to carry two TLV records that runtime reads at
+ * register time:
+ *   - F_TYPE_COMPILER_ALLOC_UB_SIZE (7)  -> Kernel::shareMemSize_
+ *   - F_TYPE_AIV_TYPE_FLAG          (12) -> Kernel::kernelVfType_
+ * bisheng emits these only when it can statically infer the kernel uses
+ * SIMT intrinsics. Our entry is an SU dispatcher (vector ops live in task
+ * .o files invoked through aicore_execute), so the compiler cannot tag it.
+ *
+ * kernel.cpp's CMakeLists.txt pairs the hand-written record with
+ * `-mllvm -cce-dyn-kernel-stack-size=false`, which stops bisheng from
+ * auto-emitting a sibling `.ascend.meta.<funcname>` section. Without that
+ * flag, runtime's parser (kernelInfoMap keyed by section name) would
+ * overwrite our values with bisheng's NO_VF / shareMemSize=0 defaults.
+ *
+ * TLV type IDs mirror RT_FUNCTION_TYPE_COMPILER_ALLOC_UB_SIZE (7) and
+ * RT_FUNCTION_TYPE_AIV_TYPE_FLAG (12) in CANN's runtime/runtime/elf_base.h.
+ * That header is host-side (extern "C", part of the runtime API) so we
+ * re-declare the two values we need rather than pull runtime headers into
+ * an AICore device-side TU.
+ */
+
+#ifndef PLATFORM_A5_AICORE_SIMT_META_H_
+#define PLATFORM_A5_AICORE_SIMT_META_H_
+
+// Underlying type matches TlvHeader::type so the cast at the TLV write site is
+// width-preserving.
+enum class FuncMetaType : unsigned short {
+    F_TYPE_COMPILER_ALLOC_UB_SIZE = 7,
+    F_TYPE_AIV_TYPE_FLAG = 12,
+};
+
+// AIVType values are not exposed in any CANN C/C++ header. The canonical
+// source is CANN's compiler-side Python script
+// (python/site-packages/tbe/tikcpp/ascendc_identify_meta_section_info.py),
+// which is what bisheng / asc_op_compiler consult when classifying kernels.
+// Underlying type matches FuncMetaAivTypeFlag::aiv_type.
+enum class AIVType : unsigned int {
+    AIV_TYPE_NO_VF = 1,
+    AIV_TYPE_SIMD_VF_ONLY = 2,
+    AIV_TYPE_SIMT_VF_ONLY = 3,
+    AIV_TYPE_SIMD_SIMT_MIX_VF = 4,
+};
+
+struct TlvHeader {
+    unsigned short type;
+    unsigned short len;
+};
+
+struct FuncMetaCompilerUbSize {
+    TlvHeader head;
+    unsigned int ub_size;
+};
+
+struct FuncMetaAivTypeFlag {
+    TlvHeader head;
+    unsigned int aiv_type;
+};
+
+struct FuncLevelMeta {
+    FuncMetaCompilerUbSize ub_size_meta;
+    FuncMetaAivTypeFlag aiv_type_meta;
+};
+
+#endif  // PLATFORM_A5_AICORE_SIMT_META_H_
diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp
index fd1e38697..9d49ab4e0 100644
--- a/src/a5/platform/onboard/host/device_runner.cpp
+++ b/src/a5/platform/onboard/host/device_runner.cpp
@@ -948,6 +948,7 @@ int DeviceRunner::launch_aicore_kernel(rtStream_t stream, KernelArgs *k_args) {
 
     rtTaskCfgInfo_t cfg = {};
     cfg.schemMode = RT_SCHEM_MODE_BATCH;
+    cfg.localMemorySize = PLATFORM_AICORE_LOCAL_MEMORY_SIZE;
 
     rc = rtKernelLaunchWithHandleV2(bin_handle, 0, block_dim_, &rt_args, nullptr, stream, &cfg);
     if (rc != RT_ERROR_NONE) {
diff --git a/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/aiv/kernel_simt_scatter.cpp b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/aiv/kernel_simt_scatter.cpp
new file mode 100644
index 000000000..c259716bd
--- /dev/null
+++ b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/aiv/kernel_simt_scatter.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// Minimal SIMT element-scatter kernel (AIV).
+//
+// Distilled from the ptoas-generated mscatter reference. Drops cosmetic
+// noise (v1..v30 names, dummy v4/v5/v6/v7 constants, explicit
+// Layout::ND, the verbose Tile template tail, GM-offset arithmetic that
+// always reduces to zero, the ptoas_auto_sync_tail wrapper).
+//
+// Kept on purpose:
+//   - per-data 3-tile alias pattern (TLOAD binds one tile, MSCATTER
+//     reads from another aliased to the same UB address; a single-tile
+//     form has reproduced golden mismatches on hw)
+//   - `set_mask_norm` / `set_vector_mask` SIMT mask init
+//   - `MTE2 → V` sync before MSCATTER (the ptoas default `MTE2 → MTE3`
+//     silently drops the scatter on a5 hw)
+//   - `__DAV_VEC__` guard so the AIC variant compiles to a no-op
+//
+// Operation: out[idx[r, c]] = src[r, c] for an 8x32 source and 256-slot
+// destination.
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+#include "pipe_sync.h"
+
+using namespace pto;
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+static constexpr int TILE_ROWS = 8;
+static constexpr int TILE_COLS = 32;
+static constexpr int DST_LEN = TILE_ROWS * TILE_COLS;  // 256
+static constexpr int SRC_TILE_BYTES = TILE_ROWS * TILE_COLS * sizeof(float);
+
+static __aicore__ void simt_scatter_impl(__gm__ float *src, __gm__ int32_t *idx, __gm__ float *out) {
+    using SrcTile = Tile<TileType::Vec, float, TILE_ROWS, TILE_COLS, BLayout::RowMajor, -1, -1>;
+    using IdxTile = Tile<TileType::Vec, int32_t, TILE_ROWS, TILE_COLS, BLayout::RowMajor, -1, -1>;
+
+    using TileShape = Shape<1, 1, 1, TILE_ROWS, TILE_COLS>;
+    using TileStride = pto::Stride<DST_LEN, DST_LEN, DST_LEN, TILE_COLS, 1>;
+    using SrcGT = GlobalTensor<float, TileShape, TileStride>;
+    using IdxGT = GlobalTensor<int32_t, TileShape, TileStride>;
+
+    using DstShape = Shape<1, 1, 1, 1, DST_LEN>;
+    using DstStride = pto::Stride<DST_LEN, DST_LEN, DST_LEN, DST_LEN, 1>;
+    using DstGT = GlobalTensor<float, DstShape, DstStride>;
+
+    // Per-data 3-tile alias pattern:
+    //   *_loader  — bound directly at the UB offset; consumed by TLOAD
+    //   *_scatter — bound via the loader's data() pointer; consumed by MSCATTER
+    //   *_anchor  — bound to the same offset literal; preserves the
+    //               original ptoas binding sequence
+    constexpr int SRC_UB = 0;
+    constexpr int IDX_UB = SRC_TILE_BYTES;
+
+    SrcTile src_loader(TILE_ROWS, TILE_COLS);
+    TASSIGN(src_loader, SRC_UB);
+    SrcTile src_scatter(TILE_ROWS, TILE_COLS);
+    TASSIGN(src_scatter, reinterpret_cast<uint64_t>(src_loader.data()));
+    SrcTile src_anchor(TILE_ROWS, TILE_COLS);
+    TASSIGN(src_anchor, static_cast<uint64_t>(SRC_UB));
+
+    IdxTile idx_loader(TILE_ROWS, TILE_COLS);
+    TASSIGN(idx_loader, IDX_UB);
+    IdxTile idx_scatter(TILE_ROWS, TILE_COLS);
+    TASSIGN(idx_scatter, reinterpret_cast<uint64_t>(idx_loader.data()));
+    IdxTile idx_anchor(TILE_ROWS, TILE_COLS);
+    TASSIGN(idx_anchor, static_cast<uint64_t>(IDX_UB));
+
+    SrcGT srcGlobal(src);
+    IdxGT idxGlobal(idx);
+    DstGT dstGlobal(out);
+
+    TLOAD(src_anchor, srcGlobal);
+    TLOAD(idx_anchor, idxGlobal);
+
+    // MTE2 → V before MSCATTER (critical: MTE2 → MTE3 silently drops the
+    // scatter on a5 hw).
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+    MSCATTER(dstGlobal, src_scatter, idx_scatter);
+
+    pipe_sync();
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *src_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ float *src = reinterpret_cast<__gm__ float *>(src_tensor->buffer.addr) + src_tensor->start_offset;
+
+    __gm__ Tensor *idx_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ int32_t *idx = reinterpret_cast<__gm__ int32_t *>(idx_tensor->buffer.addr) + idx_tensor->start_offset;
+
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    simt_scatter_impl(src, idx, out);
+}
diff --git a/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/orchestration/simt_basic_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/orchestration/simt_basic_orch.cpp
new file mode 100644
index 000000000..ad905984e
--- /dev/null
+++ b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/orchestration/simt_basic_orch.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * SIMT basic orchestration: submit a single AIV SIMT scatter task.
+ *
+ * Args layout: [src, indices, out]
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_SIMT_SCATTER 0
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig
+aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
+    (void)orch_args;  // NOLINT(readability/casting)
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 3,
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) {
+    Tensor src = from_tensor_arg(orch_args.tensor(0));
+    Tensor indices = from_tensor_arg(orch_args.tensor(1));
+    Tensor out = from_tensor_arg(orch_args.tensor(2));
+
+    // PTO2_SCOPE ensures rt_submit_aiv_task flushes through the task
+    // ringbuffer before the entry returns. No set_core_num — let the
+    // runtime use the config's block_dim, matching the ptoas-validated
+    // mscatter reference.
+    PTO2_SCOPE() {
+        Arg args;
+        args.add_input(src);
+        args.add_input(indices);
+        args.add_output(out);
+        rt_submit_aiv_task(FUNC_SIMT_SCATTER, args);
+    }
+}
+
+}  // extern "C"
diff --git a/tests/st/a5/tensormap_and_ringbuffer/simt_basic/test_simt_basic.py b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/test_simt_basic.py
new file mode 100644
index 000000000..d223b2c0a
--- /dev/null
+++ b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/test_simt_basic.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SIMT basic element-scatter: minimal AIV scatter kernel that exercises the SIMT launch path.
+
+Config (block_dim=24, aicpu_thread_num=4, sequential identity indices)
+mirrors the ptoas-validated mscatter reference at
+mscatter_fp32_8x32_seq_20260513_140539/test_mscatter.py. Identity
+indices keep the golden trivially src-equals-out so a failure here
+points at the SIMT launch path itself (TLV injection, localMemorySize
+budget, sync) rather than at the scatter index semantics.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+TILE_ROWS = 8
+TILE_COLS = 32
+SRC_ELEMS = TILE_ROWS * TILE_COLS  # 256
+DST_LEN = SRC_ELEMS  # 256
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSimtBasic(SceneTestCase):
+    RTOL = 1e-5
+    ATOL = 1e-5
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/simt_basic_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "name": "SIMT_SCATTER",
+                "source": "kernels/aiv/kernel_simt_scatter.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {},
+        }
+    ]
+
+    def generate_args(self, params):
+        torch.manual_seed(0)
+        src = torch.randn(SRC_ELEMS, dtype=torch.float32)
+        # Identity indices (0..DST_LEN-1) — matches the ptoas reference and
+        # makes the golden trivially `out == src`. Switch to torch.randperm
+        # later once the baseline launch path is confirmed green.
+        indices = torch.arange(DST_LEN, dtype=torch.int32)
+        out = torch.zeros(DST_LEN, dtype=torch.float32)
+        return TaskArgsBuilder(
+            Tensor("src", src),
+            Tensor("indices", indices),
+            Tensor("out", out),
+        )
+
+    def compute_golden(self, args, params):
+        args.out.zero_()
+        args.out[args.indices.to(torch.int64)] = args.src
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)

From 86b633cc88dd64fd3b50457ac262393048e42ec3 Mon Sep 17 00:00:00 2001
From: ChaoZheng109 <zhengchao47@huawei.com>
Date: Thu, 21 May 2026 20:04:58 +0800
Subject: [PATCH 2/2] tmp

---
 docs/simt_basic_blocking_report.md            | 179 ++++++++++++++++++
 src/a5/platform/onboard/aicore/CMakeLists.txt |  18 +-
 src/a5/platform/onboard/aicore/kernel.cpp     |  21 ++
 src/a5/platform/onboard/aicore/simt_meta.h    |  58 +++++-
 .../kernels/aiv/kernel_simt_scatter.cpp       | 102 +++++-----
 5 files changed, 306 insertions(+), 72 deletions(-)
 create mode 100644 docs/simt_basic_blocking_report.md

diff --git a/docs/simt_basic_blocking_report.md b/docs/simt_basic_blocking_report.md
new file mode 100644
index 000000000..9e8245edc
--- /dev/null
+++ b/docs/simt_basic_blocking_report.md
@@ -0,0 +1,179 @@
+# a5 SIMT 指令在 simpler dispatcher 路径下不可用 — 现状与决策建议
+
+> 写作日期：2026-05-21
+> 涉及分支：`fix-a5-aicore-simt-tlv` (HEAD `7d0764d9`)
+> 相关用例：`tests/st/a5/tensormap_and_ringbuffer/simt_basic`
+> 相关指令：`MSCATTER`（A5 上的 SIMT 指令，pto-isa `include/pto/npu/a5/MScatter.hpp` 提供）
+
+---
+
+## 1. 一句话现状
+
+simpler 的运行时调度架构（**SU dispatcher：AICore 上跑 polling loop，AICPU 通过寄存器下发 task，AICore 跳到 task 函数指针执行**）**不支持调用 pto-isa 当前实现的 SIMT 指令**（A5 上是 `MSCATTER` 和 `MGATHER`）。
+
+这是 **pto-isa 已知的限制**，他们的 doc 自己写明，并承诺 "We will resolve this issue as soon as possible"。我们不应该绕过这个限制，而是**等 pto-isa team 修指令**。
+
+---
+
+## 2. 复现与对照
+
+| 条件 | 用例 | 结果 |
+|---|---|---|
+| simpler simt_basic ST，`MSCATTER` 单条指令 | simpler + a5 onboard | **AIV trap, errcode 331 = "VEC VF instruction param invalid"**，chip fault |
+| simpler simt_basic ST，**注释掉 `MSCATTER` 这一条**（其他代码保留） | simpler + a5 onboard | **完全干净**，3 cluster / 9 核全部 handshake 成功，task 1/1 完成，零硬件 error |
+| pto-isa 自己的 a5 MSCATTER ST，同一条 `MSCATTER<Coalesce::Elem, ScatterAtomicOp::None, ScatterOOB::Skip>` | pto-isa native ST + 同一台 a5 硬件 | **50/50 全部通过** |
+
+→ **MSCATTER 在硬件上能跑**，但**在 simpler dispatcher 这条调度路径下跑不通**。
+
+---
+
+## 3. 根因（已实验论证）
+
+### 3.1 AIV 硬件 SPMD/SIMT 模式互斥
+
+A5 的 AIV 核执行模式由 kernel ELF 的 `AIV_TYPE_FLAG` TLV 决定，运行时通过 `rtKernelLaunch*` 把这个值传给 driver/firmware 配置硬件：
+
+| AIV_TYPE_FLAG | 含义 | 我们在这条路径上的行为 |
+|---|---|---|
+| `1 = NO_VF` | 纯 SPMD | （未实测） |
+| `2 = SIMD_VF_ONLY` | SIMD（也是非 SIMT） | （未实测） |
+| `3 = SIMT_VF_ONLY` | 纯 SIMT | **AICore polling loop 第一条 SPMD 指令就 trap**，task 完全无法下发 |
+| `4 = SIMD_SIMT_MIX_VF` | 混合 | **SPMD polling loop 跑通**，task 偶尔能完成 1/1，但**最终 SIMT 部分 trap** errcode 331 |
+
+> 用 `AIV_TYPE=3` (SIMT_VF_ONLY) 跑：实验编号 `2100222`，trap 在 polling loop 头部
+> 用 `AIV_TYPE=4` (MIX_VF) 跑：实验编号 `1884844`，task 完成但 deinit 时 trap
+
+只有 **`AIV_TYPE=4 (MIX_VF)`** 能让 SPMD polling loop 和 SIMT task 都执行，但**两边共享部分 vec 硬件状态**，SPMD 指令会覆盖 SIMT context（vec mask、warp/lane 配置等），导致 SIMT task 的 vec VF 指令收到非法参数。
+
+### 3.2 这正是 pto-isa 自己 doc 描述的限制
+
+pto-isa 自家文档 `tests/npu/a5/src/st/testcase/mgather/MGATHER.md` (line 338-349)：
+
+> `MGATHER` (like every SIMT kernel in PTO and CANN) uses `cce::async_invoke<...>(cce::dim3{WARP_SIZE, kLaunchWarps}, ...)` internally to fan a per-warp/per-lane workload out across up to `32 × 32 = 1024` threads. `cce::async_invoke` consumes hardware/runtime state — TID registers, warp/lane configuration, vector-pipe scheduling — that the launch path has to install before the kernel function is entered. **The standard CANN launch (`rtKernelLaunch`, used by the `<<<1, nullptr, stream>>>` syntax) installs that state correctly.**
+>
+> Dispatch kernels as a direct C function-pointer call ... **This is fine for SPMD ops (TLOAD, TSTORE, TADD …) but skips the SIMT-context init step, so the first `cce::async_invoke` inside `MGATHER` has no warp scheduler to dispatch into and hangs.**
+>
+> ... **a regular AIV function cannot self-promote itself into SIMT context**. **We will resolve this issue as soon as possible.**
+
+pto-isa 自己罗列三种修法（[MGATHER.md:361-363](../pto-isa/tests/npu/a5/src/st/testcase/mgather/MGATHER.md#L361)），**全部在 pto-isa dispatcher 侧改**，没有一种是 simpler 用户侧能修的。
+
+### 3.3 SIMT kernel 启动需要哪些初始化项 — TLV 能覆盖到哪一层
+
+要让 `MSCATTER` / `MGATHER` 这类 SIMT 指令能跑，硬件 + runtime 实际上需要装入两类完全不同性质的状态：
+
+#### A. 静态元数据（kernel ELF 里描述、由 TLV 承载） — 这部分我们能注入
+
+runtime 在 `rtRegisterAllKernel` 阶段读取 `.ascend.meta.<func>` 段里的 TLV，把这些值固化进 `Kernel::kernelInfoMap`，发 SQE 的时候作为 launch 参数：
+
+| TLV type | 字段 | 作用 | 我们的注入值 |
+|---|---|---|---|
+| 7 `F_TYPE_COMPILER_ALLOC_UB_SIZE` | `Kernel::shareMemSize_` | 编译器静态分配的 UB 共享内存大小 | `PLATFORM_AICORE_SHARE_MEM_SIZE` (0x2000) |
+| 8 `F_TYPE_SU_STACK_SIZE` | `Kernel::minStackSize_` 阈值 | 每核 SU(scalar unit) 栈/状态阈值；过小则 runtime 拒绝 launch | `0xE0`（跟 pto-isa 字节对齐） |
+| 9 `F_TYPE_SIMT_WARP_STACK_SIZE` | per-warp 栈大小 | SIMT scheduler 给每个 warp 切分的栈 | 0（默认） |
+| 10 `F_TYPE_SIMT_DVG_WARP_STACK_SIZE` | per-divergent-warp 栈大小 | 同上，divergent 路径 | 0（默认） |
+| 12 `F_TYPE_AIV_TYPE_FLAG` | `Kernel::kernelVfType_` | 选 hw 工作模式：NO_VF=1/SIMD=2/SIMT=3/MIX=4 | 4 (MIX_VF) |
+
+这一层是**简单的静态声明**，runtime 看到这些值就知道"该按什么模式 launch、该预留多少栈"。我们已经把 5 条 TLV 跟 pto-isa 字节对齐，runtime 解析到的值跟 pto-isa SIMT ST 完全一致。
+
+#### B. 动态硬件状态（每次进入 SIMT 前必须现场装入硬件寄存器） — 这部分我们装不进去
+
+pto-isa MGATHER.md 列出的"the launch path has to install before the kernel function is entered"具体指三类硬件寄存器：
+
+| 状态 | 内容 | 装入时机 | 装入路径 |
+|---|---|---|---|
+| **TID 寄存器** | 当前 thread 在 grid/block 中的坐标 (`block_idx`, `subblock_id`, `lane_id`) | kernel 入口前 | firmware 解析 SQE 中的 `cce::dim3{WARP_SIZE, kLaunchWarps}` 后写入 |
+| **warp/lane 配置** | `vfsimt_info`：lane 数、warp 数、active mask、SIMT 模式开关 | kernel 入口前 | firmware 根据 `kernelVfType_` 装入；SPMD 指令执行后会写脏 |
+| **vector-pipe scheduling state** | warp scheduler 内部状态、divergent stack、active warp 队列 | kernel 入口前 | firmware 在 launch 完成的最后阶段触发 SIMT scheduler reset |
+
+**关键约束**：firmware 只会在 **`rtKernelLaunch` 那一次**走完整 launch path 把 A+B 全部装进硬件。一旦 polling loop 起来开始 SPMD 工作，硬件已经处于 "kernel running" 状态，没有任何下游路径让 firmware 再触发一次 B 的重装。
+
+#### C. 为什么"手动注入 TLV"无法把硬件初始化好
+
+| 缺陷 | 解释 |
+|---|---|
+| **TLV 只是声明，不是动作** | TLV 改的是 runtime 内存里的 kernel meta 表；真正写硬件寄存器的是 firmware 解析 SQE 的那段代码。simpler 改 TLV 能改的只是 firmware **第一次 launch** 时给 polling kernel 装的初始状态，无法触发 task 切换时的二次装入 |
+| **`AIV_TYPE_FLAG` 是单值、整段 kernel 共享** | 我们的入口同时含 SPMD（polling loop）+ SIMT（task）。`= 3` (SIMT_VF_ONLY) → polling loop 头一条 SPMD 立即 trap；`= 4` (MIX_VF) → polling loop 跑得过，但 SPMD 指令会污染 vec mask / warp 配置，等 fn pointer 跳进 SIMT task 时硬件状态已经被覆盖，VF 指令收到非法参数 (errcode 331)；`= 1/2` → SIMT scheduler 根本没启用 |
+| **device 侧没有"重装 SIMT context"的 intrinsic** | bisheng 公开的 SIMT 相关 builtin 是 `__builtin_cce_store_vfsimt_info` (写 `vfsimt_info` 一个寄存器)、`set_mask_norm` / `set_vector_mask` (写 vec mask 部分位)。**它们都不能从 SPMD 上下文里重新装载 warp scheduler 状态**。我们试着在 user kernel 头部连续调这几个 builtin：errcode 从 331 变成 0，但 trap 形态变化（PC 跳到 runtime library 区），task 仍然完不成 — 印证 pto-isa doc 那句 "a regular AIV function cannot self-promote itself into SIMT context" |
+| **TLV 不传新约定 = 不存在新约定** | 即便我们想绕过，runtime 仓里的 SQE 打包代码（`Kernel::Launch` → `rtPackedKernelLaunch`）目前只读 `kernelVfType_` 一个 bit 决定走哪条 launch path，没有任何 TLV 字段可以让我们告诉 runtime "这个 kernel 中途要切 SIMT context，请在 fn pointer 调用前重发一次 SIMT init 序列" |
+
+→ 一句话：**TLV 装的是"该开启什么模式"的静态描述，硬件需要的"实际把 SIMT scheduler 现场装起来"那一步只能由 pto-isa 在指令实现内部、或 firmware 在 launch path 上提供**。simpler 即使把 TLV 写到完美也只能解决 A 这一层的需求，B 的缺口是不可达的。
+
+### 3.4 我们已经穷尽 simpler 侧能做的所有事情
+
+| 尝试方向 | 结论 |
+|---|---|
+| 补全 ELF TLV (type 7/8/9/10/12) 跟 pto-isa kernel ELF byte 级对齐 | 完成 ✓，trap 依旧 |
+| TLV 字段值跟 pto-isa 比对 | 唯一差异是 `AIV_TYPE_FLAG` (我们 4 vs pto-isa 3)；两个值都试了都不行 |
+| AIC 编译不带 `-cce-dyn-kernel-stack-size=false`（让 bisheng 给 AIC 自动生成默认 meta） | 完成 ✓ |
+| user task kernel 用 pto-isa runElem2D 风格（Shape/Stride/Tile valid extents/sync 全对齐） | 完成 ✓ |
+| 显式 `Coalesce::Elem` + `ScatterOOB::Skip` 模板 | 完成 ✓ |
+| 在 user kernel 加 `__builtin_cce_store_vfsimt_info` 手工初始化 SIMT lane/warp 配置 | 试过，errcode 从 331 → 0，但 trap 形态变化、task 仍不通 |
+| 在 user kernel 加 `__global__ __aicore__` 让 bisheng 生成 SIMT prelude | 试过，无变化（bisheng 的 prelude 假设 firmware 已经装好 B，不会自己重装） |
+| 加 `set_mask_norm` / `set_vector_mask` | 试过，反而把 trap 推到 runtime library 区 |
+
+→ 所有尝试都印证 3.3 的结论：**A 层（TLV）已经完美对齐，B 层（硬件状态实时重装）无解**。pto-isa 自家 doc 也已经写明 **"a regular AIV function cannot self-promote itself into SIMT context"**。
+
+---
+
+## 4. 决策建议
+
+### 4.1 应该做的
+
+1. **等 pto-isa team 修这个 dispatcher 限制**，他们的 doc 已经承诺 "We will resolve this issue as soon as possible"。
+2. simpler 这边把用 SIMT 指令的 ST 用例（`simt_basic` 等）**标 skip 或不运行**，不阻塞主线 CI。在 doc 里写明 "blocked on pto-isa dispatcher gap"。
+3. 保留这次 PR 范围内已经验证正确的改动：
+   - kernel.cpp 补全 5 条 SIMT TLV（type 7/8/9/10/12）
+   - AIC 变体不手工注入 TLV
+   - kernel.cpp CMakeLists AIV 才带 `-cce-dyn-kernel-stack-size=false`
+   - simt_meta.h 新增 `SU_STACK_SIZE` / `SIMT_WARP_STACK_SIZE` / `SIMT_DVG_WARP_STACK_SIZE` 三个 enum + 常量
+
+   这些独立于 SIMT 指令本身能不能跑，都是该有的正确修复。
+
+### 4.2 不应该做的
+
+1. **不应该在 simpler kernel 里手工注入 SIMT 初始化代码** （`__builtin_cce_store_vfsimt_info`、`set_mask_norm` 等）。理由：
+   - 即便芯片将来提供完整的 device-side SIMT 初始化能力，这个能力**也应该由 pto-isa 在 SIMT 指令（MSCATTER / MGATHER）内部封装调用**，而不是要求每个用户 kernel 显式加初始化
+   - simpler 是 pto-isa 指令的下游消费者，**不应该承担 ISA 指令实现层应该处理的硬件状态管理**
+   - 手工注入的初始化代码会随着 bisheng / 硬件演进而失效，维护成本高
+
+2. **不应该改 simpler 的 dispatch 架构**（"每个 task 都走 host rtKernelLaunch"），原因：
+   - simpler 整套架构基于"一次 launch + N 次 device 端 dispatch"的性能模型，单 task launch 性能损失太大
+   - host 也无法预知 orchestrator 会下发什么类型的 task，无法分流
+   - pto-isa team 自己的修法也没要求 simpler 改 dispatch 模型
+
+### 4.3 是否需要给 runtime team 提 issue
+
+**目前不需要。**
+
+给 runtime team 提 issue 的**前提是 "通过 TLV 能解决问题，只是缺接口/规范"**。但当前事实是：
+
+- TLV 字段我们已经跟 pto-isa kernel ELF 字节级对齐
+- 仍然 trap
+
+→ 说明 **TLV 已经不是瓶颈**：如 [§3.3](#33-simt-kernel-启动需要哪些初始化项--tlv-能覆盖到哪一层) 所述，TLV 只覆盖"该开启什么模式"的静态描述（A 层），SIMT scheduler 需要的硬件状态实时重装（B 层）没有任何 TLV 字段能表达，必须由 firmware 在 launch path 上重发或由 pto-isa 在指令实现内部封装。这是 ISA / firmware-level 的能力缺口，不是 runtime 接口缺口。
+
+**未来如果出现下面任一情况，再给 runtime team 提 issue**：
+
+| 触发条件 | issue 内容 |
+|---|---|
+| pto-isa team 给出新 TLV type 或新 cfgInfo 字段 | 请求 runtime 提供官方 inject TLV 接口（避免手动写 simt_meta.h 容易跟 runtime 内部解析约定漂移） |
+| runtime 内部对 simt_meta TLV 解析规则变化 | 请求 runtime 暴露 `simt_meta.h` 头文件 / 文档化 TLV 类型枚举，避免手动定义被静默破坏 |
+
+runtime 仓地址：<https://gitcode.com/cann/runtime.git>
+
+---
+
+## 5. 已采集的环境信息（备查）
+
+| 项 | 值 |
+|---|---|
+| CANN 版本 | 9.1.T500 (V100R001C10B813) |
+| Driver 版本 | 25.6.rc1.b108, ascendhal 7.35.23 |
+| bisheng | clang 15.0.5 (2026-04-28) |
+| OS | openEuler 24.03 LTS-SP2, kernel 6.6.0 |
+| 架构 | aarch64 |
+| simpler branch | `fix-a5-aicore-simt-tlv` @ `7d0764d9` |
+| pto-isa HEAD | `cc93c4d4` (main) |
+| 测试 device | device 3 |
+| 最佳一次实验编号 | `1884844` (AIV_TYPE=4 + 各种 device init，task 1/1 完成但 deinit trap) |
+| 纯 TLV 对齐实验编号 | `2100222` (AIV_TYPE=3 跟 pto-isa 完全对齐，反而 0/0) |
diff --git a/src/a5/platform/onboard/aicore/CMakeLists.txt b/src/a5/platform/onboard/aicore/CMakeLists.txt
index e2abae61d..a923decca 100644
--- a/src/a5/platform/onboard/aicore/CMakeLists.txt
+++ b/src/a5/platform/onboard/aicore/CMakeLists.txt
@@ -48,7 +48,10 @@ foreach(INC_DIR ${CMAKE_CUSTOM_INCLUDE_DIRS})
     list(APPEND CMAKE_CUSTOM_INCLUDE_DIR_FLAGS "-I${INC_DIR}")
 endforeach()
 
-# Compiler flags
+# Compiler flags. `-mllvm -cce-dyn-kernel-stack-size=false` is appended only
+# for the AIV variant below: AIC has no SIMT meta to defend, and disabling
+# bisheng's auto-emission on AIC was observed to push the merged kernel into
+# an illegal VEC configuration (errcode 331 trap on AIV at SIMT prelude).
 set(AICORE_FLAGS
     "-c -O3 -g -x cce -Wall -std=c++17 \
     --cce-aicore-only \
@@ -57,11 +60,17 @@ set(AICORE_FLAGS
     -mllvm -cce-aicore-record-overflow=false \
     -mllvm -cce-aicore-addr-transform \
     -mllvm -cce-aicore-dcci-insert-for-scalar=false \
-    -mllvm -cce-dyn-kernel-stack-size=false \
     ${CMAKE_CUSTOM_INCLUDE_DIR_FLAGS}"
 )
 separate_arguments(AICORE_FLAGS)
 
+# AIV-only: suppress bisheng's auto-emitted `.ascend.meta.<func>` section so
+# runtime's kernelInfoMap (keyed by section name, overwrite semantics) keeps
+# the hand-written TLV record from kernel.cpp instead of the NO_VF /
+# shareMemSize=0 defaults bisheng would otherwise emit for our SU-dispatcher
+# entry. Not applied to AIC — AIC has no SIMT TLV to protect.
+set(AIV_EXTRA_FLAGS "-mllvm" "-cce-dyn-kernel-stack-size=false")
+
 # Step 1: Compile each source file to individual .o files for AIC and AIV
 set(ALL_AIC_OBJECTS "")
 set(ALL_AIV_OBJECTS "")
@@ -83,10 +92,11 @@ foreach(SRC_FILE ${ALL_SOURCES})
         COMMENT "Compiling ${SRC_NAME} for AIC"
     )
 
-    # Compile for AIV architecture
+    # Compile for AIV architecture. AIV_EXTRA_FLAGS adds the SIMT-meta
+    # suppression flag so our hand-written TLV record survives runtime parse.
     add_custom_command(
         OUTPUT ${OBJ_AIV}
-        COMMAND ${BISHENG_CC} ${AICORE_FLAGS} ${CMAKE_CUSTOM_INCLUDE_DIR_FLAGS} --cce-aicore-arch=dav-c310-vec
+        COMMAND ${BISHENG_CC} ${AICORE_FLAGS} ${AIV_EXTRA_FLAGS} ${CMAKE_CUSTOM_INCLUDE_DIR_FLAGS} --cce-aicore-arch=dav-c310-vec
                 -o ${OBJ_AIV} ${SRC_FILE}
         DEPENDS ${SRC_FILE}
         COMMENT "Compiling ${SRC_NAME} for AIV"
diff --git a/src/a5/platform/onboard/aicore/kernel.cpp b/src/a5/platform/onboard/aicore/kernel.cpp
index 88a2ca2c3..3c7112ee3 100644
--- a/src/a5/platform/onboard/aicore/kernel.cpp
+++ b/src/a5/platform/onboard/aicore/kernel.cpp
@@ -76,9 +76,30 @@ extern __aicore__ void aicore_execute(__gm__ Runtime *runtime, int block_idx, Co
 #define KERNEL_META_SECTION(func) ".ascend.meta." SIMPLER_STRINGIFY(KERNEL_ENTRY(func))
 
 #ifdef __DAV_VEC__
+// Emit SIMT meta only for the AIV variant. The AIC variant has no vector
+// unit and standard kernels never carry SIMT TLVs on AIC; advertising
+// AIV_TYPE_FLAG on AIC confuses runtime's MIX merge and is what bisheng
+// itself does for vec-only SIMT kernels (e.g. pto-isa's mscatter).
 static const FuncLevelMeta func_simt_section __attribute__((used, section(KERNEL_META_SECTION(aicore_kernel)))) = {
     {{static_cast<unsigned short>(FuncMetaType::F_TYPE_COMPILER_ALLOC_UB_SIZE), sizeof(unsigned int)},
      PLATFORM_AICORE_SHARE_MEM_SIZE},
+    {{static_cast<unsigned short>(FuncMetaType::F_TYPE_SU_STACK_SIZE), sizeof(unsigned int)},
+     PLATFORM_AICORE_SU_STACK_SIZE},
+    {{static_cast<unsigned short>(FuncMetaType::F_TYPE_SIMT_WARP_STACK_SIZE), sizeof(unsigned int)}, 0u},
+    {{static_cast<unsigned short>(FuncMetaType::F_TYPE_SIMT_DVG_WARP_STACK_SIZE), sizeof(unsigned int)}, 0u},
+    // AIV_TYPE_FLAG=4 (SIMD_SIMT_MIX_VF). pto-isa's mscatter ST kernel ELF
+    // carries 3 (SIMT_VF_ONLY) because that kernel is a pure SIMT entry
+    // (MSCATTER is the first and only instruction); but our entry is an SU
+    // dispatcher with a long SPMD polling loop, scalar arithmetic, and
+    // handshake register reads/writes BEFORE the fn-pointer dispatch to a
+    // SIMT task. Declaring SIMT_VF_ONLY forces firmware into a pure-SIMT
+    // hw context where the very first SPMD instruction in the polling
+    // loop traps (errcode 331 fires immediately, no task ever dispatches —
+    // verified). MIX_VF=4 is the only value that lets both SPMD and SIMT
+    // execute, even if not perfectly (the SIMT task occasionally completes
+    // 1/1 with a trailing trap). This is the closest-to-working setting
+    // while the dispatcher gap documented in pto-isa MGATHER.md / MSCATTER.md
+    // ("Runtime Dispatch Requirement") remains unresolved upstream.
     {{static_cast<unsigned short>(FuncMetaType::F_TYPE_AIV_TYPE_FLAG), sizeof(unsigned int)},
      static_cast<unsigned int>(AIVType::AIV_TYPE_SIMD_SIMT_MIX_VF)},
 };
diff --git a/src/a5/platform/onboard/aicore/simt_meta.h b/src/a5/platform/onboard/aicore/simt_meta.h
index 72d5b6d13..17bb1ae4b 100644
--- a/src/a5/platform/onboard/aicore/simt_meta.h
+++ b/src/a5/platform/onboard/aicore/simt_meta.h
@@ -14,25 +14,33 @@
  * @brief SIMT metadata TLV records for AICore kernel ELF (onboard / a5)
  *
  * The legacy launch path (rtKernelLaunchWithHandleV2 + rtRegisterAllKernel)
- * requires the kernel ELF to carry two TLV records that runtime reads at
- * register time:
- *   - F_TYPE_COMPILER_ALLOC_UB_SIZE (7)  -> Kernel::shareMemSize_
- *   - F_TYPE_AIV_TYPE_FLAG          (12) -> Kernel::kernelVfType_
+ * requires the kernel ELF to carry several TLV records that runtime reads at
+ * register time. We emit the full set bisheng would emit for a statically
+ * inferable SIMT kernel, matching pto-isa's a5 SIMT ST binaries byte-for-byte:
+ *   - F_TYPE_COMPILER_ALLOC_UB_SIZE   (7)  -> Kernel::shareMemSize_
+ *   - F_TYPE_SU_STACK_SIZE            (8)  -> per-core SU (scalar unit) stack
+ *   - F_TYPE_SIMT_WARP_STACK_SIZE     (9)  -> per-warp stack (0 = default)
+ *   - F_TYPE_SIMT_DVG_WARP_STACK_SIZE (10) -> per-divergent-warp stack (0)
+ *   - F_TYPE_AIV_TYPE_FLAG            (12) -> Kernel::kernelVfType_
  * bisheng emits these only when it can statically infer the kernel uses
  * SIMT intrinsics. Our entry is an SU dispatcher (vector ops live in task
  * .o files invoked through aicore_execute), so the compiler cannot tag it.
  *
+ * Without F_TYPE_SU_STACK_SIZE the runtime allocates zero SU stack for the
+ * SIMT scheduler — the first scheduler frame overflows into unmapped UB and
+ * the AICore reports an AIV error (`aivmap` bit set), then never acks the
+ * subsequent EXIT_SIGNAL, surfacing as fleet-wide deinit timeouts.
+ *
  * kernel.cpp's CMakeLists.txt pairs the hand-written record with
  * `-mllvm -cce-dyn-kernel-stack-size=false`, which stops bisheng from
  * auto-emitting a sibling `.ascend.meta.<funcname>` section. Without that
  * flag, runtime's parser (kernelInfoMap keyed by section name) would
  * overwrite our values with bisheng's NO_VF / shareMemSize=0 defaults.
  *
- * TLV type IDs mirror RT_FUNCTION_TYPE_COMPILER_ALLOC_UB_SIZE (7) and
- * RT_FUNCTION_TYPE_AIV_TYPE_FLAG (12) in CANN's runtime/runtime/elf_base.h.
- * That header is host-side (extern "C", part of the runtime API) so we
- * re-declare the two values we need rather than pull runtime headers into
- * an AICore device-side TU.
+ * TLV type IDs mirror RT_FUNCTION_TYPE_* in CANN's
+ * runtime/runtime/elf_base.h. That header is host-side (extern "C", part of
+ * the runtime API) so we re-declare the values we need rather than pull
+ * runtime headers into an AICore device-side TU.
  */
 
 #ifndef PLATFORM_A5_AICORE_SIMT_META_H_
@@ -42,6 +50,9 @@
 // width-preserving.
 enum class FuncMetaType : unsigned short {
     F_TYPE_COMPILER_ALLOC_UB_SIZE = 7,
+    F_TYPE_SU_STACK_SIZE = 8,
+    F_TYPE_SIMT_WARP_STACK_SIZE = 9,
+    F_TYPE_SIMT_DVG_WARP_STACK_SIZE = 10,
     F_TYPE_AIV_TYPE_FLAG = 12,
 };
 
@@ -67,14 +78,43 @@ struct FuncMetaCompilerUbSize {
     unsigned int ub_size;
 };
 
+struct FuncMetaSuStackSize {
+    TlvHeader head;
+    unsigned int stack_size;
+};
+
+struct FuncMetaSimtWarpStackSize {
+    TlvHeader head;
+    unsigned int stack_size;
+};
+
+struct FuncMetaSimtDvgWarpStackSize {
+    TlvHeader head;
+    unsigned int stack_size;
+};
+
 struct FuncMetaAivTypeFlag {
     TlvHeader head;
     unsigned int aiv_type;
 };
 
+// Order matters: runtime parses TLVs sequentially by section offset, and the
+// pto-isa a5 SIMT ST binaries emit the records in ascending type order. Keep
+// this layout aligned with that reference.
 struct FuncLevelMeta {
     FuncMetaCompilerUbSize ub_size_meta;
+    FuncMetaSuStackSize su_stack_meta;
+    FuncMetaSimtWarpStackSize simt_warp_stack_meta;
+    FuncMetaSimtDvgWarpStackSize simt_dvg_warp_stack_meta;
     FuncMetaAivTypeFlag aiv_type_meta;
 };
 
+// Per-core SU (scalar unit) stack/state size in bytes. Copied byte-for-byte
+// from pto-isa a5 SIMT ST kernel ELFs (0xE0 = 224). This is not a true
+// stack-buffer size — empirically larger values (e.g. 0x8000) cause runtime
+// to reject the launch — but rather a minStackSize threshold the runtime
+// validates against the SU stack the linker actually emitted. Match the
+// reference value so the runtime check accepts our kernel.
+inline constexpr unsigned int PLATFORM_AICORE_SU_STACK_SIZE = 0xE0;
+
 #endif  // PLATFORM_A5_AICORE_SIMT_META_H_
diff --git a/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/aiv/kernel_simt_scatter.cpp b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/aiv/kernel_simt_scatter.cpp
index c259716bd..31ae1ab5a 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/aiv/kernel_simt_scatter.cpp
+++ b/tests/st/a5/tensormap_and_ringbuffer/simt_basic/kernels/aiv/kernel_simt_scatter.cpp
@@ -9,21 +9,11 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-// Minimal SIMT element-scatter kernel (AIV).
-//
-// Distilled from the ptoas-generated mscatter reference. Drops cosmetic
-// noise (v1..v30 names, dummy v4/v5/v6/v7 constants, explicit
-// Layout::ND, the verbose Tile template tail, GM-offset arithmetic that
-// always reduces to zero, the ptoas_auto_sync_tail wrapper).
-//
-// Kept on purpose:
-//   - per-data 3-tile alias pattern (TLOAD binds one tile, MSCATTER
-//     reads from another aliased to the same UB address; a single-tile
-//     form has reproduced golden mismatches on hw)
-//   - `set_mask_norm` / `set_vector_mask` SIMT mask init
-//   - `MTE2 → V` sync before MSCATTER (the ptoas default `MTE2 → MTE3`
-//     silently drops the scatter on a5 hw)
-//   - `__DAV_VEC__` guard so the AIC variant compiles to a no-op
+// Minimal SIMT element-scatter kernel (AIV), structurally mirroring
+// pto-isa's runElem2D template (tests/npu/a5/src/st/testcase/mscatter/
+// mscatter_kernel.cpp) so the MSCATTER call sees the same Shape /
+// Stride / Tile / sync surroundings that the upstream ST validates
+// against.
 //
 // Operation: out[idx[r, c]] = src[r, c] for an 8x32 source and 256-slot
 // destination.
@@ -50,53 +40,47 @@ static constexpr int DST_LEN = TILE_ROWS * TILE_COLS;  // 256
 static constexpr int SRC_TILE_BYTES = TILE_ROWS * TILE_COLS * sizeof(float);
 
 static __aicore__ void simt_scatter_impl(__gm__ float *src, __gm__ int32_t *idx, __gm__ float *out) {
-    using SrcTile = Tile<TileType::Vec, float, TILE_ROWS, TILE_COLS, BLayout::RowMajor, -1, -1>;
-    using IdxTile = Tile<TileType::Vec, int32_t, TILE_ROWS, TILE_COLS, BLayout::RowMajor, -1, -1>;
-
-    using TileShape = Shape<1, 1, 1, TILE_ROWS, TILE_COLS>;
-    using TileStride = pto::Stride<DST_LEN, DST_LEN, DST_LEN, TILE_COLS, 1>;
-    using SrcGT = GlobalTensor<float, TileShape, TileStride>;
-    using IdxGT = GlobalTensor<int32_t, TileShape, TileStride>;
-
-    using DstShape = Shape<1, 1, 1, 1, DST_LEN>;
-    using DstStride = pto::Stride<DST_LEN, DST_LEN, DST_LEN, DST_LEN, 1>;
-    using DstGT = GlobalTensor<float, DstShape, DstStride>;
-
-    // Per-data 3-tile alias pattern:
-    //   *_loader  — bound directly at the UB offset; consumed by TLOAD
-    //   *_scatter — bound via the loader's data() pointer; consumed by MSCATTER
-    //   *_anchor  — bound to the same offset literal; preserves the
-    //               original ptoas binding sequence
-    constexpr int SRC_UB = 0;
-    constexpr int IDX_UB = SRC_TILE_BYTES;
-
-    SrcTile src_loader(TILE_ROWS, TILE_COLS);
-    TASSIGN(src_loader, SRC_UB);
-    SrcTile src_scatter(TILE_ROWS, TILE_COLS);
-    TASSIGN(src_scatter, reinterpret_cast<uint64_t>(src_loader.data()));
-    SrcTile src_anchor(TILE_ROWS, TILE_COLS);
-    TASSIGN(src_anchor, static_cast<uint64_t>(SRC_UB));
-
-    IdxTile idx_loader(TILE_ROWS, TILE_COLS);
-    TASSIGN(idx_loader, IDX_UB);
-    IdxTile idx_scatter(TILE_ROWS, TILE_COLS);
-    TASSIGN(idx_scatter, reinterpret_cast<uint64_t>(idx_loader.data()));
-    IdxTile idx_anchor(TILE_ROWS, TILE_COLS);
-    TASSIGN(idx_anchor, static_cast<uint64_t>(IDX_UB));
-
-    SrcGT srcGlobal(src);
-    IdxGT idxGlobal(idx);
-    DstGT dstGlobal(out);
-
-    TLOAD(src_anchor, srcGlobal);
-    TLOAD(idx_anchor, idxGlobal);
-
-    // MTE2 → V before MSCATTER (critical: MTE2 → MTE3 silently drops the
-    // scatter on a5 hw).
+    // No explicit set_mask_norm / set_vector_mask: pto-isa's mscatter ST does
+    // not call them either. MSCATTER is `__simt_callee__`; bisheng inlines
+    // the SIMT-side mask setup at every call site, so the vec mask state
+    // is owned by MSCATTER itself rather than by the host kernel.
+
+    // Mirror pto-isa's runElem2D template exactly: Shape/Stride descriptors,
+    // static-Valid-extents Tile, single-tile binding, idx-then-src TASSIGN
+    // and TLOAD order, and the pipe_barrier(PIPE_ALL) + V→MTE3 sync after
+    // MSCATTER.
+    using SrcShape = pto::Shape<1, 1, 1, TILE_ROWS, TILE_COLS>;
+    using SrcStride = pto::Stride<1, 1, 1, TILE_COLS, 1>;
+    using IdxShape = pto::Shape<1, 1, 1, TILE_ROWS, TILE_COLS>;
+    using IdxStride = pto::Stride<1, 1, 1, TILE_COLS, 1>;
+    using OutShape = pto::Shape<1, 1, 1, 1, DST_LEN>;
+    using OutStride = pto::Stride<1, 1, 1, DST_LEN, 1>;
+
+    GlobalTensor<float, SrcShape, SrcStride> srcGlobal(src);
+    GlobalTensor<int32_t, IdxShape, IdxStride> idxGlobal(idx);
+    GlobalTensor<float, OutShape, OutStride> outGlobal(out);
+
+    using SrcTile = Tile<TileType::Vec, float, TILE_ROWS, TILE_COLS, BLayout::RowMajor, TILE_ROWS, TILE_COLS>;
+    using IdxTile = Tile<TileType::Vec, int32_t, TILE_ROWS, TILE_COLS, BLayout::RowMajor, TILE_ROWS, TILE_COLS>;
+
+    SrcTile srcTile;
+    IdxTile idxTile;
+
+    constexpr int idxBytes = ((TILE_ROWS * TILE_COLS * static_cast<int>(sizeof(int32_t)) + 31) / 32) * 32;
+    TASSIGN(idxTile, 0x0);
+    TASSIGN(srcTile, idxBytes);
+
+    TLOAD(idxTile, idxGlobal);
+    TLOAD(srcTile, srcGlobal);
+
     set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
     wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
 
-    MSCATTER(dstGlobal, src_scatter, idx_scatter);
+    MSCATTER<Coalesce::Elem, ScatterAtomicOp::None, ScatterOOB::Skip>(outGlobal, srcTile, idxTile);
+
+    pipe_barrier(PIPE_ALL);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
 
     pipe_sync();
 }