hw-native-sys · vegetabledoww · May 27, 2026 · coderabbitai · May 27, 2026 · vegetabledoww
diff --git a/docs/dfx/tensor-dump.md b/docs/dfx/tensor-dump.md
@@ -21,15 +21,18 @@ saw, without the timing distortion of inline printing.
   dispatch, outputs snapshotted after FIN; `INOUT` tensors at both
   stages.
 - **Logical shape preserved.** Records carry dtype, shape,
-  `raw_shape`, offsets, and `is_contiguous` so non-contiguous views
-  are reconstructable.
-- **Manifest + binary payload.** A single JSON manifest plus one
-  `.bin` payload per run; each manifest entry has `bin_offset` /
-  `bin_size` into the payload.
-- **Cross-architecture.** Same `--dump-tensor` flag, same on-disk
-  format on `a2a3` and `a5`. Both runtimes are wired through.
-
-Enable in one line:
+  `strides`, `start_offset`, and `is_contiguous` so logical views are
+  reconstructable.
+- **Manifest + binary payload.** `--dump-tensor` writes a JSON
+  manifest plus one `.bin` payload per run; each manifest entry has
+  `bin_offset` / `bin_size` into the payload.
+- **Per-dispatch args dump.** `--dump-tensor` also writes
+  `kernel_args_dump.json`, capturing the actual `kernel_entry(args)`
+  slot layout seen by each dispatch.
+- **Cross-architecture.** Same flags and on-disk layout family on
+  `a2a3` and `a5`. Both runtimes are wired through.
+
+Enable dump capture in one line:
 
 ```bash
 python tests/st/<case>/test_<name>.py -p a5sim --dump-tensor
@@ -40,7 +43,6 @@ python tests/st/<case>/test_<name>.py -p a5sim --dump-tensor
 ### 3.1 Enable Tensor Dump
 
 ```bash
-# Standalone runner
 python tests/st/<case>/test_<name>.py -p a5sim --dump-tensor
 python tests/st/<case>/test_<name>.py -p a2a3 -d 0 --dump-tensor
 
@@ -49,16 +51,12 @@ pytest tests/st/<case> --platform a5sim --dump-tensor
 pytest examples/a5/host_build_graph/vector_example --platform a5sim --dump-tensor
 ```
 
-The flag flips `CallConfig::enable_dump_tensor`. The host then
+`--dump-tensor` flips `CallConfig::enable_dump_tensor`. The host
 allocates dump storage, publishes its base address through
-`kernel_args.dump_data_base`, and sets
-`PROFILING_FLAG_DUMP_TENSOR` in each worker handshake's
-`enable_profiling_flag`. The on-device AICPU kernel reads both:
-the storage base via `set_platform_dump_base()` and the enable bit
-via `set_enable_dump_tensor(GET_PROFILING_FLAG(...))`. AICore
-executors read the same handshake bit to insert a
-`pipe_barrier(PIPE_ALL)` before FIN when dump is on, so
-`AFTER_COMPLETION` snapshots see the kernel's final writes.
+`kernel_args.dump_data_base`, and sets `PROFILING_FLAG_DUMP_TENSOR`
+in the worker profiling bitmask. AICPU reads the storage base via
+`set_platform_dump_base()` and the enable state via
+`set_dump_tensor_enabled(...)`.
 
 ### 3.2 Output
 
@@ -70,12 +68,16 @@ The dump artifacts land under the per-task output prefix
 ```text
 <output_prefix>/
 └── tensor_dump/
-    ├── tensor_dump.json
-    └── tensor_dump.bin
+    ├── tensor_dump.json       # per-task tensor metadata / snapshots (`--dump-tensor`)
+    ├── tensor_dump.bin        # raw tensor payload (`--dump-tensor`)
+    └── kernel_args_dump.json  # per-dispatch kernel args metadata (`--dump-tensor`)
 ```
 
 Filenames are fixed (no per-file timestamp) — the directory is the
-per-task uniqueness boundary.
+per-task uniqueness boundary. `--dump-tensor` emits all three files in
+the same `tensor_dump/` directory.
+
+#### `tensor_dump.json` — Tensor manifest
 
 `tensor_dump.json` is the manifest; its `bin_file` field points at
 the sibling binary payload.
@@ -109,8 +111,8 @@ Example manifest (one input tensor captured before dispatch):
       "arg_index": 0,
       "dtype": "float32",
       "shape": [16384],
-      "raw_shape": [16384],
-      "offsets": [0],
+      "strides": [1],
+      "start_offset": 0,
       "is_contiguous": true,
       "truncated": false,
       "overwritten": false,
@@ -129,16 +131,18 @@ Key fields:
 - `arg_index` — position in the formal callable signature.
 - `role` / `stage` — `input` / `output` / `inout`, captured
   `before_dispatch` / `after_completion`.
-- `dtype` / `shape` / `raw_shape` / `offsets` / `is_contiguous` —
-  view geometry. `bin_size` is `numel × elem_size` of the *logical*
-  view, gathered if non-contiguous.
+- `dtype` / `shape` / `strides` / `start_offset` /
+  `is_contiguous` — logical view geometry. `bin_size` is
+  `numel × elem_size` of the *logical* view; a non-contiguous view
+  is gathered into logical order before it is written.
 - `bin_offset` — byte offset into `tensor_dump.bin` where the
   payload starts.
 - `truncated` / `overwritten` — set when the tensor exceeded arena
   size or was overwritten by a later task; see §7.
 - Top-level `dropped_records` / `dropped_overwrite` counters
   surface aggregate loss — useful for spot-checking a run.
 
+
 ### 3.3 Inspect with `dump_viewer`
 
 The viewer auto-picks the latest `outputs/*/tensor_dump` directory
@@ -167,8 +171,8 @@ spreadsheet.
 
 ### 3.4 Add dump support to a new test
 
-Only `host_build_graph` needs explicit wiring; other runtimes pick
-up metadata automatically.
+Only `host_build_graph` needs explicit wiring; other runtimes derive
+tensor view metadata automatically.
 
 ```cpp
 // In orchestration C++ (host_build_graph only)
@@ -202,14 +206,16 @@ What you can read out of `tensor_dump.json` + `tensor_dump.bin`:
   ensures these reflect the kernel's final writes.
 - **`INOUT` deltas** — same arg captured at both stages; diff
   before vs after to see exactly what the kernel modified.
-- **Non-contiguous view reconstruction** — `raw_shape` / `offsets`
-  / `is_contiguous` plus the gathered logical-contiguous payload.
+- **Logical view reconstruction** — `shape` / `strides` /
+  `start_offset` / `is_contiguous` plus the gathered
+  logical-contiguous payload.
 - **Per-task identity** — `task_id` / `subtask_id` / `func_id`
   correlates dump entries with swimlane and PMU rows.
 - **Loss accounting** — `truncated` / `overwritten` per-record
   flags, plus aggregate `dropped_records` / `dropped_overwrite` in
   the summary.
 
+
 ## 5. Design Highlights
 
 ### 5.1 Common device-side structures
@@ -281,8 +287,9 @@ Each runtime's scheduler dispatch code calls
 ```
 
 `dump_tensors_for_task` walks the formal callable signature,
-matches each non-scalar slot to a `TensorDumpInfo` (dtype + shape + offsets + device address), and calls `dump_tensor_record` for
-slots that match the current stage.
+matches each non-scalar slot to a `TensorDumpInfo` (dtype + shape +
+strides + start offset + device address), and calls
+`dump_tensor_record` for slots that match the current stage.
 
 When dump is enabled, AICore executors also issue
 `pipe_barrier(PIPE_ALL)` after kernel execution and before writing
@@ -527,7 +534,8 @@ Tensor Dump is opt-in and zero-overhead when disabled — without
 AICore skip the dump-specific code paths. The `pipe_barrier(PIPE_ALL)`
 before FIN is also gated on the same handshake bit.
 
-When enabled, the per-task overhead is dominated by:
+With `--dump-tensor`, AICPU records full `BEFORE_DISPATCH` /
+`AFTER_COMPLETION` tensor payloads. The per-task overhead is dominated by:
 
 - The `BEFORE_DISPATCH` / `AFTER_COMPLETION` payload memcpy into
   the per-thread arena (contiguous fast-path; logical traversal for

diff --git a/docs/testing.md b/docs/testing.md
@@ -72,7 +72,7 @@ If a module is pure C++ with no Python binding, test in **ut-cpp** (`tests/ut/cp
 
 Scene tests support advanced CLI options for benchmarking, profiling, and runtime control. These work identically in both pytest and standalone mode.
 
-> "Profiling" is the umbrella for three parallel diagnostics sub-features: `--enable-l2-swimlane` (L2 swimlane), `--dump-tensor` (per-task tensor I/O), and `--enable-pmu` (PMU CSV). They are independent and can be combined.
+> "Profiling" is the umbrella for three parallel diagnostics sub-features: `--enable-l2-swimlane` (L2 swimlane), `--dump-tensor` (tensor payload dump + per-dispatch kernel args dump), and `--enable-pmu` (PMU CSV). They are independent and can be combined.
 
 ### pytest
 
@@ -91,7 +91,7 @@ pytest --platform a2a3sim --log-level debug                        # verbose C++
 python test_xxx.py -p a2a3sim                                    # default: 1 round + golden
 python test_xxx.py -p a2a3 -d 0 --rounds 100 --skip-golden       # benchmark mode
 python test_xxx.py -p a2a3 --enable-l2-swimlane                         # L2 swimlane (first round)
-python test_xxx.py -p a2a3 --dump-tensor                         # dump per-task tensor I/O
+python test_xxx.py -p a2a3 --dump-tensor                         # dump tensor payloads + per-dispatch kernel args
 python test_xxx.py -p a2a3 --enable-pmu 4                        # PMU CSV (MEMORY)
 python test_xxx.py -p a2a3sim --build                            # compile runtime from source
 python test_xxx.py -p a2a3sim --log-level debug                  # verbose C++ logging
@@ -110,7 +110,7 @@ python test_xxx.py -p a2a3sim --log-level debug                  # verbose C++ l
 | `--manual` | | `exclude` | `exclude`/`include`/`only` for manual cases |
 | `--skip-golden` | | false | Skip golden comparison (for benchmarking) |
 | `--enable-l2-swimlane [PERF_LEVEL]` | | `0` | Enable L2 swimlane collection on first round only. The flag takes an integer perf_level 0–4 (bare = 4); see [docs/dfx/l2-swimlane-profiling.md](dfx/l2-swimlane-profiling.md#31-enable-l2-swimlane) for the level table. Each test case gets its own `outputs/<case>_<ts>/` directory under which `l2_perf_records.json` lands; parallel runs never collide. |
-| `--dump-tensor` | | false | Dump per-task tensor I/O during runtime execution |
+| `--dump-tensor` | | false | Dump tensors and per-dispatch kernel args during runtime execution |
 | `--enable-pmu [EVENT_TYPE]` | | `0` | Enable a2a3 PMU CSV collection. Bare flag selects `PIPE_UTILIZATION` (`2`); pass an event type such as `4` for `MEMORY`. |
 | `--build` | | false | Compile runtime from source (not pre-built) |
 | `--exitfirst` | `-x` | false | Stop on first failing test (fail-fast, primarily for CI) |
@@ -325,7 +325,7 @@ A single file can declare both L2 and L3 classes; they're grouped by `(runtime,
 Each test case sets its own `CallConfig.output_prefix` (chosen by `scene_test.py::_build_output_prefix` as `outputs/<ClassName>_<case>_<YYYYMMDD_HHMMSS>/`). The C++ runtime writes all diagnostic artifacts under that prefix with fixed filenames:
 
 - `outputs/<case>_<ts>/l2_perf_records.json` — swimlane (`--enable-l2-swimlane`)
-- `outputs/<case>_<ts>/tensor_dump/` — tensor dump (`--dump-tensor`)
+- `outputs/<case>_<ts>/tensor_dump/` — tensor dump artifacts (`--dump-tensor`)
 - `outputs/<case>_<ts>/pmu.csv` — PMU counters (`--enable-pmu`)
 
 Because each case gets its own directory, parallel runs (xdist workers, L3 case fanout, L2 device fanout) can never collide on filename — there is no per-file timestamp, no env-var scoping, and no post-run flatten step. `CallConfig::validate()` throws if any diagnostic flag is enabled but `output_prefix` is empty; `scene_test.py::run_class_cases` always fills it from the case label.

diff --git a/simpler_setup/tools/dump_viewer.py b/simpler_setup/tools/dump_viewer.py
@@ -272,12 +272,12 @@ def main():
     args = parser.parse_args()
 
     dump_dir = _resolve_dump_dir(args.dump_dir)
-    manifest_files = list(dump_dir.glob("*.json"))
-    if not manifest_files:
-        print(f"Error: no manifest JSON found in {dump_dir}", file=sys.stderr)
+    manifest_path = dump_dir / "tensor_dump.json"
+    if not manifest_path.exists():
+        print(f"Error: tensor_dump.json not found in {dump_dir}", file=sys.stderr)
         sys.exit(1)
 
-    with open(manifest_files[0]) as f:
+    with open(manifest_path) as f:
         manifest = json.load(f)
 
     bin_path = dump_dir / manifest.get("bin_file", "tensors.bin")

diff --git a/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h b/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h
@@ -79,7 +79,7 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);
 template <int MaxSubtaskSlots, typename SlotStateT, typename IsSubtaskActiveFn, typename GetFunctionBinAddrFn>
 inline void dump_tensors_for_task(
     int32_t thread_idx, const SlotStateT &slot_state, TensorDumpStage stage, IsSubtaskActiveFn is_subtask_active,
-    GetFunctionBinAddrFn get_function_bin_addr
+    GetFunctionBinAddrFn get_function_bin_addr, uint64_t dispatch_id = 0
 ) {
     const auto &pl = *slot_state.payload;
     const CoreCallable *callables[MaxSubtaskSlots] = {};
@@ -112,7 +112,9 @@ inline void dump_tensors_for_task(
 
     rmb();
 
-    int32_t payload_index = 0;
+    int32_t arg_index = 0;
+    int32_t tensor_index = 0;
+    int32_t scalar_index = 0;
     for (int raw_subtask_id = 0; raw_subtask_id < MaxSubtaskSlots; raw_subtask_id++) {
         if (!is_subtask_active(slot_state.active_mask, raw_subtask_id)) {
             continue;
@@ -122,11 +124,30 @@ inline void dump_tensors_for_task(
         for (int32_t sig_idx = 0; sig_idx < callable.sig_count(); sig_idx++) {
             ArgDirection dir = callable.sig(sig_idx);
             if (dir == ArgDirection::SCALAR) {
+                if (stage == TensorDumpStage::KERNEL_ARGS_DUMP) {
+                    TensorDumpInfo info = {};
+                    info.task_id = slot_state.task->task_id.raw;
+                    info.subtask_id = raw_subtask_id;
+                    info.role = TensorDumpRole::INPUT;
+                    info.stage = stage;
+                    info.dtype = static_cast<uint8_t>(DataType::UINT64);
+                    info.ndims = 0;
+                    info.func_id = slot_state.task->kernel_id[slot_idx];
+                    info.arg_index = static_cast<uint32_t>(arg_index);
+                    info.buffer_addr = pl.scalars[scalar_index];
+                    info.dispatch_id = dispatch_id;
+                    info.pack_mode = static_cast<uint8_t>(TensorDumpPackMode::SCALAR_VALUE);
+                    dump_tensor_record(thread_idx, info);
+                }
+                arg_index++;
+                scalar_index++;
                 continue;
             }
             TensorDumpRole role;
-            if (get_tensor_dump_role_from_direction(dir, &role) && should_dump_tensor_at_stage(role, stage)) {
-                const auto &t = pl.tensors[payload_index];
+            bool dump_tensor = get_tensor_dump_role_from_direction(dir, &role) &&
+                               (stage == TensorDumpStage::KERNEL_ARGS_DUMP || should_dump_tensor_at_stage(role, stage));
+            if (dump_tensor) {
+                const auto &t = pl.tensors[tensor_index];
                 TensorDumpInfo info = {};
                 info.buffer_addr = t.buffer.addr;
                 info.dtype = static_cast<uint8_t>(t.dtype);
@@ -139,12 +160,15 @@ inline void dump_tensors_for_task(
                 info.task_id = slot_state.task->task_id.raw;
                 info.subtask_id = raw_subtask_id;
                 info.func_id = slot_state.task->kernel_id[slot_idx];
-                info.arg_index = static_cast<uint32_t>(payload_index);
-                info.role = role;
+                info.arg_index = static_cast<uint32_t>(arg_index);
+                info.role = (stage == TensorDumpStage::KERNEL_ARGS_DUMP) ? TensorDumpRole::INPUT : role;
                 info.stage = stage;
+                info.dispatch_id = dispatch_id;
+                info.pack_mode = static_cast<uint8_t>(TensorDumpPackMode::TENSOR_PTR);
                 dump_tensor_record(thread_idx, info);
             }
-            payload_index++;
+            arg_index++;
+            tensor_index++;
         }
     }
 }
@@ -221,7 +245,7 @@ inline void dump_tensors_for_task(
         info.dtype = static_cast<uint8_t>(t.dtype);
         info.ndims = t.ndims;
         info.func_id = static_cast<uint32_t>(func_id);
-        info.arg_index = static_cast<uint32_t>(tensor_arg_index);
+        info.arg_index = static_cast<uint32_t>(sig_idx);
         info.buffer_addr = buffer_addrs[tensor_arg_index];
         // TensorInfo (host_build_graph) still carries (raw_shapes, offsets)
         // implicitly describing a row-major-aligned sub-region. Translate to
@@ -241,6 +265,7 @@ inline void dump_tensors_for_task(
         tensor_arg_index++;
     }
 }
+
 #endif
 
 /**

diff --git a/src/a2a3/platform/include/common/tensor_dump.h b/src/a2a3/platform/include/common/tensor_dump.h
@@ -68,6 +68,12 @@ enum class TensorDumpRole : uint8_t {
 enum class TensorDumpStage : uint8_t {
     BEFORE_DISPATCH = 0,
     AFTER_COMPLETION = 1,
+    KERNEL_ARGS_DUMP = 2,  // Kernel-args capture: dump_tensors_for_task also records SCALAR slots at this stage
+};
+
+enum class TensorDumpPackMode : uint8_t {
+    TENSOR_PTR = 0,    // Tensor slot: TensorDumpInfo::buffer_addr holds the tensor's buffer address
+    SCALAR_VALUE = 1,  // Scalar slot: TensorDumpInfo::buffer_addr carries the scalar value itself
+};
 
 // =============================================================================
@@ -88,14 +94,16 @@ struct alignas(64) TensorDumpRecord {
     uint8_t stage;            // TensorDumpStage (before/after execution)
     uint8_t ndims;            // Number of dimensions
     uint32_t func_id;         // Kernel function identifier
-    uint32_t arg_index;       // Position in PTO2TaskPayload::tensors[]
+    uint32_t arg_index;       // Position in the callable signature
     uint8_t dtype;            // DataType raw enum value
     uint8_t truncated;        // 1 if payload was truncated (tensor > arena capacity)
     uint8_t is_contiguous;    // 1 when source view is already PyTorch-contiguous
     uint8_t pad0_align;       // Explicit alignment before 64-bit payload offsets
     uint64_t payload_offset;  // Monotonic byte offset into thread arena
     uint64_t payload_size;    // Bytes actually copied (may be < full tensor bytes)
-    uint8_t pad0[24];         // Preserve 64B cache-line layout
+    uint64_t dispatch_id;     // Monotonic scheduler dispatch sequence
+    uint8_t pack_mode;        // TensorDumpPackMode
+    uint8_t pad0[15];         // Preserve 64B cache-line layout
 
     // === Cache line 2 (64B) — strided view descriptor ===
     // start_offset placed first for 8B alignment without padding gaps; total = 8 + 20 + 20 + 16 = 64B.
@@ -236,6 +244,9 @@ struct TensorDumpInfo {
     uint32_t func_id;
     uint32_t arg_index;
     uint64_t buffer_addr;
+    uint64_t dispatch_id;
+    uint8_t pack_mode;
+    uint8_t reserved[15];
     uint64_t start_offset;                     // 1D ELEMENT offset of the view origin
     uint32_t shapes[PLATFORM_DUMP_MAX_DIMS];   // Current view shape
     uint32_t strides[PLATFORM_DUMP_MAX_DIMS];  // Element stride per dimension (strictly > 0, type-enforced)

diff --git a/src/a2a3/platform/include/host/tensor_dump_collector.h b/src/a2a3/platform/include/host/tensor_dump_collector.h
@@ -24,6 +24,7 @@
 #ifndef SRC_A2A3_PLATFORM_INCLUDE_HOST_TENSOR_DUMP_COLLECTOR_H_
 #define SRC_A2A3_PLATFORM_INCLUDE_HOST_TENSOR_DUMP_COLLECTOR_H_
 
+#include <array>
 #include <atomic>
 #include <condition_variable>
 #include <cstdint>
@@ -152,6 +153,17 @@ struct DumpedTensor {
     std::vector<uint8_t> bytes;
 };
 
+struct KernelArgsDumpEntry {  // Element type of TensorDumpCollector::kernel_args_entries_
+    uint64_t dispatch_id;
+    uint32_t func_id;
+    uint32_t arg_index;
+    uint8_t pack_mode;  // NOTE(review): presumably a raw TensorDumpPackMode value — confirm
+    uint8_t dtype;
+    uint8_t ndims;
+    std::array<uint32_t, PLATFORM_DUMP_MAX_DIMS> shapes;
+    uint64_t value;  // NOTE(review): presumably the scalar value or tensor address, per pack_mode — confirm
+};
+
 class TensorDumpCollector : public profiling_common::ProfilerBase<TensorDumpCollector, DumpModule> {
 public:
     TensorDumpCollector() = default;
@@ -259,6 +271,7 @@ class TensorDumpCollector : public profiling_common::ProfilerBase<TensorDumpColl
 
     // Collected dump tensors
     std::vector<DumpedTensor> collected_;
+    std::vector<KernelArgsDumpEntry> kernel_args_entries_;
     std::mutex collected_mutex_;
 
     // Stats
@@ -276,6 +289,7 @@ class TensorDumpCollector : public profiling_common::ProfilerBase<TensorDumpColl
     void *alloc_single_buffer(size_t size, void **host_ptr_out);
     void process_dump_buffer(const DumpReadyBufferInfo &info);
     void start_writer_thread_once();
+    int export_kernel_args_dump_file();
 
     // Writer thread: streams tensor payloads to a single tensors.bin
     std::thread writer_thread_;

diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp
@@ -113,7 +113,8 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer
     // The dump base address is only the backing storage location.
     set_platform_regs(k_args->regs);
     set_platform_dump_base(k_args->dump_data_base);
-    set_dump_tensor_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR));
+    bool dump_enabled = GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR);
+    set_dump_tensor_enabled(dump_enabled);
     set_platform_l2_perf_base(k_args->l2_perf_data_base);
     set_l2_swimlane_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE));
     set_platform_pmu_base(k_args->pmu_data_base);