hw-native-sys · zmnobug · May 18, 2026 · gemini-code-assist · May 18, 2026
diff --git a/docs/dfx/tensor-dump.md b/docs/dfx/tensor-dump.md
@@ -15,6 +15,13 @@ execution, and the host exports a JSON manifest plus a binary payload.
 The result is a stable, replayable record of every tensor a kernel
 saw, without the timing distortion of inline printing.
 
+The same device-to-host dump channel also carries task argument
+descriptors. When tensor dump is enabled, AICPU records the runtime
+args it sees at dispatch time: tensor buffer descriptors plus scalar
+values. These `args` entries share the tensor dump lifecycle, queues,
+arena, and output directory so they can be correlated with swimlane
+and PMU task ids without opening a second DFX path.
+
 ## 2. Overview
 
 - **Per-task input/output capture.** Inputs snapshotted before
@@ -26,6 +33,9 @@ saw, without the timing distortion of inline printing.
 - **Manifest + binary payload.** A single JSON manifest plus one
   `.bin` payload per run; each manifest entry has `bin_offset` /
   `bin_size` into the payload.
+- **Args descriptors.** The manifest also includes an `args` array
+  with per-dispatch tensor descriptors and scalar values observed on
+  device.
 - **Cross-architecture.** Same `--dump-tensor` flag, same on-disk
   format on `a2a3` and `a5`. Both runtimes are wired through.
 
@@ -90,6 +100,7 @@ Example manifest (one input tensor captured before dispatch):
     "byte_order": "little_endian"
   },
   "total_tensors": 1,
+  "total_args": 1,
   "before_dispatch": 1,
   "after_completion": 0,
   "input_tensors": 1,
@@ -99,6 +110,32 @@ Example manifest (one input tensor captured before dispatch):
   "dropped_records": 0,
   "dropped_overwrite": 0,
   "bin_file": "tensor_dump.bin",
+  "args": [
+    {
+      "task_id": "0x0000000200000a00",
+      "subtask_id": 0,
+      "func_id": 0,
+      "stage": "before_dispatch",
+      "tensor_count": 1,
+      "scalar_count": 1,
+      "payload_size": 128,
+      "overwritten": false,
+      "tensors": [
+        {
+          "arg_index": 0,
+          "buffer_addr": "0x100000",
+          "buffer_size": 65536,
+          "dtype": "float32",
+          "shape": [16384],
+          "raw_shape": [16384],
+          "offsets": [0],
+          "is_contiguous": true,
+          "is_all_offset_zero": true
+        }
+      ],
+      "scalars": ["0x40"]
+    }
+  ],
   "tensors": [
     {
       "task_id": "0x0000000200000a00",
@@ -155,6 +192,9 @@ python -m simpler_setup.tools.dump_viewer --func 0 --stage before --role input -
 # Export one specific entry by its manifest index
 python -m simpler_setup.tools.dump_viewer --index 42
 
+# List dumped task args in the latest run
+python -m simpler_setup.tools.dump_viewer --args
+
 # Pin to a specific dump directory
 python -m simpler_setup.tools.dump_viewer outputs/<case>_<ts>/tensor_dump \
     --task 0x0000000200000a00 --export

diff --git a/simpler_setup/tools/dump_viewer.py b/simpler_setup/tools/dump_viewer.py
@@ -15,6 +15,7 @@
     --stage  Filter by stage (before / after)
     --role   Filter by role (input / output / inout)
     --arg    Filter by arg_index (int)
+    --args   List dumped task args instead of tensors
 
 With no filters: lists all tensors.
 With filters: lists matching tensors. Add --export to save them to txt.
@@ -194,6 +195,20 @@ def list_tensors(tensors: list):
         )
 
 
+def list_args(args_records: list):
+    print(
+        f"{'idx':>6}  {'task_id':>18}  {'s':>1}  {'stage':>15}  {'func':>4}"
+        f"  {'tensors':>7}  {'scalars':>7}  {'overwritten':>11}"
+    )
+    print("-" * 92)
+    for i, rec in enumerate(args_records):
+        print(
+            f"{i:>6}  {rec['task_id']:>18}  {rec['subtask_id']:>1}  {rec['stage']:>15}"
+            f"  {rec['func_id']:>4}  {rec['tensor_count']:>7}  {rec['scalar_count']:>7}"
+            f"  {str(rec.get('overwritten', False)):>11}"
+        )
+
+
 def _resolve_dump_dir(dump_dir_arg: str | None) -> Path:
     if dump_dir_arg is not None:
         return Path(dump_dir_arg)
@@ -267,6 +282,7 @@ def main():
     parser.add_argument("--stage", "-s", help="Filter by stage (before / after)")
     parser.add_argument("--role", "-r", help="Filter by role (input / output / inout)")
     parser.add_argument("--arg", "-a", type=int, help="Filter by arg_index")
+    parser.add_argument("--args", action="store_true", help="List dumped task args instead of tensors")
     parser.add_argument("--index", "-i", type=int, help="Select tensor by index in manifest")
     parser.add_argument("--export", "-e", action="store_true", help="Export filtered tensors to txt")
     args = parser.parse_args()
@@ -283,6 +299,14 @@ def main():
     bin_path = dump_dir / manifest.get("bin_file", "tensors.bin")
     tensors = manifest["tensors"]
 
+    if args.args:
+        args_records = manifest.get("args", [])
+        if not args_records:
+            print("No args records found in manifest.", file=sys.stderr)
+            sys.exit(1)
+        list_args(args_records)
+        return
+
     filtered = _apply_filters(tensors, args)
 
     # --- Select by index ---

diff --git a/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h b/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h
@@ -75,6 +75,58 @@ int32_t count_callable_tensor_args(const CoreCallable &callable);
 bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage);
 bool try_log_tensor_dump_layout_mismatch();
 int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);
+int dump_args_record(int thread_idx, const ArgsDumpInfo &info);  // Forward declaration: called by dump_args_for_payload below
+
+/**
+ * Capture a task's runtime args (tensor descriptors + scalars) and record them.
+ *
+ * Tensor and scalar counts are clamped to [0, MaxTensorArgs] and
+ * [0, MaxScalarArgs]. Each entry's ndims is the number of dims actually stored,
+ * so it never exceeds PLATFORM_DUMP_MAX_DIMS. Scalars are passed by pointer
+ * (payload.scalars); the result of dump_args_record() is not propagated.
+ */
+template <int MaxTensorArgs, int MaxScalarArgs, typename PayloadT>
+inline void dump_args_for_payload(
+    int32_t thread_idx, uint64_t task_id, uint8_t subtask_id, int32_t func_id, const PayloadT &payload,
+    TensorDumpStage stage
+) {
+    int32_t tensor_count = payload.tensor_count;
+    tensor_count = tensor_count < 0 ? 0 : (tensor_count > MaxTensorArgs ? MaxTensorArgs : tensor_count);
+    int32_t scalar_count = payload.scalar_count;
+    scalar_count = scalar_count < 0 ? 0 : (scalar_count > MaxScalarArgs ? MaxScalarArgs : scalar_count);
+
+    ArgsDumpTensorEntry tensor_entries[MaxTensorArgs] = {};  // Zeroed: unused entries/dims stay 0
+    for (int32_t i = 0; i < tensor_count; i++) {
+        const auto &t = payload.tensors[i];
+        ArgsDumpTensorEntry &entry = tensor_entries[i];
+        entry.buffer_addr = t.buffer.addr;
+        entry.buffer_size = t.buffer.size;
+        entry.owner_task_id = t.owner_task_id.raw;
+        entry.dtype = static_cast<uint8_t>(t.dtype);
+        entry.is_contiguous = t.is_raw_eq_shapes ? 1 : 0;
+        entry.is_all_offset_zero = t.is_all_offset_zero ? 1 : 0;
+        const uint32_t *raw_shapes = t.get_raw_shapes();
+        uint32_t d = 0;
+        // Only PLATFORM_DUMP_MAX_DIMS slots exist per entry; stop copying there.
+        for (; d < t.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) {
+            entry.shapes[d] = t.shapes[d];
+            entry.offsets[d] = t.is_all_offset_zero ? 0 : t.offsets[d];
+            entry.raw_shapes[d] = raw_shapes[d];
+        }
+        entry.ndims = d;  // Stored rank, so readers never index past the filled slots
+    }
+
+    ArgsDumpInfo info = {};
+    info.task_id = task_id;
+    info.subtask_id = subtask_id;
+    info.stage = stage;
+    info.func_id = static_cast<uint32_t>(func_id);
+    info.tensor_count = static_cast<uint32_t>(tensor_count);
+    info.scalar_count = static_cast<uint32_t>(scalar_count);
+    info.tensors = tensor_entries;
+    info.scalars = payload.scalars;
+    dump_args_record(thread_idx, info);
+}
 
 template <int MaxSubtaskSlots, typename SlotStateT, typename IsSubtaskActiveFn, typename GetFunctionBinAddrFn>
 inline void dump_tensors_for_task(
@@ -261,6 +313,14 @@ void dump_tensor_init(int num_dump_threads);
  */
 int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);
 
+/**
+ * Record a task's runtime argument descriptors (tensor entries plus scalars).
+ * Copies an ArgsDumpPayloadHeader, the ArgsDumpTensorEntry array, and the raw
+ * scalar values into the existing dump arena, then appends an ARGS metadata
+ * record. NOTE(review): return-code convention is not visible here — confirm.
+ */
+int dump_args_record(int thread_idx, const ArgsDumpInfo &info);
+
 /**
  * Flush remaining tensor dump data for a thread.
  *

diff --git a/src/a2a3/platform/include/common/tensor_dump.h b/src/a2a3/platform/include/common/tensor_dump.h
@@ -70,6 +70,55 @@ enum class TensorDumpStage : uint8_t {
     AFTER_COMPLETION = 1,
 };
 
+// =============================================================================
+// DumpRecordKind - Logical record type carried by the dump channel
+// =============================================================================
+
+enum class DumpRecordKind : uint8_t {  // Discriminates what a dump-channel record describes
+    TENSOR = 0,  // Tensor dump record; zero so it is the default kind
+    ARGS = 1,    // Task argument descriptor record
+};
+
+// =============================================================================
+// Args dump payload schema
+// =============================================================================
+
+constexpr uint32_t ARGS_DUMP_PAYLOAD_VERSION = 1;  // Current args payload schema version
+
+struct ArgsDumpPayloadHeader {   // Leading header of an args payload
+    uint32_t version;            // Schema version; presumably ARGS_DUMP_PAYLOAD_VERSION — confirm in writer
+    uint32_t tensor_count;       // Number of tensor entries in the payload
+    uint32_t scalar_count;       // Number of scalar values in the payload
+    uint32_t tensor_entry_size;  // Bytes per tensor entry; presumably sizeof(ArgsDumpTensorEntry) — confirm
+    uint32_t scalar_entry_size;  // Bytes per scalar; presumably sizeof(uint64_t) — confirm
+    uint32_t reserved;           // Unused; NOTE(review): expected to be zero — confirm in writer
+};
+
+struct ArgsDumpTensorEntry {  // Descriptor of one tensor argument (no tensor data)
+    uint64_t buffer_addr;     // Address of the tensor's buffer
+    uint64_t buffer_size;     // Size of the tensor's buffer
+    uint64_t owner_task_id;   // Raw id of the task that owns the tensor
+    uint32_t shapes[PLATFORM_DUMP_MAX_DIMS];      // Per-dimension view shape
+    uint32_t raw_shapes[PLATFORM_DUMP_MAX_DIMS];  // Per-dimension raw (underlying) shape
+    uint32_t offsets[PLATFORM_DUMP_MAX_DIMS];     // Per-dimension offsets; zeros when is_all_offset_zero
+    uint32_t ndims;              // Tensor rank; at most PLATFORM_DUMP_MAX_DIMS dims are stored above
+    uint8_t dtype;               // Tensor dtype enum value cast to uint8_t
+    uint8_t is_contiguous;       // 1 when the source tensor's raw shape equals its view shape
+    uint8_t is_all_offset_zero;  // 1 when every offset of the source tensor is zero
+    uint8_t reserved;            // Unused padding byte
+};
+
+struct ArgsDumpInfo {       // Input to dump_args_record(): one task's args at one stage
+    uint64_t task_id;       // Task identifier
+    uint8_t subtask_id;     // Subtask identifier within the task
+    TensorDumpStage stage;  // Stage at which the args were observed
+    uint32_t func_id;       // Function id of the dispatched kernel
+    uint32_t tensor_count;  // Number of entries reachable through `tensors`
+    uint32_t scalar_count;  // Number of values reachable through `scalars`
+    const ArgsDumpTensorEntry *tensors;  // Non-owning; must stay valid for the dump_args_record() call
+    const uint64_t *scalars;             // Non-owning pointer to raw scalar values
+};
+
 // =============================================================================
 // TensorDumpRecord - Single Tensor Dump Entry (128B = 2 cache lines)
 // =============================================================================
@@ -92,7 +141,7 @@ struct alignas(64) TensorDumpRecord {
     uint8_t dtype;            // DataType raw enum value
     uint8_t truncated;        // 1 if payload was truncated (tensor > arena capacity)
     uint8_t is_contiguous;    // 1 when source view is already contiguous
-    uint8_t pad0_align;       // Explicit alignment before 64-bit payload offsets
+    uint8_t kind;             // DumpRecordKind; defaults to tensor for legacy records
     uint64_t payload_offset;  // Monotonic byte offset into thread arena
     uint64_t payload_size;    // Bytes actually copied (may be < full tensor bytes)
     uint8_t pad0[24];         // Preserve 64B cache-line layout

diff --git a/src/a2a3/platform/include/host/tensor_dump_collector.h b/src/a2a3/platform/include/host/tensor_dump_collector.h
@@ -176,6 +176,18 @@ struct DumpedTensor {
     std::vector<uint8_t> bytes;
 };
 
+struct DumpedArgs {         // Host-side copy of one collected args record
+    uint64_t task_id;       // Task identifier
+    uint8_t subtask_id;     // Subtask identifier within the task
+    uint32_t func_id;       // Function id of the dispatched kernel
+    TensorDumpStage stage;  // Stage at which the args were observed
+    uint32_t tensor_count;  // Number of tensor descriptors in the record
+    uint32_t scalar_count;  // Number of scalar values in the record
+    uint64_t payload_size;  // Payload size in bytes
+    bool overwritten;       // NOTE(review): presumably set when the payload was overwritten before collection — confirm
+    std::vector<uint8_t> bytes;  // NOTE(review): presumably the raw args payload bytes — confirm
+};
+
 class TensorDumpCollector : public profiling_common::ProfilerBase<TensorDumpCollector, DumpModule> {
 public:
     TensorDumpCollector() = default;
@@ -283,6 +295,7 @@ class TensorDumpCollector : public profiling_common::ProfilerBase<TensorDumpColl
 
     // Collected dump tensors
     std::vector<DumpedTensor> collected_;
+    std::vector<DumpedArgs> collected_args_;
     std::mutex collected_mutex_;
 
     // Stats