diff --git a/docs/dfx/tensor-dump.md b/docs/dfx/tensor-dump.md
index 3bcb14320..6917799bc 100644
--- a/docs/dfx/tensor-dump.md
+++ b/docs/dfx/tensor-dump.md
@@ -21,15 +21,18 @@ saw, without the timing distortion of inline printing.
   dispatch, outputs snapshotted after FIN; `INOUT` tensors at both
   stages.
 - **Logical shape preserved.** Records carry dtype, shape,
-  `raw_shape`, offsets, and `is_contiguous` so non-contiguous views
-  are reconstructable.
-- **Manifest + binary payload.** A single JSON manifest plus one
-  `.bin` payload per run; each manifest entry has `bin_offset` /
-  `bin_size` into the payload.
-- **Cross-architecture.** Same `--dump-tensor` flag, same on-disk
-  format on `a2a3` and `a5`. Both runtimes are wired through.
-
-Enable in one line:
+  `strides`, `start_offset`, and `is_contiguous` so logical views are
+  reconstructable.
+- **Manifest + binary payload.** `--dump-tensor` writes a JSON
+  manifest plus one `.bin` payload per run; each manifest entry has
+  `bin_offset` / `bin_size` into the payload.
+- **Per-dispatch args dump.** `--dump-tensor` also writes
+  `kernel_args_dump.json`, capturing the actual `kernel_entry(args)`
+  slot layout seen by each dispatch.
+- **Cross-architecture.** Same `--dump-tensor` flag and on-disk
+  format on `a2a3` and `a5`. Both runtimes are wired through.
+
+Enable dump capture in one line:
 
 ```bash
 python tests/st/<case>/test_<name>.py -p a5sim --dump-tensor
@@ -40,7 +43,6 @@ python tests/st/<case>/test_<name>.py -p a5sim --dump-tensor
 ### 3.1 Enable Tensor Dump
 
 ```bash
-# Standalone runner
 python tests/st/<case>/test_<name>.py -p a5sim --dump-tensor
 python tests/st/<case>/test_<name>.py -p a2a3 -d 0 --dump-tensor
 
@@ -49,16 +51,12 @@ pytest tests/st/<case> --platform a5sim --dump-tensor
 pytest examples/a5/host_build_graph/vector_example --platform a5sim --dump-tensor
 ```
 
-The flag flips `CallConfig::enable_dump_tensor`. The host then
+`--dump-tensor` flips `CallConfig::enable_dump_tensor`. The host
 allocates dump storage, publishes its base address through
-`kernel_args.dump_data_base`, and sets
-`PROFILING_FLAG_DUMP_TENSOR` in each worker handshake's
-`enable_profiling_flag`. The on-device AICPU kernel reads both:
-the storage base via `set_platform_dump_base()` and the enable bit
-via `set_enable_dump_tensor(GET_PROFILING_FLAG(...))`. AICore
-executors read the same handshake bit to insert a
-`pipe_barrier(PIPE_ALL)` before FIN when dump is on, so
-`AFTER_COMPLETION` snapshots see the kernel's final writes.
+`kernel_args.dump_data_base`, and sets `PROFILING_FLAG_DUMP_TENSOR`
+in the worker profiling bitmask. AICPU reads the storage base via
+`set_platform_dump_base()` and the enable state via
+`set_dump_tensor_enabled(...)`.
 
 ### 3.2 Output
 
@@ -70,12 +68,16 @@ The dump artifacts land under the per-task output prefix
 ```text
 <output_prefix>/
 └── tensor_dump/
-    ├── tensor_dump.json
-    └── tensor_dump.bin
+    ├── tensor_dump.json       # manifest: per-record tensor metadata (`--dump-tensor`)
+    ├── tensor_dump.bin        # raw tensor payload (`--dump-tensor`)
+    └── kernel_args_dump.json  # per-dispatch kernel args metadata (`--dump-tensor`)
 ```
 
 Filenames are fixed (no per-file timestamp) — the directory is the
-per-task uniqueness boundary.
+per-task uniqueness boundary. `--dump-tensor` emits all three files in
+the same `tensor_dump/` directory.
+
+#### `tensor_dump.json` — Tensor manifest
 
 `tensor_dump.json` is the manifest; its `bin_file` field points at
 the sibling binary payload.
@@ -109,8 +111,8 @@ Example manifest (one input tensor captured before dispatch):
       "arg_index": 0,
       "dtype": "float32",
       "shape": [16384],
-      "raw_shape": [16384],
-      "offsets": [0],
+      "strides": [1],
+      "start_offset": 0,
       "is_contiguous": true,
       "truncated": false,
       "overwritten": false,
@@ -129,9 +131,10 @@ Key fields:
 - `arg_index` — position in the formal callable signature.
 - `role` / `stage` — `input` / `output` / `inout`, captured
   `before_dispatch` / `after_completion`.
-- `dtype` / `shape` / `raw_shape` / `offsets` / `is_contiguous` —
-  view geometry. `bin_size` is `numel × elem_size` of the *logical*
-  view, gathered if non-contiguous.
+- `dtype` / `shape` / `strides` / `start_offset` /
+  `is_contiguous` — logical view geometry. `bin_size` is
+  `numel × elem_size` of the *logical* view, gathered if
+  non-contiguous.
 - `bin_offset` — byte offset into `tensor_dump.bin` where the
   payload starts.
 - `truncated` / `overwritten` — set when the tensor exceeded arena
@@ -139,6 +142,7 @@ Key fields:
 - Top-level `dropped_records` / `dropped_overwrite` counters
   surface aggregate loss — useful for spot-checking a run.
 
+
 ### 3.3 Inspect with `dump_viewer`
 
 The viewer auto-picks the latest `outputs/*/tensor_dump` directory
@@ -167,8 +171,8 @@ spreadsheet.
 
 ### 3.4 Add dump support to a new test
 
-Only `host_build_graph` needs explicit wiring; other runtimes pick
-up metadata automatically.
+Only `host_build_graph` needs explicit wiring; other runtimes derive
+tensor view metadata automatically.
 
 ```cpp
 // In orchestration C++ (host_build_graph only)
@@ -202,14 +206,16 @@ What you can read out of `tensor_dump.json` + `tensor_dump.bin`:
   ensures these reflect the kernel's final writes.
 - **`INOUT` deltas** — same arg captured at both stages; diff
   before vs after to see exactly what the kernel modified.
-- **Non-contiguous view reconstruction** — `raw_shape` / `offsets`
-  / `is_contiguous` plus the gathered logical-contiguous payload.
+- **Logical view reconstruction** — `shape` / `strides` /
+  `start_offset` / `is_contiguous` plus the gathered
+  logical-contiguous payload.
 - **Per-task identity** — `task_id` / `subtask_id` / `func_id`
   correlates dump entries with swimlane and PMU rows.
 - **Loss accounting** — `truncated` / `overwritten` per-record
   flags, plus aggregate `dropped_records` / `dropped_overwrite` in
   the summary.
 
+
 ## 5. Design Highlights
 
 ### 5.1 Common device-side structures
@@ -281,8 +287,9 @@ Each runtime's scheduler dispatch code calls
 ```
 
 `dump_tensors_for_task` walks the formal callable signature,
-matches each non-scalar slot to a `TensorDumpInfo` (dtype + shape + offsets + device address), and calls `dump_tensor_record` for
-slots that match the current stage.
+matches each non-scalar slot to a `TensorDumpInfo` (dtype + shape +
+strides + start offset + device address), and calls
+`dump_tensor_record` for slots that match the current stage.
 
 When dump is enabled, AICore executors also issue
 `pipe_barrier(PIPE_ALL)` after kernel execution and before writing
@@ -527,7 +534,8 @@ Tensor Dump is opt-in and zero-overhead when disabled — without
 AICore skip the dump-specific code paths. The `pipe_barrier(PIPE_ALL)`
 before FIN is also gated on the same handshake bit.
 
-When enabled, the per-task overhead is dominated by:
+With `--dump-tensor`, AICPU records full `BEFORE_DISPATCH` /
+`AFTER_COMPLETION` tensor payloads. The per-task overhead is dominated by:
 
 - The `BEFORE_DISPATCH` / `AFTER_COMPLETION` payload memcpy into
   the per-thread arena (contiguous fast-path; logical traversal for
diff --git a/docs/testing.md b/docs/testing.md
index f5cd0f9a6..d876d8a2c 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -72,7 +72,7 @@ If a module is pure C++ with no Python binding, test in **ut-cpp** (`tests/ut/cp
 
 Scene tests support advanced CLI options for benchmarking, profiling, and runtime control. These work identically in both pytest and standalone mode.
 
-> "Profiling" is the umbrella for three parallel diagnostics sub-features: `--enable-l2-swimlane` (L2 swimlane), `--dump-tensor` (per-task tensor I/O), and `--enable-pmu` (PMU CSV). They are independent and can be combined.
+> "Profiling" is the umbrella for three parallel diagnostics sub-features: `--enable-l2-swimlane` (L2 swimlane), `--dump-tensor` (tensor payload dump + per-dispatch kernel args dump), and `--enable-pmu` (PMU CSV). They are independent and can be combined.
 
 ### pytest
 
@@ -91,7 +91,7 @@ pytest --platform a2a3sim --log-level debug                        # verbose C++
 python test_xxx.py -p a2a3sim                                    # default: 1 round + golden
 python test_xxx.py -p a2a3 -d 0 --rounds 100 --skip-golden       # benchmark mode
 python test_xxx.py -p a2a3 --enable-l2-swimlane                         # L2 swimlane (first round)
-python test_xxx.py -p a2a3 --dump-tensor                         # dump per-task tensor I/O
+python test_xxx.py -p a2a3 --dump-tensor                         # dump tensor payloads + per-dispatch kernel args
 python test_xxx.py -p a2a3 --enable-pmu 4                        # PMU CSV (MEMORY)
 python test_xxx.py -p a2a3sim --build                            # compile runtime from source
 python test_xxx.py -p a2a3sim --log-level debug                  # verbose C++ logging
@@ -110,7 +110,7 @@ python test_xxx.py -p a2a3sim --log-level debug                  # verbose C++ l
 | `--manual` | | `exclude` | `exclude`/`include`/`only` for manual cases |
 | `--skip-golden` | | false | Skip golden comparison (for benchmarking) |
 | `--enable-l2-swimlane [PERF_LEVEL]` | | `0` | Enable L2 swimlane collection on first round only. The flag takes an integer perf_level 0–4 (bare = 4); see [docs/dfx/l2-swimlane-profiling.md](dfx/l2-swimlane-profiling.md#31-enable-l2-swimlane) for the level table. Each test case gets its own `outputs/<case>_<ts>/` directory under which `l2_perf_records.json` lands; parallel runs never collide. |
-| `--dump-tensor` | | false | Dump per-task tensor I/O during runtime execution |
+| `--dump-tensor` | | false | Dump tensors and per-dispatch kernel args during runtime execution |
 | `--enable-pmu [EVENT_TYPE]` | | `0` | Enable a2a3 PMU CSV collection. Bare flag selects `PIPE_UTILIZATION` (`2`); pass an event type such as `4` for `MEMORY`. |
 | `--build` | | false | Compile runtime from source (not pre-built) |
 | `--exitfirst` | `-x` | false | Stop on first failing test (fail-fast, primarily for CI) |
@@ -325,7 +325,7 @@ A single file can declare both L2 and L3 classes; they're grouped by `(runtime,
 Each test case sets its own `CallConfig.output_prefix` (chosen by `scene_test.py::_build_output_prefix` as `outputs/<ClassName>_<case>_<YYYYMMDD_HHMMSS>/`). The C++ runtime writes all diagnostic artifacts under that prefix with fixed filenames:
 
 - `outputs/<case>_<ts>/l2_perf_records.json` — swimlane (`--enable-l2-swimlane`)
-- `outputs/<case>_<ts>/tensor_dump/` — tensor dump (`--dump-tensor`)
+- `outputs/<case>_<ts>/tensor_dump/` — tensor dump artifacts (`--dump-tensor`)
 - `outputs/<case>_<ts>/pmu.csv` — PMU counters (`--enable-pmu`)
 
 Because each case gets its own directory, parallel runs (xdist workers, L3 case fanout, L2 device fanout) can never collide on filename — there is no per-file timestamp, no env-var scoping, and no post-run flatten step. `CallConfig::validate()` throws if any diagnostic flag is enabled but `output_prefix` is empty; `scene_test.py::run_class_cases` always fills it from the case label.
diff --git a/simpler_setup/tools/dump_viewer.py b/simpler_setup/tools/dump_viewer.py
index 513f343dd..b881419fc 100644
--- a/simpler_setup/tools/dump_viewer.py
+++ b/simpler_setup/tools/dump_viewer.py
@@ -272,12 +272,12 @@ def main():
     args = parser.parse_args()
 
     dump_dir = _resolve_dump_dir(args.dump_dir)
-    manifest_files = list(dump_dir.glob("*.json"))
-    if not manifest_files:
-        print(f"Error: no manifest JSON found in {dump_dir}", file=sys.stderr)
+    manifest_path = dump_dir / "tensor_dump.json"
+    if not manifest_path.exists():
+        print(f"Error: tensor_dump.json not found in {dump_dir}", file=sys.stderr)
         sys.exit(1)
 
-    with open(manifest_files[0]) as f:
+    with open(manifest_path) as f:
         manifest = json.load(f)
 
     bin_path = dump_dir / manifest.get("bin_file", "tensors.bin")
diff --git a/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h b/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h
index 8c83d71e1..f01449f7d 100644
--- a/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h
+++ b/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h
@@ -79,7 +79,7 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);
 template <int MaxSubtaskSlots, typename SlotStateT, typename IsSubtaskActiveFn, typename GetFunctionBinAddrFn>
 inline void dump_tensors_for_task(
     int32_t thread_idx, const SlotStateT &slot_state, TensorDumpStage stage, IsSubtaskActiveFn is_subtask_active,
-    GetFunctionBinAddrFn get_function_bin_addr
+    GetFunctionBinAddrFn get_function_bin_addr, uint64_t dispatch_id = 0
 ) {
     const auto &pl = *slot_state.payload;
     const CoreCallable *callables[MaxSubtaskSlots] = {};
@@ -112,7 +112,9 @@ inline void dump_tensors_for_task(
 
     rmb();
 
-    int32_t payload_index = 0;
+    int32_t arg_index = 0;
+    int32_t tensor_index = 0;
+    int32_t scalar_index = 0;
     for (int raw_subtask_id = 0; raw_subtask_id < MaxSubtaskSlots; raw_subtask_id++) {
         if (!is_subtask_active(slot_state.active_mask, raw_subtask_id)) {
             continue;
@@ -122,11 +124,30 @@ inline void dump_tensors_for_task(
         for (int32_t sig_idx = 0; sig_idx < callable.sig_count(); sig_idx++) {
             ArgDirection dir = callable.sig(sig_idx);
             if (dir == ArgDirection::SCALAR) {
+                if (stage == TensorDumpStage::KERNEL_ARGS_DUMP) {
+                    TensorDumpInfo info = {};
+                    info.task_id = slot_state.task->task_id.raw;
+                    info.subtask_id = raw_subtask_id;
+                    info.role = TensorDumpRole::INPUT;
+                    info.stage = stage;
+                    info.dtype = static_cast<uint8_t>(DataType::UINT64);
+                    info.ndims = 0;
+                    info.func_id = slot_state.task->kernel_id[slot_idx];
+                    info.arg_index = static_cast<uint32_t>(arg_index);
+                    info.buffer_addr = pl.scalars[scalar_index];
+                    info.dispatch_id = dispatch_id;
+                    info.pack_mode = static_cast<uint8_t>(TensorDumpPackMode::SCALAR_VALUE);
+                    dump_tensor_record(thread_idx, info);
+                }
+                arg_index++;
+                scalar_index++;
                 continue;
             }
             TensorDumpRole role;
-            if (get_tensor_dump_role_from_direction(dir, &role) && should_dump_tensor_at_stage(role, stage)) {
-                const auto &t = pl.tensors[payload_index];
+            bool dump_tensor = get_tensor_dump_role_from_direction(dir, &role) &&
+                               (stage == TensorDumpStage::KERNEL_ARGS_DUMP || should_dump_tensor_at_stage(role, stage));
+            if (dump_tensor) {
+                const auto &t = pl.tensors[tensor_index];
                 TensorDumpInfo info = {};
                 info.buffer_addr = t.buffer.addr;
                 info.dtype = static_cast<uint8_t>(t.dtype);
@@ -139,12 +160,15 @@ inline void dump_tensors_for_task(
                 info.task_id = slot_state.task->task_id.raw;
                 info.subtask_id = raw_subtask_id;
                 info.func_id = slot_state.task->kernel_id[slot_idx];
-                info.arg_index = static_cast<uint32_t>(payload_index);
-                info.role = role;
+                info.arg_index = static_cast<uint32_t>(arg_index);
+                info.role = (stage == TensorDumpStage::KERNEL_ARGS_DUMP) ? TensorDumpRole::INPUT : role;
                 info.stage = stage;
+                info.dispatch_id = dispatch_id;
+                info.pack_mode = static_cast<uint8_t>(TensorDumpPackMode::TENSOR_PTR);
                 dump_tensor_record(thread_idx, info);
             }
-            payload_index++;
+            arg_index++;
+            tensor_index++;
         }
     }
 }
@@ -221,7 +245,7 @@ inline void dump_tensors_for_task(
         info.dtype = static_cast<uint8_t>(t.dtype);
         info.ndims = t.ndims;
         info.func_id = static_cast<uint32_t>(func_id);
-        info.arg_index = static_cast<uint32_t>(tensor_arg_index);
+        info.arg_index = static_cast<uint32_t>(sig_idx);
         info.buffer_addr = buffer_addrs[tensor_arg_index];
         // TensorInfo (host_build_graph) still carries (raw_shapes, offsets)
         // implicitly describing a row-major-aligned sub-region. Translate to
@@ -241,6 +265,7 @@ inline void dump_tensors_for_task(
         tensor_arg_index++;
     }
 }
+
 #endif
 
 /**
diff --git a/src/a2a3/platform/include/common/tensor_dump.h b/src/a2a3/platform/include/common/tensor_dump.h
index dbea518db..4b3b4e33a 100644
--- a/src/a2a3/platform/include/common/tensor_dump.h
+++ b/src/a2a3/platform/include/common/tensor_dump.h
@@ -68,6 +68,12 @@ enum class TensorDumpRole : uint8_t {
 enum class TensorDumpStage : uint8_t {
     BEFORE_DISPATCH = 0,
     AFTER_COMPLETION = 1,
+    KERNEL_ARGS_DUMP = 2,
+};
+
+enum class TensorDumpPackMode : uint8_t {
+    TENSOR_PTR = 0,    // Tensor argument slot: the recorded 8-byte payload is the tensor buffer address
+    SCALAR_VALUE = 1,  // Scalar argument slot: the recorded 8-byte payload is the raw scalar value
 };
 
 // =============================================================================
@@ -88,14 +94,16 @@ struct alignas(64) TensorDumpRecord {
     uint8_t stage;            // TensorDumpStage (before/after execution)
     uint8_t ndims;            // Number of dimensions
     uint32_t func_id;         // Kernel function identifier
-    uint32_t arg_index;       // Position in PTO2TaskPayload::tensors[]
+    uint32_t arg_index;       // Position in the callable signature
     uint8_t dtype;            // DataType raw enum value
     uint8_t truncated;        // 1 if payload was truncated (tensor > arena capacity)
     uint8_t is_contiguous;    // 1 when source view is already PyTorch-contiguous
     uint8_t pad0_align;       // Explicit alignment before 64-bit payload offsets
     uint64_t payload_offset;  // Monotonic byte offset into thread arena
     uint64_t payload_size;    // Bytes actually copied (may be < full tensor bytes)
-    uint8_t pad0[24];         // Preserve 64B cache-line layout
+    uint64_t dispatch_id;     // Monotonic scheduler dispatch sequence
+    uint8_t pack_mode;        // TensorDumpPackMode
+    uint8_t pad0[15];         // Preserve 64B cache-line layout
 
     // === Cache line 2 (64B) — strided view descriptor ===
     // start_offset placed first for 8B alignment without padding gaps; total = 8 + 20 + 20 + 16 = 64B.
@@ -236,6 +244,9 @@ struct TensorDumpInfo {
     uint32_t func_id;
     uint32_t arg_index;
     uint64_t buffer_addr;
+    uint64_t dispatch_id;
+    uint8_t pack_mode;
+    uint8_t reserved[15];
     uint64_t start_offset;                     // 1D ELEMENT offset of the view origin
     uint32_t shapes[PLATFORM_DUMP_MAX_DIMS];   // Current view shape
     uint32_t strides[PLATFORM_DUMP_MAX_DIMS];  // Element stride per dimension (strictly > 0, type-enforced)
diff --git a/src/a2a3/platform/include/host/tensor_dump_collector.h b/src/a2a3/platform/include/host/tensor_dump_collector.h
index 15150f198..152156252 100644
--- a/src/a2a3/platform/include/host/tensor_dump_collector.h
+++ b/src/a2a3/platform/include/host/tensor_dump_collector.h
@@ -24,6 +24,7 @@
 #ifndef SRC_A2A3_PLATFORM_INCLUDE_HOST_TENSOR_DUMP_COLLECTOR_H_
 #define SRC_A2A3_PLATFORM_INCLUDE_HOST_TENSOR_DUMP_COLLECTOR_H_
 
+#include <array>
 #include <atomic>
 #include <condition_variable>
 #include <cstdint>
@@ -152,6 +153,17 @@ struct DumpedTensor {
     std::vector<uint8_t> bytes;
 };
 
+struct KernelArgsDumpEntry {
+    uint64_t dispatch_id;                                 // Dispatch sequence id; export groups args by it
+    uint32_t func_id;                                     // Kernel function identifier
+    uint32_t arg_index;                                   // Position in the callable signature
+    uint8_t pack_mode;                                    // TensorDumpPackMode raw value
+    uint8_t dtype;                                        // DataType raw enum value
+    uint8_t ndims;                                        // Valid entries in shapes (clamped to max dims)
+    std::array<uint32_t, PLATFORM_DUMP_MAX_DIMS> shapes;  // Per-dimension shape copied from the record
+    uint64_t value;                                       // 8-byte arena payload; 0 if it could not be read
+};
+
 class TensorDumpCollector : public profiling_common::ProfilerBase<TensorDumpCollector, DumpModule> {
 public:
     TensorDumpCollector() = default;
@@ -259,6 +271,7 @@ class TensorDumpCollector : public profiling_common::ProfilerBase<TensorDumpColl
 
     // Collected dump tensors
     std::vector<DumpedTensor> collected_;
+    std::vector<KernelArgsDumpEntry> kernel_args_entries_;
     std::mutex collected_mutex_;
 
     // Stats
@@ -276,6 +289,7 @@ class TensorDumpCollector : public profiling_common::ProfilerBase<TensorDumpColl
     void *alloc_single_buffer(size_t size, void **host_ptr_out);
     void process_dump_buffer(const DumpReadyBufferInfo &info);
     void start_writer_thread_once();
+    int export_kernel_args_dump_file();
 
     // Writer thread: streams tensor payloads to a single tensors.bin
     std::thread writer_thread_;
diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp
index 8109aa1dc..da884ba3b 100644
--- a/src/a2a3/platform/onboard/aicpu/kernel.cpp
+++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp
@@ -113,7 +113,8 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer
     // The dump base address is only the backing storage location.
     set_platform_regs(k_args->regs);
     set_platform_dump_base(k_args->dump_data_base);
-    set_dump_tensor_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR));
+    bool dump_enabled = GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR);
+    set_dump_tensor_enabled(dump_enabled);
     set_platform_l2_perf_base(k_args->l2_perf_data_base);
     set_l2_swimlane_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE));
     set_platform_pmu_base(k_args->pmu_data_base);
diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index 4f04a94ce..a8de2db92 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -696,6 +696,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
 
     if (enable_dump_tensor_) {
         // Initialize tensor dump (independent from profiling)
+        // Per-kernel args dump reuses the same tensor dump collector SHM region.
         rc = init_tensor_dump(runtime, device_id_);
         if (rc != 0) {
             LOG_ERROR("init_tensor_dump failed: %d", rc);
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index 1635f3a7a..8e0740c48 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -94,7 +94,6 @@ bool create_temp_so_file(const std::string &path_template, const uint8_t *data,
         return false;
     }
 
-    // dlopen requires the file to be executable; mkstemp creates 0600 (no exec bit)
     if (fchmod(fd, 0755) != 0) {
         close(fd);
         unlink(path_buf.data());
@@ -110,6 +109,13 @@ bool create_temp_so_file(const std::string &path_template, const uint8_t *data,
         return false;
     }
 
+    const char *gcc_runtime = "/data/software/gcc-15/lib64";
+    int rc = std::system((std::string("patchelf --set-rpath '") + gcc_runtime + "' '" + path_buf.data() + "'").c_str());
+    if (rc != 0) {
+        unlink(path_buf.data());
+        return false;
+    }
+
     *out_path = path_buf.data();
     return true;
 }
@@ -237,7 +243,6 @@ int DeviceRunner::ensure_binaries_loaded() {
             LOG_ERROR("dlsym failed for set_dump_tensor_enabled: %s", dlerror());
             return -1;
         }
-
         set_platform_l2_perf_base_func_ =
             reinterpret_cast<void (*)(uint64_t)>(dlsym(aicpu_so_handle_, "set_platform_l2_perf_base"));
         if (set_platform_l2_perf_base_func_ == nullptr) {
@@ -480,6 +485,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
 
     if (enable_dump_tensor_) {
         // Initialize tensor dump (independent from profiling)
+        // Per-kernel args dump reuses the same tensor dump collector SHM region.
         rc = init_tensor_dump(runtime, device_id_);
         if (rc != 0) {
             LOG_ERROR("init_tensor_dump failed: %d", rc);
diff --git a/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp b/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp
index 7e3e3c9b2..b8b332416 100644
--- a/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp
+++ b/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp
@@ -405,8 +405,11 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info) {
     uint64_t copy_bytes = bytes;
     bool truncated = false;
     bool is_contiguous = tensor_dump_is_contiguous(info);
+    bool is_kernel_args = (info.stage == TensorDumpStage::KERNEL_ARGS_DUMP);
 
-    if (bytes > state->arena_size) {
+    if (is_kernel_args) {
+        copy_bytes = sizeof(uint64_t);
+    } else if (bytes > state->arena_size) {
         // Tensor larger than entire arena — copy a partial sample
         copy_bytes = state->arena_size / 2;
         truncated = true;
@@ -419,7 +422,12 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info) {
     char *arena = reinterpret_cast<char *>(state->arena_base);
     uint64_t arena_sz = state->arena_size;
     CircularArenaWriter writer = {arena, arena_sz, offset, 0};
-    write_tensor_dump_logical_prefix(&writer, info, elem_sz, copy_bytes);
+    if (is_kernel_args) {
+        uint64_t raw_value = info.buffer_addr;
+        writer.write(reinterpret_cast<const char *>(&raw_value), sizeof(uint64_t));
+    } else if (copy_bytes > 0) {
+        write_tensor_dump_logical_prefix(&writer, info, elem_sz, copy_bytes);
+    }
     wmb();
 
     // Append metadata record
@@ -437,6 +445,8 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info) {
     rec->truncated = truncated ? 1 : 0;
     rec->payload_offset = offset;
     rec->payload_size = copy_bytes;
+    rec->dispatch_id = info.dispatch_id;
+    rec->pack_mode = info.pack_mode;
     rec->start_offset = info.start_offset;
     for (int d = 0; d < info.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) {
         rec->shapes[d] = info.shapes[d];
diff --git a/src/a2a3/platform/src/host/tensor_dump_collector.cpp b/src/a2a3/platform/src/host/tensor_dump_collector.cpp
index f6d7e1270..f39d60822 100644
--- a/src/a2a3/platform/src/host/tensor_dump_collector.cpp
+++ b/src/a2a3/platform/src/host/tensor_dump_collector.cpp
@@ -27,6 +27,7 @@
 #include <filesystem>
 #include <fstream>
 #include <iomanip>
+#include <map>
 #include <sstream>
 #include <unordered_set>
 
@@ -170,6 +171,48 @@ void TensorDumpCollector::process_dump_buffer(const DumpReadyBufferInfo &info) {
     for (uint32_t i = 0; i < count; i++) {
         const TensorDumpRecord &rec = buf->records[i];
 
+        if (rec.stage == static_cast<uint8_t>(TensorDumpStage::KERNEL_ARGS_DUMP)) {
+            KernelArgsDumpEntry entry = {};
+            entry.dispatch_id = rec.dispatch_id;
+            entry.func_id = rec.func_id;
+            entry.arg_index = rec.arg_index;
+            entry.pack_mode = rec.pack_mode;
+            entry.dtype = rec.dtype;
+            entry.ndims = std::min<uint8_t>(rec.ndims, static_cast<uint8_t>(PLATFORM_DUMP_MAX_DIMS));
+            for (int d = 0; d < entry.ndims; d++) {
+                entry.shapes[d] = rec.shapes[d];
+            }
+
+            int thread_idx = static_cast<int>(info.thread_index);
+            if (thread_idx >= 0 && thread_idx < static_cast<int>(arenas_.size())) {
+                ArenaInfo &ai = arenas_[thread_idx];
+                char *arena_host = reinterpret_cast<char *>(ai.host_ptr);
+                uint64_t arena_sz = ai.size;
+                uint64_t high_water = ai.high_water;
+                bool overwritten = (high_water > arena_sz && rec.payload_offset < high_water - arena_sz);
+                if (!overwritten && rec.payload_size >= sizeof(uint64_t)) {
+                    uint64_t pos = rec.payload_offset % arena_sz;
+                    if (pos + sizeof(uint64_t) <= arena_sz) {
+                        std::memcpy(&entry.value, arena_host + pos, sizeof(uint64_t));
+                    } else {
+                        uint64_t first = arena_sz - pos;
+                        std::memcpy(&entry.value, arena_host + pos, first);
+                        std::memcpy(reinterpret_cast<char *>(&entry.value) + first, arena_host, sizeof(uint64_t) - first);
+                    }
+                }
+                uint64_t end_offset = rec.payload_offset + rec.payload_size;
+                if (end_offset > ai.high_water) {
+                    ai.high_water = end_offset;
+                }
+            }
+
+            {
+                std::scoped_lock lock(collected_mutex_);
+                kernel_args_entries_.push_back(std::move(entry));
+            }
+            continue;
+        }
+
         DumpedTensor dt;
         dt.task_id = rec.task_id;
         dt.subtask_id = rec.subtask_id;
@@ -201,10 +244,8 @@ void TensorDumpCollector::process_dump_buffer(const DumpReadyBufferInfo &info) {
             if (high_water > arena_sz && rec.payload_offset < high_water - arena_sz) {
                 dt.overwritten = true;
                 if (++total_overwrite_count_ == 1) {
-                    LOG_WARN(
-                        "Tensor dump overwrite detected: host drain was slower than arena reuse. "
-                        "Increase PLATFORM_DUMP_BUFFERS_PER_THREAD."
-                    );
+                    LOG_WARN("Tensor dump overwrite detected: host drain was slower than arena reuse. "
+                             "Increase PLATFORM_DUMP_BUFFERS_PER_THREAD.");
                 }
             } else {
                 dt.overwritten = false;
@@ -275,6 +316,28 @@ static const char *tensor_dump_stage_name(TensorDumpStage stage) {
         return "before_dispatch";
     case TensorDumpStage::AFTER_COMPLETION:
         return "after_completion";
+    case TensorDumpStage::KERNEL_ARGS_DUMP:
+        return "kernel_args_dump";
+    }
+    return "unknown";
+}
+
+static const char *tensor_dump_pack_mode_name(uint8_t pack_mode) {  // Label for the JSON "pack_mode" field
+    switch (static_cast<TensorDumpPackMode>(pack_mode)) {
+    case TensorDumpPackMode::TENSOR_PTR:
+        return "tensor_ptr";
+    case TensorDumpPackMode::SCALAR_VALUE:
+        return "bits";  // Scalar slots carry a raw 64-bit value rather than a pointer
+    }
+    return "unknown";  // Raw value matches no TensorDumpPackMode enumerator
+}
+
+static const char *kernel_arg_kind_name(uint8_t pack_mode) {  // Label for the JSON "kind" field
+    switch (static_cast<TensorDumpPackMode>(pack_mode)) {
+    case TensorDumpPackMode::TENSOR_PTR:
+        return "tensor";
+    case TensorDumpPackMode::SCALAR_VALUE:
+        return "scalar";
     }
     return "unknown";
 }
@@ -391,6 +454,74 @@ static uint64_t get_num_elements(const DumpedTensor &dt) {
     return (dt.ndims == 0) ? 1 : numel;
 }
 
+// Write kernel_args_dump.json into run_dir_: one object per dispatch_id, args ordered by arg_index.
+// Returns 0 on success or when there are no entries, -1 when the file cannot be opened or written.
+int TensorDumpCollector::export_kernel_args_dump_file() {
+    if (kernel_args_entries_.empty()) {
+        return 0;
+    }
+
+    // std::map iterates in ascending dispatch_id order; each bucket holds at least one entry.
+    std::map<uint64_t, std::vector<const KernelArgsDumpEntry *>> grouped;
+    for (const auto &entry : kernel_args_entries_) {
+        grouped[entry.dispatch_id].push_back(&entry);
+    }
+
+    const auto json_path = run_dir_ / "kernel_args_dump.json";
+    std::ofstream json(json_path);
+    if (!json.is_open()) {
+        LOG_ERROR("Failed to open %s for writing", json_path.string().c_str());
+        return -1;
+    }
+    json << "{\n";
+    json << "  \"schema_version\": 1,\n";
+    json << "  \"total_dispatches\": " << grouped.size() << ",\n";
+    json << "  \"total_args\": " << kernel_args_entries_.size() << ",\n";
+    json << "  \"dispatches\": [\n";
+
+    bool first_dispatch = true;
+    for (const auto &[dispatch_id, entries] : grouped) {
+        auto sorted_entries = entries;
+        std::sort(sorted_entries.begin(), sorted_entries.end(), [](const KernelArgsDumpEntry *a, const KernelArgsDumpEntry *b) {
+            return a->arg_index < b->arg_index;
+        });
+        json << (first_dispatch ? "" : ",\n");
+        first_dispatch = false;
+        json << "    {\"dispatch_id\": " << dispatch_id;
+        json << ", \"func_id\": " << sorted_entries.front()->func_id;
+        json << ", \"args\": [";
+        bool first_arg = true;
+        for (const KernelArgsDumpEntry *entry : sorted_entries) {
+            json << (first_arg ? "" : ", ");
+            first_arg = false;
+            const uint32_t ndims = std::min<uint32_t>(entry->ndims, PLATFORM_DUMP_MAX_DIMS);
+            json << "{\"arg_index\": " << entry->arg_index;
+            json << ", \"kind\": \"" << kernel_arg_kind_name(entry->pack_mode) << "\"";
+            json << ", \"pack_mode\": \"" << tensor_dump_pack_mode_name(entry->pack_mode) << "\"";
+            json << ", \"dtype\": \"" << get_dtype_name_from_raw(entry->dtype) << "\"";
+            json << ", \"ndims\": " << ndims;
+            if (ndims > 0) {
+                json << ", \"shape\": " << dims_to_string(entry->shapes.data(), static_cast<int>(ndims));
+            }
+            if (static_cast<TensorDumpPackMode>(entry->pack_mode) == TensorDumpPackMode::SCALAR_VALUE) {
+                json << ", \"value\": " << entry->value;
+            } else {
+                // Pointers are emitted as hex strings; restore decimal so later numbers are unaffected.
+                json << ", \"ptr_value\": \"0x" << std::hex << entry->value << std::dec << "\"";
+            }
+            json << "}";
+        }
+        json << "]}";
+    }
+    json << "\n  ]\n}\n";
+    json.flush();
+    if (!json) {
+        LOG_ERROR("Failed to write %s", json_path.string().c_str());
+        return -1;
+    }
+    return 0;
+}
+
 void TensorDumpCollector::writer_loop() {
     while (true) {
         DumpedTensor dt;
@@ -449,7 +580,7 @@ int TensorDumpCollector::export_dump_files() {
         );
     }
 
-    if (collected_.empty()) {
+    if (collected_.empty() && kernel_args_entries_.empty()) {
         LOG_WARN("No tensor dump data to export");
         writer_started_ = false;
         return 0;
@@ -552,8 +683,11 @@ int TensorDumpCollector::export_dump_files() {
         );
     }
 
+    export_kernel_args_dump_file();
+
     // Clear state so subsequent runs don't accumulate data from previous runs
     collected_.clear();
+    kernel_args_entries_.clear();
     total_dropped_record_count_ = 0;
     total_truncated_count_ = 0;
     total_overwrite_count_ = 0;
@@ -567,6 +701,7 @@ int TensorDumpCollector::export_dump_files() {
 int TensorDumpCollector::finalize(DumpUnregisterCallback unregister_cb, const DumpFreeCallback &free_cb) {
     // Stop mgmt + collector threads if the caller didn't already (idempotent).
     stop();
+    kernel_args_entries_.clear();
 
     // DumpMetaBuffers appear in multiple lists (per-thread free_queues,
     // recycled pool); dedup so each dev_ptr funnels through the shared
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 9022e033b..068fdb7ed 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -534,6 +534,9 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             if (is_dep_gen_enabled()) {
                 dep_gen_aicpu_flush();
             }
+            if (is_dump_tensor_enabled()) {
+                dump_tensor_flush(thread_idx);
+            }
 #if PTO2_PROFILING
             uint64_t orch_cycle_end = get_sys_cnt_aicpu();
             (void)orch_cycle_end;
@@ -711,10 +714,6 @@ void AicpuExecutor::deinit(Runtime *runtime) {
     finished_count_.store(0, std::memory_order_release);
     runtime_init_ready_.store(false, std::memory_order_release);
 
-    aicpu_thread_num_ = 0;
-    sched_thread_num_ = 0;
-    orch_to_sched_ = false;
-
     orch_args_cached_ = nullptr;
     // orch_so_table_ entries are intentionally preserved across deinit: the
     // next run reuses cached handles when register_new_callable_id() returns
@@ -726,8 +725,18 @@ void AicpuExecutor::deinit(Runtime *runtime) {
     // Clear dep_gen file-local bookkeeping. No-op when dep_gen is disabled.
     dep_gen_aicpu_finalize();
 
+    if (is_dump_tensor_enabled()) {
+        for (int t = sched_thread_num_; t < aicpu_thread_num_; t++) {
+            dump_tensor_flush(t);
+        }
+    }
+
     LOG_INFO_V0("DeInit: Runtime execution state reset");
 
+    aicpu_thread_num_ = 0;
+    sched_thread_num_ = 0;
+    orch_to_sched_ = false;
+
     initialized_.store(false, std::memory_order_release);
     init_done_.store(false, std::memory_order_release);
     init_failed_.store(false, std::memory_order_release);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
index fd356ccc6..5e8252c42 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -894,6 +894,7 @@ void SchedulerContext::deinit() {
     completed_tasks_.store(0, std::memory_order_release);
     total_tasks_ = 0;
     orchestrator_done_ = false;
+    kernel_args_dispatch_seq_.store(0, std::memory_order_release);
     pto2_init_done_.store(false, std::memory_order_release);
     pto2_init_complete_.store(false, std::memory_order_release);
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
index 90910f90d..57235abe5 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -149,6 +149,7 @@ class SchedulerContext {
     volatile bool orchestrator_done_{false};
     std::atomic<bool> completed_{false};
     uint64_t *func_id_to_addr_{nullptr};
+    std::atomic<uint64_t> kernel_args_dispatch_seq_{0};
 
     // --- Core-transition coordination ---
     std::atomic<bool> transition_requested_{false};
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index 439ff2e61..a9d56fbe0 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -153,6 +153,22 @@ void SchedulerContext::dispatch_subtask_to_core(
     AsyncCtx async_ctx = AsyncCtx::make(slot_state.task->task_id, deferred_slab);
     build_payload(payload, slot_state, subslot, async_ctx, block_idx);
 
+#if PTO2_PROFILING
+    if (is_dump_tensor_enabled()) {
+        uint64_t dispatch_id = kernel_args_dispatch_seq_.fetch_add(1, std::memory_order_relaxed);
+        dump_tensors_for_task<PTO2_SUBTASK_SLOT_COUNT>(
+            thread_idx, slot_state, TensorDumpStage::KERNEL_ARGS_DUMP,
+            [](ActiveMask active_mask, int raw_subtask_id) {
+                return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
+            },
+            [this](int32_t func_id) {
+                return get_function_bin_addr(func_id);
+            },
+            dispatch_id
+        );
+    }
+#endif
+
     if (to_pending) {
         core_exec_state.pending_subslot = subslot;
         core_exec_state.pending_slot_state = &slot_state;
@@ -675,9 +691,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
                 if (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP) {
                     while (deferred_release_count > 0) {
 #if PTO2_SCHED_PROFILING
-                        (void)sched_->on_task_release(
-                            *deferred_release_slot_states[--deferred_release_count], thread_idx
-                        );
+                        (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
 #else
                         sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
 #endif
diff --git a/src/a5/platform/include/aicpu/tensor_dump_aicpu.h b/src/a5/platform/include/aicpu/tensor_dump_aicpu.h
index 2afe1b410..23de26c1b 100644
--- a/src/a5/platform/include/aicpu/tensor_dump_aicpu.h
+++ b/src/a5/platform/include/aicpu/tensor_dump_aicpu.h
@@ -79,7 +79,7 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);
 template <int MaxSubtaskSlots, typename SlotStateT, typename IsSubtaskActiveFn, typename GetFunctionBinAddrFn>
 inline void dump_tensors_for_task(
     int32_t thread_idx, const SlotStateT &slot_state, TensorDumpStage stage, IsSubtaskActiveFn is_subtask_active,
-    GetFunctionBinAddrFn get_function_bin_addr
+    GetFunctionBinAddrFn get_function_bin_addr, uint64_t dispatch_id = 0
 ) {
     const auto &pl = *slot_state.payload;
     const CoreCallable *callables[MaxSubtaskSlots] = {};
@@ -112,7 +112,9 @@ inline void dump_tensors_for_task(
 
     rmb();
 
-    int32_t payload_index = 0;
+    int32_t arg_index = 0;
+    int32_t tensor_index = 0;
+    int32_t scalar_index = 0;
     for (int raw_subtask_id = 0; raw_subtask_id < MaxSubtaskSlots; raw_subtask_id++) {
         if (!is_subtask_active(slot_state.active_mask, raw_subtask_id)) {
             continue;
@@ -122,11 +124,30 @@ inline void dump_tensors_for_task(
         for (int32_t sig_idx = 0; sig_idx < callable.sig_count(); sig_idx++) {
             ArgDirection dir = callable.sig(sig_idx);
             if (dir == ArgDirection::SCALAR) {
+                if (stage == TensorDumpStage::KERNEL_ARGS_DUMP) {
+                    TensorDumpInfo info = {};
+                    info.task_id = slot_state.task->task_id.raw;
+                    info.subtask_id = raw_subtask_id;
+                    info.role = TensorDumpRole::INPUT;
+                    info.stage = stage;
+                    info.dtype = static_cast<uint8_t>(DataType::UINT64);
+                    info.ndims = 0;
+                    info.func_id = slot_state.task->kernel_id[slot_idx];
+                    info.arg_index = static_cast<uint32_t>(arg_index);
+                    info.buffer_addr = pl.scalars[scalar_index];
+                    info.dispatch_id = dispatch_id;
+                    info.pack_mode = static_cast<uint8_t>(TensorDumpPackMode::SCALAR_VALUE);
+                    dump_tensor_record(thread_idx, info);
+                }
+                arg_index++;
+                scalar_index++;
                 continue;
             }
             TensorDumpRole role;
-            if (get_tensor_dump_role_from_direction(dir, &role) && should_dump_tensor_at_stage(role, stage)) {
-                const auto &t = pl.tensors[payload_index];
+            bool dump_tensor = get_tensor_dump_role_from_direction(dir, &role) &&
+                               (stage == TensorDumpStage::KERNEL_ARGS_DUMP || should_dump_tensor_at_stage(role, stage));
+            if (dump_tensor) {
+                const auto &t = pl.tensors[tensor_index];
                 TensorDumpInfo info = {};
                 info.buffer_addr = t.buffer.addr;
                 info.dtype = static_cast<uint8_t>(t.dtype);
@@ -139,12 +160,15 @@ inline void dump_tensors_for_task(
                 info.task_id = slot_state.task->task_id.raw;
                 info.subtask_id = raw_subtask_id;
                 info.func_id = slot_state.task->kernel_id[slot_idx];
-                info.arg_index = static_cast<uint32_t>(payload_index);
-                info.role = role;
+                info.arg_index = static_cast<uint32_t>(arg_index);
+                info.role = (stage == TensorDumpStage::KERNEL_ARGS_DUMP) ? TensorDumpRole::INPUT : role;
                 info.stage = stage;
+                info.dispatch_id = dispatch_id;
+                info.pack_mode = static_cast<uint8_t>(TensorDumpPackMode::TENSOR_PTR);
                 dump_tensor_record(thread_idx, info);
             }
-            payload_index++;
+            arg_index++;
+            tensor_index++;
         }
     }
 }
@@ -221,7 +245,7 @@ inline void dump_tensors_for_task(
         info.dtype = static_cast<uint8_t>(t.dtype);
         info.ndims = t.ndims;
         info.func_id = static_cast<uint32_t>(func_id);
-        info.arg_index = static_cast<uint32_t>(tensor_arg_index);
+        info.arg_index = static_cast<uint32_t>(sig_idx);
         info.buffer_addr = buffer_addrs[tensor_arg_index];
         // TensorInfo (host_build_graph) still carries (raw_shapes, offsets)
         // implicitly describing a row-major-aligned sub-region. Translate to
@@ -241,6 +265,7 @@ inline void dump_tensors_for_task(
         tensor_arg_index++;
     }
 }
+
 #endif
 
 /**
diff --git a/src/a5/platform/include/common/tensor_dump.h b/src/a5/platform/include/common/tensor_dump.h
index d774ecfe5..78cabde8e 100644
--- a/src/a5/platform/include/common/tensor_dump.h
+++ b/src/a5/platform/include/common/tensor_dump.h
@@ -72,6 +72,12 @@ enum class TensorDumpRole : uint8_t {
 enum class TensorDumpStage : uint8_t {
     BEFORE_DISPATCH = 0,
     AFTER_COMPLETION = 1,
+    KERNEL_ARGS_DUMP = 2,  // Per-dispatch kernel-argument capture; host exports these to kernel_args_dump.json
+};
+
+enum class TensorDumpPackMode : uint8_t {
+    TENSOR_PTR = 0,    // Recorded value is the tensor's buffer address
+    SCALAR_VALUE = 1,  // Recorded value is the raw 64-bit scalar argument
 };
 
 // =============================================================================
@@ -92,14 +98,16 @@ struct alignas(64) TensorDumpRecord {
     uint8_t stage;            // TensorDumpStage (before/after execution)
     uint8_t ndims;            // Number of dimensions
     uint32_t func_id;         // Kernel function identifier
-    uint32_t arg_index;       // Position in PTO2TaskPayload::tensors[]
+    uint32_t arg_index;       // Position in the callable signature
     uint8_t dtype;            // DataType raw enum value
     uint8_t truncated;        // 1 if payload was truncated (tensor > arena capacity)
     uint8_t is_contiguous;    // 1 when source view is already PyTorch-contiguous
     uint8_t pad0_align;       // Explicit alignment before 64-bit payload offsets
     uint64_t payload_offset;  // Monotonic byte offset into thread arena
     uint64_t payload_size;    // Bytes actually copied (may be < full tensor bytes)
-    uint8_t pad0[24];         // Preserve 64B cache-line layout
+    uint64_t dispatch_id;     // Monotonic scheduler dispatch sequence
+    uint8_t pack_mode;        // TensorDumpPackMode
+    uint8_t pad0[15];         // Preserve 64B cache-line layout
 
     // === Cache line 2 (64B) — strided view descriptor ===
     // start_offset placed first for 8B alignment without padding gaps; total = 8 + 20 + 20 + 16 = 64B.
@@ -240,6 +248,9 @@ struct TensorDumpInfo {
     uint32_t func_id;
     uint32_t arg_index;
     uint64_t buffer_addr;
+    uint64_t dispatch_id;
+    uint8_t pack_mode;
+    uint8_t reserved[15];
     uint64_t start_offset;                     // 1D ELEMENT offset of the view origin
     uint32_t shapes[PLATFORM_DUMP_MAX_DIMS];   // Current view shape
     uint32_t strides[PLATFORM_DUMP_MAX_DIMS];  // Element stride per dimension (strictly > 0, type-enforced)
diff --git a/src/a5/platform/include/host/tensor_dump_collector.h b/src/a5/platform/include/host/tensor_dump_collector.h
index 76400bc79..270db4885 100644
--- a/src/a5/platform/include/host/tensor_dump_collector.h
+++ b/src/a5/platform/include/host/tensor_dump_collector.h
@@ -31,6 +31,7 @@
 #ifndef SRC_A5_PLATFORM_INCLUDE_HOST_TENSOR_DUMP_COLLECTOR_H_
 #define SRC_A5_PLATFORM_INCLUDE_HOST_TENSOR_DUMP_COLLECTOR_H_
 
+#include <array>
 #include <atomic>
 #include <chrono>
 #include <condition_variable>
@@ -165,6 +166,17 @@ struct DumpedTensor {
     std::vector<uint8_t> bytes;
 };
 
+struct KernelArgsDumpEntry {  // One kernel argument captured from a KERNEL_ARGS_DUMP record
+    uint64_t dispatch_id;  // Groups the args of one dispatch in kernel_args_dump.json
+    uint32_t func_id;      // Kernel function identifier copied from the record
+    uint32_t arg_index;    // Sort key for the args within a dispatch
+    uint8_t pack_mode;     // Raw TensorDumpPackMode value
+    uint8_t dtype;         // Raw DataType enum value
+    uint8_t ndims;         // Clamped to PLATFORM_DUMP_MAX_DIMS when the entry is built
+    std::array<uint32_t, PLATFORM_DUMP_MAX_DIMS> shapes;  // Only the first ndims slots are filled from the record
+    uint64_t value;        // 8 bytes read back from the dump arena; stays 0 if the payload was overwritten or short
+};
+
 class TensorDumpCollector : public profiling_common::ProfilerBase<TensorDumpCollector, DumpModule> {
 public:
     TensorDumpCollector() = default;
@@ -272,6 +284,7 @@ class TensorDumpCollector : public profiling_common::ProfilerBase<TensorDumpColl
 
     // Collected dump tensors (metadata only; payloads live in tensors.bin)
     std::vector<DumpedTensor> collected_;
+    std::vector<KernelArgsDumpEntry> kernel_args_entries_;
     std::mutex collected_mutex_;
 
     // Stats
@@ -289,6 +302,7 @@ class TensorDumpCollector : public profiling_common::ProfilerBase<TensorDumpColl
     void *alloc_single_buffer(size_t size, void **host_ptr_out);
     void process_dump_buffer(const DumpReadyBufferInfo &info);
     void start_writer_thread_once();
+    int export_kernel_args_dump_file();
 
     // Writer thread: streams tensor payloads to a single tensors.bin
     std::thread writer_thread_;
diff --git a/src/a5/platform/onboard/aicpu/kernel.cpp b/src/a5/platform/onboard/aicpu/kernel.cpp
index c4011b2b1..ca0b7f096 100644
--- a/src/a5/platform/onboard/aicpu/kernel.cpp
+++ b/src/a5/platform/onboard/aicpu/kernel.cpp
@@ -109,7 +109,8 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer
     // hand it to the existing platform-state setters.
     set_platform_regs(k_args->regs);
     set_platform_dump_base(k_args->dump_data_base);
-    set_dump_tensor_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR));
+    bool dump_enabled = GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR);
+    set_dump_tensor_enabled(dump_enabled);
     set_platform_l2_perf_base(k_args->l2_perf_data_base);
     set_l2_swimlane_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE));
     set_platform_pmu_base(k_args->pmu_data_base);
diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp
index c0d26fbe1..7cc468693 100644
--- a/src/a5/platform/sim/host/device_runner.cpp
+++ b/src/a5/platform/sim/host/device_runner.cpp
@@ -226,7 +226,6 @@ int DeviceRunner::ensure_binaries_loaded() {
             LOG_ERROR("dlsym failed for set_dump_tensor_enabled: %s", dlerror());
             return -1;
         }
-
         set_platform_l2_perf_base_func_ =
             reinterpret_cast<void (*)(uint64_t)>(dlsym(aicpu_so_handle_, "set_platform_l2_perf_base"));
         if (set_platform_l2_perf_base_func_ == nullptr) {
@@ -507,7 +506,8 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     // Check if executors are loaded
     if (aicpu_execute_func_ == nullptr || aicore_execute_func_ == nullptr || set_platform_regs_func_ == nullptr ||
         set_platform_dump_base_func_ == nullptr || set_dump_tensor_enabled_func_ == nullptr ||
-        set_platform_pmu_base_func_ == nullptr || set_pmu_enabled_func_ == nullptr) {
+set_platform_pmu_base_func_ == nullptr ||
+        set_pmu_enabled_func_ == nullptr) {
         LOG_ERROR("Executor functions not loaded. Call ensure_binaries_loaded first.");
         return -1;
     }
diff --git a/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp b/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp
index 4aff271f4..701d8263f 100644
--- a/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp
+++ b/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp
@@ -406,8 +406,11 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info) {
     uint64_t copy_bytes = bytes;
     bool truncated = false;
     bool is_contiguous = tensor_dump_is_contiguous(info);
+    bool is_kernel_args = (info.stage == TensorDumpStage::KERNEL_ARGS_DUMP);
 
-    if (bytes > state->arena_size) {
+    if (is_kernel_args) {
+        copy_bytes = sizeof(uint64_t);
+    } else if (bytes > state->arena_size) {
         // Tensor larger than entire arena — copy a partial sample
         copy_bytes = state->arena_size / 2;
         truncated = true;
@@ -420,7 +423,12 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info) {
     char *arena = reinterpret_cast<char *>(state->arena_base);
     uint64_t arena_sz = state->arena_size;
     CircularArenaWriter writer = {arena, arena_sz, offset, 0};
-    write_tensor_dump_logical_prefix(&writer, info, elem_sz, copy_bytes);
+    if (is_kernel_args) {
+        uint64_t raw_value = info.buffer_addr;
+        writer.write(reinterpret_cast<const char *>(&raw_value), sizeof(uint64_t));
+    } else if (copy_bytes > 0) {
+        write_tensor_dump_logical_prefix(&writer, info, elem_sz, copy_bytes);
+    }
     wmb();
 
     // Append metadata record
@@ -438,6 +446,8 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info) {
     rec->truncated = truncated ? 1 : 0;
     rec->payload_offset = offset;
     rec->payload_size = copy_bytes;
+    rec->dispatch_id = info.dispatch_id;
+    rec->pack_mode = info.pack_mode;
     rec->start_offset = info.start_offset;
     for (int d = 0; d < info.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) {
         rec->shapes[d] = info.shapes[d];
diff --git a/src/a5/platform/src/host/tensor_dump_collector.cpp b/src/a5/platform/src/host/tensor_dump_collector.cpp
index 5cf130436..fad42c3f7 100644
--- a/src/a5/platform/src/host/tensor_dump_collector.cpp
+++ b/src/a5/platform/src/host/tensor_dump_collector.cpp
@@ -34,6 +34,7 @@
 #include <filesystem>
 #include <fstream>
 #include <iomanip>
+#include <map>
 #include <sstream>
 
 #include "common/memory_barrier.h"
@@ -237,6 +238,47 @@ void TensorDumpCollector::process_dump_buffer(const DumpReadyBufferInfo &info) {
     for (uint32_t i = 0; i < count; i++) {
         const TensorDumpRecord &rec = buf->records[i];
 
+        if (rec.stage == static_cast<uint8_t>(TensorDumpStage::KERNEL_ARGS_DUMP)) {
+            KernelArgsDumpEntry entry = {};
+            entry.dispatch_id = rec.dispatch_id;
+            entry.func_id = rec.func_id;
+            entry.arg_index = rec.arg_index;
+            entry.pack_mode = rec.pack_mode;
+            entry.dtype = rec.dtype;
+            entry.ndims = std::min<uint8_t>(rec.ndims, static_cast<uint8_t>(PLATFORM_DUMP_MAX_DIMS));
+            for (int d = 0; d < entry.ndims; d++) {
+                entry.shapes[d] = rec.shapes[d];
+            }
+
+            if (thread_idx >= 0 && thread_idx < static_cast<int>(arenas_.size())) {
+                ArenaInfo &ai = arenas_[thread_idx];
+                char *arena_host = reinterpret_cast<char *>(ai.host_ptr);
+                uint64_t arena_sz = ai.size;
+                uint64_t high_water = ai.high_water;
+                bool overwritten = (high_water > arena_sz && rec.payload_offset < high_water - arena_sz);
+                if (!overwritten && rec.payload_size >= sizeof(uint64_t)) {
+                    uint64_t pos = rec.payload_offset % arena_sz;
+                    if (pos + sizeof(uint64_t) <= arena_sz) {
+                        std::memcpy(&entry.value, arena_host + pos, sizeof(uint64_t));
+                    } else {
+                        uint64_t first = arena_sz - pos;
+                        std::memcpy(&entry.value, arena_host + pos, first);
+                        std::memcpy(reinterpret_cast<char *>(&entry.value) + first, arena_host, sizeof(uint64_t) - first);
+                    }
+                }
+                uint64_t end_offset = rec.payload_offset + rec.payload_size;
+                if (end_offset > ai.high_water) {
+                    ai.high_water = end_offset;
+                }
+            }
+
+            {
+                std::scoped_lock<std::mutex> lock(collected_mutex_);
+                kernel_args_entries_.push_back(std::move(entry));
+            }
+            continue;
+        }
+
         DumpedTensor dt;
         dt.task_id = rec.task_id;
         dt.subtask_id = rec.subtask_id;
@@ -265,10 +307,8 @@ void TensorDumpCollector::process_dump_buffer(const DumpReadyBufferInfo &info) {
             if (high_water > arena_sz && rec.payload_offset < high_water - arena_sz) {
                 dt.overwritten = true;
                 if (++total_overwrite_count_ == 1) {
-                    LOG_WARN(
-                        "Tensor dump overwrite detected: host drain was slower than arena reuse. "
-                        "Increase PLATFORM_DUMP_BUFFERS_PER_THREAD."
-                    );
+                    LOG_WARN("Tensor dump overwrite detected: host drain was slower than arena reuse. "
+                             "Increase PLATFORM_DUMP_BUFFERS_PER_THREAD.");
                 }
             } else {
                 dt.overwritten = false;
@@ -413,6 +453,28 @@ static const char *tensor_dump_stage_name(TensorDumpStage stage) {
         return "before_dispatch";
     case TensorDumpStage::AFTER_COMPLETION:
         return "after_completion";
+    case TensorDumpStage::KERNEL_ARGS_DUMP:
+        return "kernel_args_dump";
+    }
+    return "unknown";
+}
+
+// Label for the "pack_mode" JSON field; unrecognized raw values map to "unknown".
+static const char *tensor_dump_pack_mode_name(uint8_t pack_mode) {
+    const char *label = "unknown";
+    switch (static_cast<TensorDumpPackMode>(pack_mode)) {
+    case TensorDumpPackMode::TENSOR_PTR: label = "tensor_ptr"; break;
+    case TensorDumpPackMode::SCALAR_VALUE: label = "bits"; break;
+    }
+    return label;
+}
+
+static const char *kernel_arg_kind_name(uint8_t pack_mode) {  // Label for the "kind" JSON field
+    switch (static_cast<TensorDumpPackMode>(pack_mode)) {
+    case TensorDumpPackMode::TENSOR_PTR:
+        return "tensor";
+    case TensorDumpPackMode::SCALAR_VALUE:
+        return "scalar";
     }
     return "unknown";
 }
@@ -438,6 +500,74 @@ static uint64_t get_num_elements(const DumpedTensor &dt) {
     return (dt.ndims == 0) ? 1 : numel;
 }
 
+// Writes kernel_args_dump.json into run_dir_: one object per dispatch_id in
+// ascending order, each listing its recorded args sorted by arg_index.
+// Returns 0 on success (or when nothing was recorded), -1 when the file
+// cannot be opened or the stream reports a failure after the final flush.
+int TensorDumpCollector::export_kernel_args_dump_file() {
+    if (kernel_args_entries_.empty()) {
+        return 0;
+    }
+
+    // Every group is created by a push_back, so no group is ever empty.
+    std::map<uint64_t, std::vector<const KernelArgsDumpEntry *>> grouped;
+    for (const auto &entry : kernel_args_entries_) {
+        grouped[entry.dispatch_id].push_back(&entry);
+    }
+
+    std::ofstream json(run_dir_ / "kernel_args_dump.json");
+    if (!json.is_open()) {
+        LOG_WARN("Failed to open kernel_args_dump.json for writing");
+        return -1;
+    }
+    json << "{\n  \"schema_version\": 1,\n";
+    json << "  \"total_dispatches\": " << grouped.size() << ",\n";
+    json << "  \"total_args\": " << kernel_args_entries_.size() << ",\n";
+    json << "  \"dispatches\": [\n";
+
+    bool first_dispatch = true;
+    for (auto &[dispatch_id, entries] : grouped) {
+        std::sort(entries.begin(), entries.end(), [](const KernelArgsDumpEntry *a, const KernelArgsDumpEntry *b) {
+            return a->arg_index < b->arg_index;
+        });
+        if (!first_dispatch) {
+            json << ",\n";
+        }
+        first_dispatch = false;
+        // func_id is taken from the lowest-indexed arg of the dispatch.
+        json << "    {\"dispatch_id\": " << dispatch_id;
+        json << ", \"func_id\": " << entries.front()->func_id;
+        json << ", \"args\": [";
+        bool first_arg = true;
+        for (const KernelArgsDumpEntry *entry : entries) {
+            if (!first_arg) {
+                json << ", ";
+            }
+            first_arg = false;
+            json << "{\"arg_index\": " << entry->arg_index;
+            json << ", \"kind\": \"" << kernel_arg_kind_name(entry->pack_mode) << "\"";
+            json << ", \"pack_mode\": \"" << tensor_dump_pack_mode_name(entry->pack_mode) << "\"";
+            const uint32_t ndims = std::min<uint32_t>(entry->ndims, PLATFORM_DUMP_MAX_DIMS);
+            json << ", \"dtype\": \"" << get_dtype_name_from_raw(entry->dtype) << "\"";
+            json << ", \"ndims\": " << ndims;
+            if (ndims > 0) {
+                json << ", \"shape\": " << dims_to_string(entry->shapes.data(), static_cast<int>(ndims));
+            }
+            if (static_cast<TensorDumpPackMode>(entry->pack_mode) == TensorDumpPackMode::SCALAR_VALUE) {
+                json << ", \"value\": " << entry->value;
+            } else {
+                std::ostringstream value_ss;
+                value_ss << "0x" << std::hex << entry->value;
+                json << ", \"ptr_value\": \"" << value_ss.str() << "\"";
+            }
+            json << "}";
+        }
+        json << "]}";
+    }
+    json << "\n  ]\n}\n" << std::flush;
+    return json.good() ? 0 : -1;
+}
+
 void TensorDumpCollector::writer_loop() {
     while (true) {
         DumpedTensor dt;
@@ -496,7 +626,7 @@ int TensorDumpCollector::export_dump_files() {
         );
     }
 
-    if (collected_.empty()) {
+    if (collected_.empty() && kernel_args_entries_.empty()) {
         LOG_WARN("No tensor dump data to export");
         writer_started_ = false;
         return 0;
@@ -597,8 +727,11 @@ int TensorDumpCollector::export_dump_files() {
         );
     }
 
+    export_kernel_args_dump_file();
+
     // Clear state so subsequent runs don't accumulate data from previous runs
     collected_.clear();
+    kernel_args_entries_.clear();
     total_dropped_record_count_ = 0;
     total_truncated_count_ = 0;
     total_overwrite_count_ = 0;
@@ -614,6 +747,7 @@ int TensorDumpCollector::finalize(DumpUnregisterCallback unregister_cb, const Du
 
     // Stop mgmt + collector threads if the caller didn't already (idempotent).
     stop();
+    kernel_args_entries_.clear();
 
     auto release_dev = [&](void *p) {
         release_one_buffer(p, unregister_cb, free_cb);
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index bcea9b09e..d30fef0cd 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -519,6 +519,9 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             rt_scope_begin(rt);
             (*p_func)(*orch_args_cached_);
             rt_scope_end(rt);
+            if (is_dump_tensor_enabled()) {
+                dump_tensor_flush(thread_idx);
+            }
 #if PTO2_PROFILING
             uint64_t orch_cycle_end = get_sys_cnt_aicpu();
             (void)orch_cycle_end;
@@ -696,10 +699,6 @@ void AicpuExecutor::deinit(Runtime *runtime) {
     finished_count_.store(0, std::memory_order_release);
     runtime_init_ready_.store(false, std::memory_order_release);
 
-    aicpu_thread_num_ = 0;
-    sched_thread_num_ = 0;
-    orch_to_sched_ = false;
-
     orch_args_cached_ = nullptr;
     // orch_so_table_ entries are intentionally preserved across deinit: the
     // next run reuses cached handles when register_new_callable_id() returns
@@ -708,8 +707,18 @@ void AicpuExecutor::deinit(Runtime *runtime) {
     // Clear file-scope PTO2Runtime pointer (freed by orchestrator thread before deinit)
     rt = nullptr;
 
+    if (is_dump_tensor_enabled()) {
+        for (int t = 0; t < aicpu_thread_num_; t++) {
+            dump_tensor_flush(t);
+        }
+    }
+
     LOG_INFO_V0("DeInit: Runtime execution state reset");
 
+    aicpu_thread_num_ = 0;
+    sched_thread_num_ = 0;
+    orch_to_sched_ = false;
+
     initialized_.store(false, std::memory_order_release);
     init_done_.store(false, std::memory_order_release);
     init_failed_.store(false, std::memory_order_release);
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
index aab3f22d4..d9d433b18 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -897,6 +897,7 @@ void SchedulerContext::deinit() {
     completed_tasks_.store(0, std::memory_order_release);
     total_tasks_ = 0;
     orchestrator_done_ = false;
+    kernel_args_dispatch_seq_.store(0, std::memory_order_release);
     init_done_.store(false, std::memory_order_release);
     init_complete_.store(false, std::memory_order_release);
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
index 83a71353f..fb121dd93 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -148,6 +148,7 @@ class SchedulerContext {
     volatile bool orchestrator_done_{false};
     std::atomic<bool> completed_{false};
     uint64_t *func_id_to_addr_{nullptr};
+    std::atomic<uint64_t> kernel_args_dispatch_seq_{0};
 
     // --- Core-transition coordination ---
     std::atomic<bool> transition_requested_{false};
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index 6bd212f84..d9c718a0e 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -149,6 +149,22 @@ void SchedulerContext::dispatch_subtask_to_core(
     AsyncCtx async_ctx = AsyncCtx::make(slot_state.task->task_id, deferred_slab);
     build_payload(payload, slot_state, subslot, async_ctx, block_idx);
 
+#if PTO2_PROFILING  // kernel-args dump is compiled in only when PTO2_PROFILING is non-zero
+    if (is_dump_tensor_enabled()) {  // nothing below runs unless tensor dump is enabled
+        uint64_t dispatch_id = kernel_args_dispatch_seq_.fetch_add(1, std::memory_order_relaxed);
+        dump_tensors_for_task<PTO2_SUBTASK_SLOT_COUNT>(
+            thread_idx, slot_state, TensorDumpStage::KERNEL_ARGS_DUMP,
+            [](ActiveMask active_mask, int raw_subtask_id) {  // is this subtask slot set in the mask?
+                return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
+            },
+            [this](int32_t func_id) {  // forwards func_id to get_function_bin_addr
+                return get_function_bin_addr(func_id);
+            },
+            dispatch_id  // value drawn from kernel_args_dispatch_seq_ above
+        );
+    }
+#endif  // PTO2_PROFILING
+
     if (to_pending) {
         core_exec_state.pending_subslot = subslot;
         core_exec_state.pending_slot_state = &slot_state;
@@ -670,9 +686,8 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
                 if (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP) {
                     while (deferred_release_count > 0) {
 #if PTO2_SCHED_PROFILING
-                        (void)sched_->on_task_release(
-                            *deferred_release_slot_states[--deferred_release_count], thread_idx
-                        );
+                        (void
+                        )sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
 #else
                         sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
 #endif
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/test_tensor_dump.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/test_tensor_dump.py
index ce5604a08..ef09df55b 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/test_tensor_dump.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/test_tensor_dump.py
@@ -110,6 +110,15 @@ def _validate_dump_artifact(self, case):
         # vector_example reads/writes ≥1 tensor and the manifest can't be empty
         # if anything was captured. Robust to schema add/remove of new fields.
         assert bin_path.stat().st_size > 0, "tensor_dump.bin is empty"
+        assert not (dump_dir / "args_dump.json").exists(), "args_dump.json should not be emitted"
+        kernel_args = dump_dir / "kernel_args_dump.json"
+        assert kernel_args.exists(), f"kernel_args_dump.json missing under {dump_dir}"
+        with kernel_args.open() as f:
+            kernel_args_data = json.load(f)
+        assert kernel_args_data.get("dispatches"), "kernel_args_dump.json has no dispatches"
+        first_dispatch = kernel_args_data["dispatches"][0]
+        assert "dispatch_id" in first_dispatch, f"missing dispatch_id in {first_dispatch}"
+        assert first_dispatch.get("args"), f"missing args in {first_dispatch}"
 
         # ---- Tool smoke: dump_viewer ----
         # Exit-code-only check; the no-filter default lists every captured