From 536fa1b7b8412c7a023fe24cb13fcc11755e7f92 Mon Sep 17 00:00:00 2001
From: zm <zhaomin88@huawei.com>
Date: Tue, 26 May 2026 19:50:50 +0800
Subject: [PATCH] feat: support task-selective tensor dump

  - Add Arg::dump(...) for selecting tensor arguments to dump within a task
  - Keep selective dump masks in a platform-owned table instead of expanding PTO2TaskPayload
  - Filter unmarked tasks and unselected tensor arguments at AICPU dump collection time
  - Preserve legacy full tensor dump behavior when selective mode is not enabled
  - Add partial tensor dump regression coverage and update tensor dump documentation
---
 docs/dfx/tensor-dump.md                       |  39 ++++++-
 .../include/aicpu/tensor_dump_aicpu.h         |  27 ++++-
 .../platform/include/common/tensor_dump.h     |  12 ++
 .../platform/src/aicpu/tensor_dump_aicpu.cpp  | 106 +++++++++++++++++-
 .../runtime/host_build_graph/build_config.py  |   4 +-
 .../runtime/pto_runtime2_types.h              |   9 ++
 .../tensormap_and_ringbuffer/build_config.py  |   8 +-
 .../orchestration/pto_orchestration_api.h     |   2 +
 .../runtime/pto_orchestrator.cpp              |  11 ++
 .../runtime/pto_types.h                       |  53 +++++++++
 .../include/aicpu/tensor_dump_aicpu.h         |  27 ++++-
 src/a5/platform/include/common/tensor_dump.h  |  12 ++
 .../platform/src/aicpu/tensor_dump_aicpu.cpp  | 106 +++++++++++++++++-
 .../runtime/host_build_graph/build_config.py  |   4 +-
 .../runtime/pto_runtime2_types.h              |  32 ++++++
 .../tensormap_and_ringbuffer/build_config.py  |   8 +-
 .../orchestration/pto_orchestration_api.h     |   2 +
 .../runtime/pto_orchestrator.cpp              |  11 ++
 .../runtime/pto_types.h                       |  53 +++++++++
 .../orchestration/partial_dump_orch.cpp       |  78 +++++++++++++
 .../dfx/tensor_dump/test_tensor_dump.py       |  41 +++++++
 21 files changed, 626 insertions(+), 19 deletions(-)
 create mode 100644 src/a5/runtime/host_build_graph/runtime/pto_runtime2_types.h
 create mode 100644 tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/kernels/orchestration/partial_dump_orch.cpp

diff --git a/docs/dfx/tensor-dump.md b/docs/dfx/tensor-dump.md
index 3bcb14320..60f3bd8ba 100644
--- a/docs/dfx/tensor-dump.md
+++ b/docs/dfx/tensor-dump.md
@@ -60,7 +60,44 @@ executors read the same handshake bit to insert a
 `pipe_barrier(PIPE_ALL)` before FIN when dump is on, so
 `AFTER_COMPLETION` snapshots see the kernel's final writes.
 
-### 3.2 Output
+### 3.2 Select Specific Task Tensors
+
+By default, `--dump-tensor` dumps every task's tensor inputs and
+outputs. Device-side orchestration can opt into tensor-argument selection
+by enabling selective mode at the beginning of the orchestration and
+marking the tensor arguments on each `Arg` before submission:
+
+```cpp
+enable_dump_tensor_selective();
+
+Arg args;
+args.add_input(x);
+args.add_input(y);
+args.add_output(z);
+args.dump(x, y, z);
+rt_submit_aiv_task(FUNC_ADD, args);
+```
+
+`dump(...)` selects tensor arguments from the current `Arg`; it does not
+execute a dump immediately. Call it after the matching `add_*()` calls:
+a tensor that is not yet part of that `Arg` is reported as an error. The
+runtime uses the direction given by `add_input()`, `add_output()`, or
+`add_inout()` to decide when each selected tensor is captured:
+
+- input tensors are dumped before dispatch.
+- output tensors are dumped after completion.
+- inout tensors follow the existing inout dump behavior.
+
+With `enable_dump_tensor_selective()`, tasks without any `dump(...)` marker
+are skipped during AICPU collection. For marked tasks, only the selected
+tensor arguments are dumped. Without `enable_dump_tensor_selective()`, the
+legacy full-dump behavior is unchanged even if an `Arg` carries a
+`dump(...)` marker.
+
+`--dump-tensor` remains the top-level enable switch; `Arg::dump(...)`
+only narrows what gets recorded after tensor dump is enabled.
+
+### 3.3 Output
 
 The dump artifacts land under the per-task output prefix
 (`CallConfig::output_prefix`, set by
diff --git a/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h b/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h
index 8c83d71e1..d6babd8f6 100644
--- a/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h
+++ b/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h
@@ -20,7 +20,15 @@
 #ifndef PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_
 #define PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_
 
+#include <stdint.h>
+
+#ifndef __cplusplus
+#include <stdbool.h>
+#endif
+
+#ifdef __cplusplus
 #include <cinttypes>
+#endif
 
 #include "common/memory_barrier.h"
 #include "common/tensor_dump.h"
@@ -64,6 +72,10 @@ void set_dump_tensor_enabled(bool enable);
  * @return true if tensor dump is enabled
  */
 bool is_dump_tensor_enabled();
+void set_dump_tensor_selective_mode(bool enable);
+bool is_dump_tensor_selective_mode();
+void set_dump_tensor_task_mask(uint64_t task_id, TensorDumpArgMask mask);
+TensorDumpArgMask get_dump_tensor_task_mask(uint64_t task_id);
 
 #ifdef __cplusplus
 }
@@ -73,6 +85,8 @@ bool is_dump_tensor_enabled();
 bool get_tensor_dump_role_from_direction(ArgDirection dir, TensorDumpRole *role);
 int32_t count_callable_tensor_args(const CoreCallable &callable);
 bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage);
+bool should_dump_task(TensorDumpArgMask arg_mask);
+bool should_dump_tensor_arg(TensorDumpArgMask arg_mask, int32_t arg_index);
 bool try_log_tensor_dump_layout_mismatch();
 int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);
 
@@ -82,6 +96,13 @@ inline void dump_tensors_for_task(
     GetFunctionBinAddrFn get_function_bin_addr
 ) {
     const auto &pl = *slot_state.payload;
+    TensorDumpArgMask dump_arg_mask = TENSOR_DUMP_ARG_MASK_NONE;
+    if (is_dump_tensor_selective_mode()) {
+        dump_arg_mask = get_dump_tensor_task_mask(slot_state.task->task_id.raw);
+    }
+    if (!should_dump_task(dump_arg_mask)) {
+        return;
+    }
     const CoreCallable *callables[MaxSubtaskSlots] = {};
     int32_t total_tensor_args = 0;
 
@@ -125,7 +146,8 @@ inline void dump_tensors_for_task(
                 continue;
             }
             TensorDumpRole role;
-            if (get_tensor_dump_role_from_direction(dir, &role) && should_dump_tensor_at_stage(role, stage)) {
+            if (get_tensor_dump_role_from_direction(dir, &role) && should_dump_tensor_at_stage(role, stage) &&
+                should_dump_tensor_arg(dump_arg_mask, payload_index)) {
                 const auto &t = pl.tensors[payload_index];
                 TensorDumpInfo info = {};
                 info.buffer_addr = t.buffer.addr;
@@ -241,7 +263,6 @@ inline void dump_tensors_for_task(
         tensor_arg_index++;
     }
 }
-#endif
 
 /**
  * Initialize tensor dump.
@@ -279,4 +300,6 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);
  */
 void dump_tensor_flush(int thread_idx);
 
+#endif
+
 #endif  // PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_
diff --git a/src/a2a3/platform/include/common/tensor_dump.h b/src/a2a3/platform/include/common/tensor_dump.h
index dbea518db..48afba1cf 100644
--- a/src/a2a3/platform/include/common/tensor_dump.h
+++ b/src/a2a3/platform/include/common/tensor_dump.h
@@ -44,6 +44,7 @@
 #include <cstdint>
 
 #include "common/platform_config.h"
+#include "host_build_graph/runtime/pto_runtime2_types.h"
 
 // =============================================================================
 // Constants
@@ -70,6 +71,17 @@ enum class TensorDumpStage : uint8_t {
     AFTER_COMPLETION = 1,
 };
 
+using TensorDumpArgMask = uint64_t;
+
+// Per-task selection bitmask recorded by orchestration; bit N selects tensors[N].
+// A zero mask means "nothing selected": in selective mode such a task is skipped,
+// while in legacy mode the mask is not consulted and every tensor is dumped.
+constexpr TensorDumpArgMask TENSOR_DUMP_ARG_MASK_NONE = 0;
+constexpr uint32_t TENSOR_DUMP_ARG_MASK_BITS = 64;  // bit width of TensorDumpArgMask
+constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_RINGS = PTO2_MAX_RING_DEPTH;
+constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_SLOTS = PTO2_TASK_WINDOW_SIZE;
+constexpr uint32_t TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK = TENSOR_DUMP_MASK_POOL_MAX_SLOTS - 1;
+
 // =============================================================================
 // TensorDumpRecord - Single Tensor Dump Entry (128B = 2 cache lines)
 // =============================================================================
diff --git a/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp b/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp
index 7e3e3c9b2..1373868e0 100644
--- a/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp
+++ b/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp
@@ -20,6 +20,7 @@
 
 #include "aicpu/tensor_dump_aicpu.h"
 
+#include <cstdlib>
 #include <cstring>
 
 #include "common/memory_barrier.h"
@@ -53,11 +54,99 @@ extern "C" void set_platform_dump_base(uint64_t dump_data_base) { g_platform_dum
 extern "C" uint64_t get_platform_dump_base() { return g_platform_dump_base; }
 
 static bool g_enable_dump_tensor = false;
+static bool g_dump_tensor_selective_mode = false;
+struct DumpTaskMaskEntry {
+    uint64_t task_id;
+    TensorDumpArgMask mask;
+};
+static constexpr uint64_t DUMP_TASK_MASK_EMPTY_TASK_ID = UINT64_MAX;
+static constexpr uint32_t DUMP_TASK_MASK_TABLE_CAPACITY = 32768;
+static DumpTaskMaskEntry *g_dump_mask_table = nullptr;
+// Lazily allocate the mask table with every entry empty; false (logged) only on allocation failure.
+static bool ensure_dump_mask_table() {
+    if (g_dump_mask_table != nullptr) return true;
+    constexpr size_t kTableBytes = sizeof(DumpTaskMaskEntry) * DUMP_TASK_MASK_TABLE_CAPACITY;
+    auto *table = static_cast<DumpTaskMaskEntry *>(malloc(kTableBytes));
+    if (table == nullptr) {
+        LOG_ERROR("Failed to allocate tensor dump selective mask table");
+        return false;
+    }
+    for (DumpTaskMaskEntry *it = table; it != table + DUMP_TASK_MASK_TABLE_CAPACITY; ++it) {
+        it->task_id = DUMP_TASK_MASK_EMPTY_TASK_ID;
+        it->mask = TENSOR_DUMP_ARG_MASK_NONE;
+    }
+    g_dump_mask_table = table;
+    return true;
+}
 
-extern "C" void set_dump_tensor_enabled(bool enable) { g_enable_dump_tensor = enable; }
+// Mark every entry of an already-allocated mask table empty; no-op before the first allocation.
+static void clear_dump_mask_table() {
+    if (g_dump_mask_table == nullptr) return;
+    DumpTaskMaskEntry *const end = g_dump_mask_table + DUMP_TASK_MASK_TABLE_CAPACITY;
+    for (DumpTaskMaskEntry *entry = g_dump_mask_table; entry != end; ++entry) {
+        entry->task_id = DUMP_TASK_MASK_EMPTY_TASK_ID;
+        entry->mask = TENSOR_DUMP_ARG_MASK_NONE;
+    }
+}
+
+extern "C" void set_dump_tensor_enabled(bool enable) {
+    g_enable_dump_tensor = enable;
+    g_dump_tensor_selective_mode = false;  // every enable/disable starts over in legacy (dump-all) mode
+    clear_dump_mask_table();               // drop per-task masks recorded before this call
+}
 
 extern "C" bool is_dump_tensor_enabled() { return g_enable_dump_tensor; }
 
+extern "C" void set_dump_tensor_selective_mode(bool enable) { g_dump_tensor_selective_mode = enable; }
+
+extern "C" bool is_dump_tensor_selective_mode() { return g_dump_tensor_selective_mode; }
+
+// Record `mask` for `task_id` by linear probing from the task's ring/slot home index. Empty masks are
+// not stored, and nothing here evicts single entries: they stay until clear_dump_mask_table() runs.
+extern "C" void set_dump_tensor_task_mask(uint64_t task_id, TensorDumpArgMask mask) {
+    if (mask == TENSOR_DUMP_ARG_MASK_NONE) return;
+    const uint32_t ring_id = static_cast<uint32_t>(task_id >> 32);
+    if (ring_id >= TENSOR_DUMP_MASK_POOL_MAX_RINGS) {
+        // Report the drop: in selective mode this task will be skipped entirely.
+        LOG_ERROR("tensor dump selective mask dropped: task ring id out of range");
+        return;
+    }
+    if (!ensure_dump_mask_table()) return;
+    const uint32_t slot = static_cast<uint32_t>(task_id) & TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK;
+    const uint32_t home = (ring_id * TENSOR_DUMP_MASK_POOL_MAX_SLOTS + slot) & (DUMP_TASK_MASK_TABLE_CAPACITY - 1);
+    for (uint32_t probe = 0; probe < DUMP_TASK_MASK_TABLE_CAPACITY; probe++) {
+        DumpTaskMaskEntry &entry = g_dump_mask_table[(home + probe) & (DUMP_TASK_MASK_TABLE_CAPACITY - 1)];
+        if (entry.task_id == DUMP_TASK_MASK_EMPTY_TASK_ID || entry.task_id == task_id) {
+            entry.task_id = task_id;
+            entry.mask = mask;
+            return;
+        }
+    }
+    LOG_ERROR("tensor dump selective mask table is full");
+}
+
+// Look up the mask recorded for `task_id`; TENSOR_DUMP_ARG_MASK_NONE when the task has no entry.
+extern "C" TensorDumpArgMask get_dump_tensor_task_mask(uint64_t task_id) {
+    const uint32_t ring_id = static_cast<uint32_t>(task_id >> 32);
+    if (g_dump_mask_table == nullptr || ring_id >= TENSOR_DUMP_MASK_POOL_MAX_RINGS) {
+        return TENSOR_DUMP_ARG_MASK_NONE;
+    }
+    constexpr uint32_t kWrap = DUMP_TASK_MASK_TABLE_CAPACITY - 1;
+    const uint32_t slot = static_cast<uint32_t>(task_id) & TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK;
+    uint32_t cursor = (ring_id * TENSOR_DUMP_MASK_POOL_MAX_SLOTS + slot) & kWrap;
+    TensorDumpArgMask found = TENSOR_DUMP_ARG_MASK_NONE;
+    for (uint32_t visited = 0; visited < DUMP_TASK_MASK_TABLE_CAPACITY; ++visited, cursor = (cursor + 1) & kWrap) {
+        const DumpTaskMaskEntry &entry = g_dump_mask_table[cursor];
+        if (entry.task_id == task_id) {
+            found = entry.mask;
+            break;
+        }
+        // An empty slot ends the probe chain: the task was never recorded.
+        if (entry.task_id == DUMP_TASK_MASK_EMPTY_TASK_ID) break;
+    }
+    return found;
+}
+
 bool get_tensor_dump_role_from_direction(ArgDirection dir, TensorDumpRole *role) {
     switch (dir) {
     case ArgDirection::IN:
@@ -97,6 +186,21 @@ bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage) {
     return false;
 }
 
+// Task-level dump filter: legacy mode accepts every task, selective mode only
+// tasks whose recorded argument mask is non-empty.
+bool should_dump_task(TensorDumpArgMask arg_mask) {
+    const bool has_selection = arg_mask != TENSOR_DUMP_ARG_MASK_NONE;
+    return has_selection || !is_dump_tensor_selective_mode();
+}
+
+// Legacy mode accepts every argument; selective mode requires bit `arg_index` of `arg_mask`.
+bool should_dump_tensor_arg(TensorDumpArgMask arg_mask, int32_t arg_index) {
+    if (!is_dump_tensor_selective_mode()) return true;
+    // Indices outside the mask width can never be selected (and must not reach the shift).
+    const bool representable = arg_index >= 0 && arg_index < static_cast<int32_t>(TENSOR_DUMP_ARG_MASK_BITS);
+    return representable && ((arg_mask >> arg_index) & TensorDumpArgMask{1}) != 0;
+}
+
 bool try_log_tensor_dump_layout_mismatch() {
     if (s_logged_dump_layout_mismatch) {
         return false;
diff --git a/src/a2a3/runtime/host_build_graph/build_config.py b/src/a2a3/runtime/host_build_graph/build_config.py
index 76e7face5..a1c96c8fa 100644
--- a/src/a2a3/runtime/host_build_graph/build_config.py
+++ b/src/a2a3/runtime/host_build_graph/build_config.py
@@ -11,7 +11,7 @@
 
 BUILD_CONFIG = {
     "aicore": {"include_dirs": ["runtime"], "source_dirs": ["aicore", "runtime"]},
-    "aicpu": {"include_dirs": ["runtime"], "source_dirs": ["aicpu", "runtime"]},
-    "host": {"include_dirs": ["runtime", "orchestration"], "source_dirs": ["host", "runtime"]},
+    "aicpu": {"include_dirs": ["runtime", ".."], "source_dirs": ["aicpu", "runtime"]},
+    "host": {"include_dirs": ["runtime", "orchestration", ".."], "source_dirs": ["host", "runtime"]},
     "orchestration": {"include_dirs": ["runtime", "orchestration"], "source_dirs": []},
 }
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h
index 7e6d00b99..4d4bb9313 100644
--- a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h
@@ -20,4 +20,13 @@
 #define PTO2_PROFILING 1
 #endif
 
+// =============================================================================
+// Tensor Dump Configuration
+// =============================================================================
+
+// Tensor dump uses these defaults to size its selective mask table so task-id
+// ring/slot lookup stays aligned with PTO2 task id layout.
+#define PTO2_TASK_WINDOW_SIZE 16384  // Default per-ring task window size (power of 2)
+#define PTO2_MAX_RING_DEPTH 4        // Number of task-id ring layers
+
 #endif  // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py b/src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py
index 48881f9dc..2cfba734d 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py
@@ -19,8 +19,8 @@
 # by the Tensor constructor's validation logic).
 
 BUILD_CONFIG = {
-    "aicore": {"include_dirs": ["runtime", "common"], "source_dirs": ["aicore", "orchestration"]},
-    "aicpu": {"include_dirs": ["runtime", "common"], "source_dirs": ["aicpu", "runtime", "orchestration"]},
-    "host": {"include_dirs": ["runtime", "common"], "source_dirs": ["host", "runtime/shared", "orchestration"]},
-    "orchestration": {"include_dirs": ["runtime", "orchestration", "common"], "source_dirs": ["orchestration"]},
+    "aicore": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicore", "orchestration"]},
+    "aicpu": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicpu", "runtime", "orchestration"]},
+    "host": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["host", "runtime/shared", "orchestration"]},
+    "orchestration": {"include_dirs": ["runtime", "orchestration", "common", ".."], "source_dirs": ["orchestration"]},
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
index 6c0640a40..eabe3ec3f 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
@@ -138,6 +138,8 @@ struct PTO2Runtime {
     PTO2ScopeMode pending_scope_mode;
 };
 
+static inline void enable_dump_tensor_selective() { set_tensor_dump_selective_requested(true); }
+
 // =============================================================================
 // Inline Convenience Wrappers (call through ops table)
 // =============================================================================
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index 321d66d2b..813f1f846 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -36,6 +36,9 @@
 #include "pto_types.h"
 #include "tensor.h"
 
+extern "C" void set_dump_tensor_selective_mode(bool enable);
+extern "C" void set_dump_tensor_task_mask(uint64_t task_id, uint64_t mask);
+
 // Verify the captured Tensor blob size in DepGenRecord matches the runtime
 // Tensor layout. The platform header defines DEP_GEN_TENSOR_SIZE without
 // including runtime/tensor.h, so this check lives at the orch callsite.
@@ -674,6 +677,14 @@ static TaskOutputTensors submit_task_common(
     }
 
     payload.init(args, result, prepared.alloc_result, layout);
+#if PTO2_PROFILING
+    if (args.tensor_dump_selective_requested()) {
+        set_dump_tensor_selective_mode(true);
+    }
+    if (args.tensor_dump_arg_mask() != 0) {
+        set_dump_tensor_task_mask(task_id.raw, args.tensor_dump_arg_mask());
+    }
+#endif
 
     CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, AicpuPhaseId::ORCH_PARAMS, task_id.raw);
 #if PTO2_ORCH_PROFILING
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h
index 22dbae0af..79743f89e 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h
@@ -45,6 +45,15 @@
 #define MAX_TENSOR_ARGS CORE_MAX_TENSOR_ARGS
 #define MAX_SCALAR_ARGS CORE_MAX_SCALAR_ARGS
 
+inline bool &tensor_dump_selective_requested_ref() {
+    static bool requested = false;  // function-local flag behind the setter/getter below; starts disabled
+    return requested;  // NOTE(review): no caller visible in this patch ever sets it back to false — confirm
+}
+
+inline void set_tensor_dump_selective_requested(bool enable) { tensor_dump_selective_requested_ref() = enable; }
+
+inline bool is_tensor_dump_selective_requested() { return tensor_dump_selective_requested_ref(); }
+
 typedef enum {
     ASYNC_ENGINE_SDMA = 0,
     ASYNC_ENGINE_ROCE = 1,
@@ -180,6 +189,8 @@ struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS,
         clear();
         has_error = false;
         error_msg = nullptr;
+        tensor_dump_arg_mask_ = 0;
+        tensor_dump_selective_requested_ = is_tensor_dump_selective_requested();
         explicit_deps_ = nullptr;
         explicit_dep_count_ = 0;
     }
@@ -191,6 +202,25 @@ struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS,
         }
     }
 
+    template <typename... Args>
+    void dump(Args &&...args) {
+        static_assert(sizeof...(Args) >= 1, "dump: at least one tensor argument required");
+        static_assert(
+            (std::is_lvalue_reference_v<Args> && ...),  // arguments are looked up by address, so lvalues only
+            "dump: temporaries are not allowed — pass tensors already added to this Arg"
+        );
+        static_assert(
+            ((std::is_same_v<std::decay_t<Args>, Tensor> || std::is_same_v<std::decay_t<Args>, TensorCreateInfo>) &&
+             ...),
+            "dump: all arguments must be Tensor or TensorCreateInfo"
+        );
+        tensor_dump_selective_requested_ = is_tensor_dump_selective_requested();  // re-read the global request flag
+        (mark_tensor_dump_arg(args), ...);  // sets one mask bit per argument, or set_error() if it is not in this Arg
+    }
+
+    uint64_t tensor_dump_arg_mask() const { return tensor_dump_arg_mask_; }
+    bool tensor_dump_selective_requested() const { return tensor_dump_selective_requested_; }
+
     template <typename... Args>
     void add_input(Args &&...args) {
         if (!check_add_tensor_valid<false>(args...)) {
@@ -354,9 +384,32 @@ struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS,
 
 private:
+    static_assert(MAX_TENSOR_ARGS <= 64, "tensor dump arg mask assumes at most 64 tensor arguments");
+    uint64_t tensor_dump_arg_mask_{0};  // bit i set => tensors_[i] was selected through dump()
+    bool tensor_dump_selective_requested_{is_tensor_dump_selective_requested()};
     // Caller-owned dependency array; lifetime must extend through submit.
     const PTO2TaskId *explicit_deps_{nullptr};
     uint32_t explicit_dep_count_{0};
 
+    void mark_tensor_dump_arg(const Tensor &tensor) {
+        for (int32_t i = 0; i < tensor_count_; i++) {
+            if (tags_[i] != TensorArgType::OUTPUT && tensors_[i].ptr == &tensor) {  // matched by address, not value
+                tensor_dump_arg_mask_ |= (uint64_t{1} << i);  // bit i <-> argument slot i
+                return;  // only the first matching slot is selected
+            }
+        }
+        set_error("dump: tensor is not part of this Arg");
+    }
+
+    void mark_tensor_dump_arg(const TensorCreateInfo &create_info) {
+        for (int32_t i = 0; i < tensor_count_; i++) {
+            if (tags_[i] == TensorArgType::OUTPUT && tensors_[i].create_info == &create_info) {  // matched by address
+                tensor_dump_arg_mask_ |= (uint64_t{1} << i);  // bit i <-> argument slot i
+                return;  // only the first matching slot is selected
+            }
+        }
+        set_error("dump: TensorCreateInfo is not part of this Arg");
+    }
+
     template <bool is_output, typename... Args>
     bool check_add_tensor_valid(Args &&...) {
         static_assert(sizeof...(Args) >= 1, "at least one argument required");
diff --git a/src/a5/platform/include/aicpu/tensor_dump_aicpu.h b/src/a5/platform/include/aicpu/tensor_dump_aicpu.h
index 2afe1b410..5d124b082 100644
--- a/src/a5/platform/include/aicpu/tensor_dump_aicpu.h
+++ b/src/a5/platform/include/aicpu/tensor_dump_aicpu.h
@@ -20,7 +20,15 @@
 #ifndef SRC_A5_PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_
 #define SRC_A5_PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_
 
+#include <stdint.h>
+
+#ifndef __cplusplus
+#include <stdbool.h>
+#endif
+
+#ifdef __cplusplus
 #include <cinttypes>
+#endif
 
 #include "common/memory_barrier.h"
 #include "common/tensor_dump.h"
@@ -64,6 +72,10 @@ void set_dump_tensor_enabled(bool enable);
  * @return true if tensor dump is enabled
  */
 bool is_dump_tensor_enabled();
+void set_dump_tensor_selective_mode(bool enable);
+bool is_dump_tensor_selective_mode();
+void set_dump_tensor_task_mask(uint64_t task_id, TensorDumpArgMask mask);
+TensorDumpArgMask get_dump_tensor_task_mask(uint64_t task_id);
 
 #ifdef __cplusplus
 }
@@ -73,6 +85,8 @@ bool is_dump_tensor_enabled();
 bool get_tensor_dump_role_from_direction(ArgDirection dir, TensorDumpRole *role);
 int32_t count_callable_tensor_args(const CoreCallable &callable);
 bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage);
+bool should_dump_task(TensorDumpArgMask arg_mask);
+bool should_dump_tensor_arg(TensorDumpArgMask arg_mask, int32_t arg_index);
 bool try_log_tensor_dump_layout_mismatch();
 int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);
 
@@ -82,6 +96,13 @@ inline void dump_tensors_for_task(
     GetFunctionBinAddrFn get_function_bin_addr
 ) {
     const auto &pl = *slot_state.payload;
+    TensorDumpArgMask dump_arg_mask = TENSOR_DUMP_ARG_MASK_NONE;
+    if (is_dump_tensor_selective_mode()) {
+        dump_arg_mask = get_dump_tensor_task_mask(slot_state.task->task_id.raw);
+    }
+    if (!should_dump_task(dump_arg_mask)) {
+        return;
+    }
     const CoreCallable *callables[MaxSubtaskSlots] = {};
     int32_t total_tensor_args = 0;
 
@@ -125,7 +146,8 @@ inline void dump_tensors_for_task(
                 continue;
             }
             TensorDumpRole role;
-            if (get_tensor_dump_role_from_direction(dir, &role) && should_dump_tensor_at_stage(role, stage)) {
+            if (get_tensor_dump_role_from_direction(dir, &role) && should_dump_tensor_at_stage(role, stage) &&
+                should_dump_tensor_arg(dump_arg_mask, payload_index)) {
                 const auto &t = pl.tensors[payload_index];
                 TensorDumpInfo info = {};
                 info.buffer_addr = t.buffer.addr;
@@ -241,7 +263,6 @@ inline void dump_tensors_for_task(
         tensor_arg_index++;
     }
 }
-#endif
 
 /**
  * Initialize tensor dump.
@@ -279,4 +300,6 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);
  */
 void dump_tensor_flush(int thread_idx);
 
+#endif
+
 #endif  // SRC_A5_PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_
diff --git a/src/a5/platform/include/common/tensor_dump.h b/src/a5/platform/include/common/tensor_dump.h
index d774ecfe5..9b7c64433 100644
--- a/src/a5/platform/include/common/tensor_dump.h
+++ b/src/a5/platform/include/common/tensor_dump.h
@@ -48,6 +48,7 @@
 #include <cstdint>
 
 #include "common/platform_config.h"
+#include "host_build_graph/runtime/pto_runtime2_types.h"
 
 // =============================================================================
 // Constants
@@ -74,6 +75,17 @@ enum class TensorDumpStage : uint8_t {
     AFTER_COMPLETION = 1,
 };
 
+using TensorDumpArgMask = uint64_t;
+
+// Per-task selection bitmask recorded by orchestration; bit N selects tensors[N].
+// A zero mask means "nothing selected": in selective mode such a task is skipped,
+// while in legacy mode the mask is not consulted and every tensor is dumped.
+constexpr TensorDumpArgMask TENSOR_DUMP_ARG_MASK_NONE = 0;
+constexpr uint32_t TENSOR_DUMP_ARG_MASK_BITS = 64;  // bit width of TensorDumpArgMask
+constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_RINGS = PTO2_MAX_RING_DEPTH;
+constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_SLOTS = PTO2_TASK_WINDOW_SIZE;
+constexpr uint32_t TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK = TENSOR_DUMP_MASK_POOL_MAX_SLOTS - 1;
+
 // =============================================================================
 // TensorDumpRecord - Single Tensor Dump Entry (128B = 2 cache lines)
 // =============================================================================
diff --git a/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp b/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp
index 4aff271f4..33c04f783 100644
--- a/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp
+++ b/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp
@@ -21,6 +21,7 @@
 
 #include "aicpu/tensor_dump_aicpu.h"
 
+#include <cstdlib>
 #include <cstring>
 
 #include "common/memory_barrier.h"
@@ -54,11 +55,99 @@ extern "C" void set_platform_dump_base(uint64_t dump_data_base) { g_platform_dum
 extern "C" uint64_t get_platform_dump_base() { return g_platform_dump_base; }
 
 static bool g_enable_dump_tensor = false;
+static bool g_dump_tensor_selective_mode = false;
+struct DumpTaskMaskEntry {
+    uint64_t task_id;
+    TensorDumpArgMask mask;
+};
+static constexpr uint64_t DUMP_TASK_MASK_EMPTY_TASK_ID = UINT64_MAX;
+static constexpr uint32_t DUMP_TASK_MASK_TABLE_CAPACITY = 32768;
+static DumpTaskMaskEntry *g_dump_mask_table = nullptr;
+// Lazily allocate the mask table with every entry empty; false (logged) only on allocation failure.
+static bool ensure_dump_mask_table() {
+    if (g_dump_mask_table != nullptr) return true;
+    constexpr size_t kTableBytes = sizeof(DumpTaskMaskEntry) * DUMP_TASK_MASK_TABLE_CAPACITY;
+    auto *table = static_cast<DumpTaskMaskEntry *>(malloc(kTableBytes));
+    if (table == nullptr) {
+        LOG_ERROR("Failed to allocate tensor dump selective mask table");
+        return false;
+    }
+    for (DumpTaskMaskEntry *it = table; it != table + DUMP_TASK_MASK_TABLE_CAPACITY; ++it) {
+        it->task_id = DUMP_TASK_MASK_EMPTY_TASK_ID;
+        it->mask = TENSOR_DUMP_ARG_MASK_NONE;
+    }
+    g_dump_mask_table = table;
+    return true;
+}
 
-extern "C" void set_dump_tensor_enabled(bool enable) { g_enable_dump_tensor = enable; }
+// Mark every entry of an already-allocated mask table empty; no-op before the first allocation.
+static void clear_dump_mask_table() {
+    if (g_dump_mask_table == nullptr) return;
+    DumpTaskMaskEntry *const end = g_dump_mask_table + DUMP_TASK_MASK_TABLE_CAPACITY;
+    for (DumpTaskMaskEntry *entry = g_dump_mask_table; entry != end; ++entry) {
+        entry->task_id = DUMP_TASK_MASK_EMPTY_TASK_ID;
+        entry->mask = TENSOR_DUMP_ARG_MASK_NONE;
+    }
+}
+
+extern "C" void set_dump_tensor_enabled(bool enable) {
+    g_enable_dump_tensor = enable;
+    g_dump_tensor_selective_mode = false;  // every enable/disable starts over in legacy (dump-all) mode
+    clear_dump_mask_table();               // drop per-task masks recorded before this call
+}
 
 extern "C" bool is_dump_tensor_enabled() { return g_enable_dump_tensor; }
 
+extern "C" void set_dump_tensor_selective_mode(bool enable) { g_dump_tensor_selective_mode = enable; }
+
+extern "C" bool is_dump_tensor_selective_mode() { return g_dump_tensor_selective_mode; }
+
+// Record `mask` for `task_id` by linear probing from the task's ring/slot home index. Empty masks are
+// not stored, and nothing here evicts single entries: they stay until clear_dump_mask_table() runs.
+extern "C" void set_dump_tensor_task_mask(uint64_t task_id, TensorDumpArgMask mask) {
+    if (mask == TENSOR_DUMP_ARG_MASK_NONE) return;
+    const uint32_t ring_id = static_cast<uint32_t>(task_id >> 32);
+    if (ring_id >= TENSOR_DUMP_MASK_POOL_MAX_RINGS) {
+        // Report the drop: in selective mode this task will be skipped entirely.
+        LOG_ERROR("tensor dump selective mask dropped: task ring id out of range");
+        return;
+    }
+    if (!ensure_dump_mask_table()) return;
+    const uint32_t slot = static_cast<uint32_t>(task_id) & TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK;
+    const uint32_t home = (ring_id * TENSOR_DUMP_MASK_POOL_MAX_SLOTS + slot) & (DUMP_TASK_MASK_TABLE_CAPACITY - 1);
+    for (uint32_t probe = 0; probe < DUMP_TASK_MASK_TABLE_CAPACITY; probe++) {
+        DumpTaskMaskEntry &entry = g_dump_mask_table[(home + probe) & (DUMP_TASK_MASK_TABLE_CAPACITY - 1)];
+        if (entry.task_id == DUMP_TASK_MASK_EMPTY_TASK_ID || entry.task_id == task_id) {
+            entry.task_id = task_id;
+            entry.mask = mask;
+            return;
+        }
+    }
+    LOG_ERROR("tensor dump selective mask table is full");
+}
+
+// Look up the mask recorded for `task_id`; TENSOR_DUMP_ARG_MASK_NONE when the task has no entry.
+extern "C" TensorDumpArgMask get_dump_tensor_task_mask(uint64_t task_id) {
+    const uint32_t ring_id = static_cast<uint32_t>(task_id >> 32);
+    if (g_dump_mask_table == nullptr || ring_id >= TENSOR_DUMP_MASK_POOL_MAX_RINGS) {
+        return TENSOR_DUMP_ARG_MASK_NONE;
+    }
+    constexpr uint32_t kWrap = DUMP_TASK_MASK_TABLE_CAPACITY - 1;
+    const uint32_t slot = static_cast<uint32_t>(task_id) & TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK;
+    uint32_t cursor = (ring_id * TENSOR_DUMP_MASK_POOL_MAX_SLOTS + slot) & kWrap;
+    TensorDumpArgMask found = TENSOR_DUMP_ARG_MASK_NONE;
+    for (uint32_t visited = 0; visited < DUMP_TASK_MASK_TABLE_CAPACITY; ++visited, cursor = (cursor + 1) & kWrap) {
+        const DumpTaskMaskEntry &entry = g_dump_mask_table[cursor];
+        if (entry.task_id == task_id) {
+            found = entry.mask;
+            break;
+        }
+        // An empty slot ends the probe chain: the task was never recorded.
+        if (entry.task_id == DUMP_TASK_MASK_EMPTY_TASK_ID) break;
+    }
+    return found;
+}
+
 bool get_tensor_dump_role_from_direction(ArgDirection dir, TensorDumpRole *role) {
     switch (dir) {
     case ArgDirection::IN:
@@ -98,6 +187,21 @@ bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage) {
     return false;
 }
 
+bool should_dump_task(TensorDumpArgMask arg_mask) {  // Task-level dump filter.
+    if (!is_dump_tensor_selective_mode()) {  // selective mode off: every task qualifies
+        return true;
+    }
+    return arg_mask != TENSOR_DUMP_ARG_MASK_NONE;  // selective mode on: the task needs a non-empty mask
+}
+
+bool should_dump_tensor_arg(TensorDumpArgMask arg_mask, int32_t arg_index) {  // Per-argument dump filter.
+    if (!is_dump_tensor_selective_mode()) {  // selective mode off: every argument qualifies
+        return true;
+    }
+    if (arg_index < 0 || arg_index >= static_cast<int32_t>(TENSOR_DUMP_ARG_MASK_BITS)) return false;
+    return (arg_mask & (TensorDumpArgMask{1} << arg_index)) != 0;  // test bit arg_index of the mask
+}
+
 bool try_log_tensor_dump_layout_mismatch() {
     if (s_logged_dump_layout_mismatch) {
         return false;
diff --git a/src/a5/runtime/host_build_graph/build_config.py b/src/a5/runtime/host_build_graph/build_config.py
index 76e7face5..a1c96c8fa 100644
--- a/src/a5/runtime/host_build_graph/build_config.py
+++ b/src/a5/runtime/host_build_graph/build_config.py
@@ -11,7 +11,7 @@
 
 BUILD_CONFIG = {
     "aicore": {"include_dirs": ["runtime"], "source_dirs": ["aicore", "runtime"]},
-    "aicpu": {"include_dirs": ["runtime"], "source_dirs": ["aicpu", "runtime"]},
-    "host": {"include_dirs": ["runtime", "orchestration"], "source_dirs": ["host", "runtime"]},
+    "aicpu": {"include_dirs": ["runtime", ".."], "source_dirs": ["aicpu", "runtime"]},
+    "host": {"include_dirs": ["runtime", "orchestration", ".."], "source_dirs": ["host", "runtime"]},
     "orchestration": {"include_dirs": ["runtime", "orchestration"], "source_dirs": []},
 }
diff --git a/src/a5/runtime/host_build_graph/runtime/pto_runtime2_types.h b/src/a5/runtime/host_build_graph/runtime/pto_runtime2_types.h
new file mode 100644
index 000000000..5bf0057c6
--- /dev/null
+++ b/src/a5/runtime/host_build_graph/runtime/pto_runtime2_types.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
+#define SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
+
+// =============================================================================
+// Profiling Configuration
+// =============================================================================
+
+#ifndef PTO2_PROFILING  // keep a value defined earlier, if any
+#define PTO2_PROFILING 1
+#endif
+
+// =============================================================================
+// Tensor Dump Configuration
+// =============================================================================
+
+// Tensor dump uses these defaults to size its selective mask table so task-id
+// ring/slot lookup stays aligned with PTO2 task id layout.
+#define PTO2_TASK_WINDOW_SIZE 16384  // Default per-ring task window size (power of 2)
+#define PTO2_MAX_RING_DEPTH 4        // Number of task-id ring layers
+
+#endif  // SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/build_config.py b/src/a5/runtime/tensormap_and_ringbuffer/build_config.py
index 48881f9dc..2cfba734d 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/build_config.py
+++ b/src/a5/runtime/tensormap_and_ringbuffer/build_config.py
@@ -19,8 +19,8 @@
 # by the Tensor constructor's validation logic).
 
 BUILD_CONFIG = {
-    "aicore": {"include_dirs": ["runtime", "common"], "source_dirs": ["aicore", "orchestration"]},
-    "aicpu": {"include_dirs": ["runtime", "common"], "source_dirs": ["aicpu", "runtime", "orchestration"]},
-    "host": {"include_dirs": ["runtime", "common"], "source_dirs": ["host", "runtime/shared", "orchestration"]},
-    "orchestration": {"include_dirs": ["runtime", "orchestration", "common"], "source_dirs": ["orchestration"]},
+    "aicore": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicore", "orchestration"]},
+    "aicpu": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicpu", "runtime", "orchestration"]},
+    "host": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["host", "runtime/shared", "orchestration"]},
+    "orchestration": {"include_dirs": ["runtime", "orchestration", "common", ".."], "source_dirs": ["orchestration"]},
 }
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
index 1f7116f67..f71766618 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
@@ -138,6 +138,8 @@ struct PTO2Runtime {
     PTO2ScopeMode pending_scope_mode;
 };
 
+static inline void enable_dump_tensor_selective() { set_tensor_dump_selective_requested(true); }  // opt-in switch
+
 // =============================================================================
 // Inline Convenience Wrappers (call through ops table)
 // =============================================================================
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index c97e2efce..05ac105a8 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -34,6 +34,9 @@
 #include "pto_types.h"
 #include "tensor.h"
 
+extern "C" void set_dump_tensor_selective_mode(bool enable);
+extern "C" void set_dump_tensor_task_mask(uint64_t task_id, uint64_t mask);
+
 // =============================================================================
 // Orchestrator Profiling (compile-time toggle)
 // =============================================================================
@@ -622,6 +625,14 @@ static TaskOutputTensors submit_task_common(
     }
 
     payload.init(args, result, prepared.alloc_result, layout);
+#if PTO2_PROFILING
+    if (args.tensor_dump_selective_requested()) {
+        set_dump_tensor_selective_mode(true);
+    }
+    if (args.tensor_dump_arg_mask() != 0) {
+        set_dump_tensor_task_mask(task_id.raw, args.tensor_dump_arg_mask());
+    }
+#endif
 
     CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, AicpuPhaseId::ORCH_PARAMS, task_id.raw);
 #if PTO2_ORCH_PROFILING
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h
index 31ca1c1fa..b0ab9bb55 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h
@@ -47,6 +47,15 @@
 #define MAX_TENSOR_ARGS CORE_MAX_TENSOR_ARGS
 #define MAX_SCALAR_ARGS CORE_MAX_SCALAR_ARGS
 
+inline bool &tensor_dump_selective_requested_ref() {  // Storage for the selective-dump request flag.
+    static bool requested = false;  // function-local static, initially false
+    return requested;  // returned by reference so the setter below can assign through it
+}
+
+inline void set_tensor_dump_selective_requested(bool enable) { tensor_dump_selective_requested_ref() = enable; }
+
+inline bool is_tensor_dump_selective_requested() { return tensor_dump_selective_requested_ref(); }
+
 typedef enum {
     ASYNC_ENGINE_SDMA = 0,
     ASYNC_ENGINE_ROCE = 1,
@@ -178,6 +187,8 @@ struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS,
 
     void clear() {
         TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS, TensorArgType>::clear();
+        tensor_dump_arg_mask_ = 0;
+        tensor_dump_selective_requested_ = is_tensor_dump_selective_requested();
         explicit_deps_ = nullptr;
         explicit_dep_count_ = 0;
     }
@@ -195,6 +206,25 @@ struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS,
         }
     }
 
+    template <typename... Args>
+    void dump(Args &&...args) {  // Select tensors of this Arg for dumping; each one must already be added.
+        static_assert(sizeof...(Args) >= 1, "dump: at least one tensor argument required");
+        static_assert(
+            (std::is_lvalue_reference_v<Args> && ...),
+            "dump: temporaries are not allowed — pass tensors already added to this Arg"
+        );
+        static_assert(
+            ((std::is_same_v<std::decay_t<Args>, Tensor> || std::is_same_v<std::decay_t<Args>, TensorCreateInfo>) &&
+             ...),
+            "dump: all arguments must be Tensor or TensorCreateInfo"
+        );
+        tensor_dump_selective_requested_ = is_tensor_dump_selective_requested();  // re-read the request flag
+        (mark_tensor_dump_arg(args), ...);  // fold expression: one mark_tensor_dump_arg call per argument
+    }
+
+    uint64_t tensor_dump_arg_mask() const { return tensor_dump_arg_mask_; }  // bit i set => tensor slot i selected
+    bool tensor_dump_selective_requested() const { return tensor_dump_selective_requested_; }  // cached flag
+
     template <typename... Args>
     void add_input(Args &&...args) {
         if (!check_add_tensor_valid<false>(std::forward<Args>(args)...)) {
@@ -358,9 +388,32 @@ struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS,
 
 private:
     // Caller-owned dependency array; lifetime must extend through submit.
+    static_assert(MAX_TENSOR_ARGS <= 64, "tensor dump arg mask assumes at most 64 tensor arguments");
+    uint64_t tensor_dump_arg_mask_{0};
+    bool tensor_dump_selective_requested_{is_tensor_dump_selective_requested()};
     const PTO2TaskId *explicit_deps_{nullptr};
     uint32_t explicit_dep_count_{0};
 
+    void mark_tensor_dump_arg(const Tensor &tensor) {  // Set the mask bit of the slot holding this Tensor.
+        for (int32_t i = 0; i < tensor_count_; i++) {
+            if (tags_[i] != TensorArgType::OUTPUT && tensors_[i].ptr == &tensor) {  // matched by address
+                tensor_dump_arg_mask_ |= (uint64_t{1} << i);
+                return;  // stops at the first matching slot
+            }
+        }
+        set_error("dump: tensor is not part of this Arg");  // no non-OUTPUT slot holds this address
+    }
+
+    void mark_tensor_dump_arg(const TensorCreateInfo &create_info) {  // Same, for OUTPUT slots built from create_info.
+        for (int32_t i = 0; i < tensor_count_; i++) {
+            if (tags_[i] == TensorArgType::OUTPUT && tensors_[i].create_info == &create_info) {  // matched by address
+                tensor_dump_arg_mask_ |= (uint64_t{1} << i);
+                return;  // stops at the first matching slot
+            }
+        }
+        set_error("dump: TensorCreateInfo is not part of this Arg");  // no OUTPUT slot holds this address
+    }
+
     template <bool is_output, typename... Args>
     bool check_add_tensor_valid(Args &&...) {
         static_assert(sizeof...(Args) >= 1, "at least one argument required");
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/kernels/orchestration/partial_dump_orch.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/kernels/orchestration/partial_dump_orch.cpp
new file mode 100644
index 000000000..0a60aaf1b
--- /dev/null
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/kernels/orchestration/partial_dump_orch.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
+
+extern "C" {
+
+__attribute__((visibility("default"))) PTO2OrchestrationConfig
+aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
+    (void)orch_args;  // unused: the config does not depend on the arguments
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 3,  // the entry point reads orch_args.tensor(0), (1) and (2)
+    };
+}
+
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) {
+    enable_dump_tensor_selective();  // requested before any Arg below is constructed
+
+    Tensor ext_a = from_tensor_arg(orch_args.tensor(0));
+    Tensor ext_b = from_tensor_arg(orch_args.tensor(1));
+    Tensor ext_f = from_tensor_arg(orch_args.tensor(2));
+
+    uint32_t size = orch_args.tensor(0).shapes[0];
+    uint32_t inter_shapes[1] = {size};
+    TensorCreateInfo inter_ci(inter_shapes, 1, DataType::FLOAT32);  // reused for every intermediate output
+
+    Arg params_t0;  // t0: no dump() call
+    params_t0.add_input(ext_a);
+    params_t0.add_input(ext_b);
+    params_t0.add_output(inter_ci);
+    TaskOutputTensors outs_t0 = rt_submit_aiv_task(0, params_t0);
+    const Tensor &c = outs_t0.get_ref(0);
+
+    PTO2_SCOPE() {
+        Arg params_t1;  // t1: no dump() call
+        params_t1.add_input(c);
+        params_t1.add_output(inter_ci);
+        params_t1.add_scalar(1.0f);
+        params_t1.add_scalar(3u);
+        TaskOutputTensors outs_t1 = rt_submit_aiv_task(1, params_t1);
+        const Tensor &d = outs_t1.get_ref(0);
+
+        Arg params_t2;  // t2: no dump() call
+        params_t2.add_input(c);
+        params_t2.add_output(inter_ci);
+        params_t2.add_scalar(2.0f);
+        params_t2.add_scalar(3u);
+        TaskOutputTensors outs_t2 = rt_submit_aiv_task(1, params_t2);
+        const Tensor &e = outs_t2.get_ref(0);
+
+        Arg params_t3;  // t3: no dump() call
+        params_t3.add_input(d);
+        params_t3.add_input(e);
+        params_t3.add_output(inter_ci);
+        params_t3.add_scalar(3u);
+        TaskOutputTensors outs_t3 = rt_submit_aiv_task(2, params_t3);
+        const Tensor &g = outs_t3.get_ref(0);
+
+        Arg params_t4;  // t4: the only task with a dump() selection
+        params_t4.add_input(g);
+        params_t4.add_input(c);
+        params_t4.add_output(ext_f);
+        params_t4.dump(g, c, ext_f);  // select all three tensor arguments of t4
+        rt_submit_aiv_task(0, params_t4);
+    }
+}
+
+}  // extern "C"
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/test_tensor_dump.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/test_tensor_dump.py
index ce5604a08..19b0f7155 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/test_tensor_dump.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/test_tensor_dump.py
@@ -122,5 +122,46 @@ def _validate_dump_artifact(self, case):
         )
 
 
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestTensorDumpPartial(SceneTestCase):
+    """Vector example with one task explicitly selected for tensor dump."""
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/partial_dump_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT],
+        },
+        "incores": TestTensorDump.CALLABLE["incores"],  # same kernels as the full-dump test
+    }
+
+    CASES = TestTensorDump.CASES  # same cases as the full-dump test
+
+    def generate_args(self, params):
+        return TestTensorDump.generate_args(self, params)  # delegate to the full-dump test
+
+    def compute_golden(self, args, params):
+        return TestTensorDump.compute_golden(self, args, params)  # delegate to the full-dump test
+
+    def test_run(self, st_platform, st_worker, request):
+        super().test_run(st_platform, st_worker, request)
+        if not request.config.getoption("--dump-tensor", default=False):  # manifest checks need --dump-tensor
+            return
+        safe_label = _sanitize_for_filename("TestTensorDumpPartial_default")
+        matches = sorted(_outputs_dir().glob(f"{safe_label}_*"), key=lambda p: p.stat().st_mtime)
+        assert matches, "partial tensor dump output directory missing"
+        dump_dir = matches[-1] / "tensor_dump"  # last element = most recently modified match
+        manifest = dump_dir / "tensor_dump.json"
+        with_manifest_check = None if False else None  # placeholder removed below
+        with manifest.open() as f:
+            data = json.load(f)
+        tensors = data.get("tensors", [])
+        assert len(tensors) == 3  # the orchestration selects three tensors via dump(g, c, ext_f)
+        assert data.get("before_dispatch") == 2  # presumably the two inputs -- confirm against the dump format
+        assert data.get("after_completion") == 1  # presumably the single output -- confirm
+        assert {t["task_id"] for t in tensors} == {"0x0000000100000003"}  # every entry comes from one task
+        assert [t["role"] for t in tensors] == ["input", "input", "output"]  # order-sensitive comparison
+
+
 if __name__ == "__main__":
     SceneTestCase.run_module(__name__)