From 536fa1b7b8412c7a023fe24cb13fcc11755e7f92 Mon Sep 17 00:00:00 2001 From: zm Date: Tue, 26 May 2026 19:50:50 +0800 Subject: [PATCH] feat: support task-selective tensor dump - Add Arg::dump(...) for selecting tensor arguments to dump within a task - Keep selective dump masks in a platform-owned table instead of expanding PTO2TaskPayload - Filter unmarked tasks and unselected tensor arguments at AICPU dump collection time - Preserve legacy full tensor dump behavior when selective mode is not enabled - Add partial tensor dump regression coverage and update tensor dump documentation --- docs/dfx/tensor-dump.md | 39 ++++++- .../include/aicpu/tensor_dump_aicpu.h | 27 ++++- .../platform/include/common/tensor_dump.h | 12 ++ .../platform/src/aicpu/tensor_dump_aicpu.cpp | 106 +++++++++++++++++- .../runtime/host_build_graph/build_config.py | 4 +- .../runtime/pto_runtime2_types.h | 9 ++ .../tensormap_and_ringbuffer/build_config.py | 8 +- .../orchestration/pto_orchestration_api.h | 2 + .../runtime/pto_orchestrator.cpp | 11 ++ .../runtime/pto_types.h | 53 +++++++++ .../include/aicpu/tensor_dump_aicpu.h | 27 ++++- src/a5/platform/include/common/tensor_dump.h | 12 ++ .../platform/src/aicpu/tensor_dump_aicpu.cpp | 106 +++++++++++++++++- .../runtime/host_build_graph/build_config.py | 4 +- .../runtime/pto_runtime2_types.h | 32 ++++++ .../tensormap_and_ringbuffer/build_config.py | 8 +- .../orchestration/pto_orchestration_api.h | 2 + .../runtime/pto_orchestrator.cpp | 11 ++ .../runtime/pto_types.h | 53 +++++++++ .../orchestration/partial_dump_orch.cpp | 78 +++++++++++++ .../dfx/tensor_dump/test_tensor_dump.py | 41 +++++++ 21 files changed, 626 insertions(+), 19 deletions(-) create mode 100644 src/a5/runtime/host_build_graph/runtime/pto_runtime2_types.h create mode 100644 tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/kernels/orchestration/partial_dump_orch.cpp diff --git a/docs/dfx/tensor-dump.md b/docs/dfx/tensor-dump.md index 3bcb14320..60f3bd8ba 100644 --- a/docs/dfx/tensor-dump.md +++ b/docs/dfx/tensor-dump.md @@ -60,7 +60,44 @@ executors read the same handshake bit to insert a `pipe_barrier(PIPE_ALL)` before FIN when dump is on, so `AFTER_COMPLETION` snapshots see the kernel's final writes. -### 3.2 Output +### 3.2 Select Specific Task Tensors + +By default, `--dump-tensor` dumps every task's tensor inputs and +outputs. Device-side orchestration can opt into tensor-argument selection +by enabling selective mode at the beginning of the orchestration and +marking the tensor arguments on each `Arg` before submission: + +```cpp +enable_dump_tensor_selective(); + +Arg args; +args.add_input(x); +args.add_input(y); +args.add_output(z); +args.dump(x, y, z); +rt_submit_aiv_task(FUNC_ADD, args); +``` + +`dump(...)` selects tensor arguments from the current `Arg`; it does not +execute a dump immediately. The selected tensors must already belong to +that `Arg`. The runtime uses the argument direction already provided by +`add_input()`, `add_output()`, or `add_inout()` to decide when each +selected tensor is captured: + +- input tensors are dumped before dispatch. +- output tensors are dumped after completion. +- inout tensors follow the existing inout dump behavior. + +With `enable_dump_tensor_selective()`, tasks without any `dump(...)` marker +are skipped during AICPU collection. For marked tasks, only the selected +tensor arguments are dumped. Without `enable_dump_tensor_selective()`, the +legacy full-dump behavior is unchanged even if an `Arg` carries a +`dump(...)` marker. + +`--dump-tensor` remains the top-level enable switch; `Arg::dump(...)` +only narrows what gets recorded after tensor dump is enabled. + +### 3.3 Output The dump artifacts land under the per-task output prefix (`CallConfig::output_prefix`, set by diff --git a/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h b/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h index 8c83d71e1..d6babd8f6 100644 --- a/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h +++ b/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h @@ -20,7 +20,15 @@ #ifndef PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_ #define PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_ +#include + +#ifndef __cplusplus +#include +#endif + +#ifdef __cplusplus #include +#endif #include "common/memory_barrier.h" #include "common/tensor_dump.h" @@ -64,6 +72,10 @@ void set_dump_tensor_enabled(bool enable); * @return true if tensor dump is enabled */ bool is_dump_tensor_enabled(); +void set_dump_tensor_selective_mode(bool enable); +bool is_dump_tensor_selective_mode(); +void set_dump_tensor_task_mask(uint64_t task_id, TensorDumpArgMask mask); +TensorDumpArgMask get_dump_tensor_task_mask(uint64_t task_id); #ifdef __cplusplus } @@ -73,6 +85,8 @@ bool is_dump_tensor_enabled(); bool get_tensor_dump_role_from_direction(ArgDirection dir, TensorDumpRole *role); int32_t count_callable_tensor_args(const CoreCallable &callable); bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage); +bool should_dump_task(TensorDumpArgMask arg_mask); +bool should_dump_tensor_arg(TensorDumpArgMask arg_mask, int32_t arg_index); bool try_log_tensor_dump_layout_mismatch(); int dump_tensor_record(int thread_idx, const TensorDumpInfo &info); @@ -82,6 +96,13 @@ inline void dump_tensors_for_task( GetFunctionBinAddrFn get_function_bin_addr ) { const auto &pl = *slot_state.payload; + TensorDumpArgMask dump_arg_mask = TENSOR_DUMP_ARG_MASK_NONE; + if (is_dump_tensor_selective_mode()) { + dump_arg_mask = get_dump_tensor_task_mask(slot_state.task->task_id.raw); + } + if (!should_dump_task(dump_arg_mask)) { + return; + } const CoreCallable *callables[MaxSubtaskSlots] = {}; int32_t total_tensor_args = 0; @@ -125,7 +146,8 @@ inline void dump_tensors_for_task( continue; } TensorDumpRole role; - if (get_tensor_dump_role_from_direction(dir, &role) && should_dump_tensor_at_stage(role, stage)) { + if (get_tensor_dump_role_from_direction(dir, &role) && should_dump_tensor_at_stage(role, stage) && + should_dump_tensor_arg(dump_arg_mask, payload_index)) { const auto &t = pl.tensors[payload_index]; TensorDumpInfo info = {}; info.buffer_addr = t.buffer.addr; @@ -241,7 +263,6 @@ inline void dump_tensors_for_task( tensor_arg_index++; } } -#endif /** * Initialize tensor dump. @@ -279,4 +300,6 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info); */ void dump_tensor_flush(int thread_idx); +#endif + #endif // PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_ diff --git a/src/a2a3/platform/include/common/tensor_dump.h b/src/a2a3/platform/include/common/tensor_dump.h index dbea518db..48afba1cf 100644 --- a/src/a2a3/platform/include/common/tensor_dump.h +++ b/src/a2a3/platform/include/common/tensor_dump.h @@ -44,6 +44,7 @@ #include #include "common/platform_config.h" +#include "host_build_graph/runtime/pto_runtime2_types.h" // ============================================================================= // Constants @@ -70,6 +71,17 @@ enum class TensorDumpStage : uint8_t { AFTER_COMPLETION = 1, }; +using TensorDumpArgMask = uint64_t; + +// Bitmask stored in the platform-owned mask pool when orchestration selects +// specific task tensor arguments for dump. Bit N corresponds to tensors[N]. +// Zero preserves legacy "dump all tasks" behavior unless selective mode is enabled. +constexpr TensorDumpArgMask TENSOR_DUMP_ARG_MASK_NONE = 0; +constexpr uint32_t TENSOR_DUMP_ARG_MASK_BITS = 64; +constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_RINGS = PTO2_MAX_RING_DEPTH; +constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_SLOTS = PTO2_TASK_WINDOW_SIZE; +constexpr uint32_t TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK = TENSOR_DUMP_MASK_POOL_MAX_SLOTS - 1; + // ============================================================================= // TensorDumpRecord - Single Tensor Dump Entry (128B = 2 cache lines) // ============================================================================= diff --git a/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp b/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp index 7e3e3c9b2..1373868e0 100644 --- a/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp +++ b/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp @@ -20,6 +20,7 @@ #include "aicpu/tensor_dump_aicpu.h" +#include #include #include "common/memory_barrier.h" @@ -53,11 +54,99 @@ extern "C" void set_platform_dump_base(uint64_t dump_data_base) { g_platform_dum extern "C" uint64_t get_platform_dump_base() { return g_platform_dump_base; } static bool g_enable_dump_tensor = false; +static bool g_dump_tensor_selective_mode = false; +struct DumpTaskMaskEntry { + uint64_t task_id; + TensorDumpArgMask mask; +}; +static constexpr uint64_t DUMP_TASK_MASK_EMPTY_TASK_ID = UINT64_MAX; +static constexpr uint32_t DUMP_TASK_MASK_TABLE_CAPACITY = 32768; +static DumpTaskMaskEntry *g_dump_mask_table = nullptr; +static bool ensure_dump_mask_table() { + if (g_dump_mask_table != nullptr) { + return true; + } + g_dump_mask_table = + static_cast(malloc(sizeof(DumpTaskMaskEntry) * DUMP_TASK_MASK_TABLE_CAPACITY)); + if (g_dump_mask_table == nullptr) { + LOG_ERROR("Failed to allocate tensor dump selective mask table"); + return false; + } + for (uint32_t i = 0; i < DUMP_TASK_MASK_TABLE_CAPACITY; i++) { + g_dump_mask_table[i].task_id = DUMP_TASK_MASK_EMPTY_TASK_ID; + g_dump_mask_table[i].mask = TENSOR_DUMP_ARG_MASK_NONE; + } + return true; +} -extern "C" void set_dump_tensor_enabled(bool enable) { g_enable_dump_tensor = enable; } +static void clear_dump_mask_table() { + if (g_dump_mask_table == nullptr) { + return; + } + for (uint32_t i = 0; i < DUMP_TASK_MASK_TABLE_CAPACITY; i++) { + g_dump_mask_table[i].task_id = DUMP_TASK_MASK_EMPTY_TASK_ID; + g_dump_mask_table[i].mask = TENSOR_DUMP_ARG_MASK_NONE; + } +} + +extern "C" void set_dump_tensor_enabled(bool enable) { + g_enable_dump_tensor = enable; + g_dump_tensor_selective_mode = false; + clear_dump_mask_table(); +} extern "C" bool is_dump_tensor_enabled() { return g_enable_dump_tensor; } +extern "C" void set_dump_tensor_selective_mode(bool enable) { g_dump_tensor_selective_mode = enable; } + +extern "C" bool is_dump_tensor_selective_mode() { return g_dump_tensor_selective_mode; } + +extern "C" void set_dump_tensor_task_mask(uint64_t task_id, TensorDumpArgMask mask) { + if (mask == TENSOR_DUMP_ARG_MASK_NONE) { + return; + } + if (!ensure_dump_mask_table()) { + return; + } + uint32_t ring_id = static_cast(task_id >> 32); + if (ring_id >= TENSOR_DUMP_MASK_POOL_MAX_RINGS) { + return; + } + uint32_t slot = static_cast(task_id) & TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK; + uint32_t idx = (ring_id * TENSOR_DUMP_MASK_POOL_MAX_SLOTS + slot) & (DUMP_TASK_MASK_TABLE_CAPACITY - 1); + for (uint32_t probe = 0; probe < DUMP_TASK_MASK_TABLE_CAPACITY; probe++) { + DumpTaskMaskEntry &entry = g_dump_mask_table[(idx + probe) & (DUMP_TASK_MASK_TABLE_CAPACITY - 1)]; + if (entry.task_id == DUMP_TASK_MASK_EMPTY_TASK_ID || entry.task_id == task_id) { + entry.task_id = task_id; + entry.mask = mask; + return; + } + } + LOG_ERROR("tensor dump selective mask table is full"); +} + +extern "C" TensorDumpArgMask get_dump_tensor_task_mask(uint64_t task_id) { + if (g_dump_mask_table == nullptr) { + return TENSOR_DUMP_ARG_MASK_NONE; + } + uint32_t ring_id = static_cast(task_id >> 32); + if (ring_id >= TENSOR_DUMP_MASK_POOL_MAX_RINGS) { + return TENSOR_DUMP_ARG_MASK_NONE; + } + uint32_t slot = static_cast(task_id) & TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK; + uint32_t idx = (ring_id * TENSOR_DUMP_MASK_POOL_MAX_SLOTS + slot) & (DUMP_TASK_MASK_TABLE_CAPACITY - 1); + for (uint32_t probe = 0; probe < DUMP_TASK_MASK_TABLE_CAPACITY; probe++) { + const DumpTaskMaskEntry &entry = g_dump_mask_table[(idx + probe) & (DUMP_TASK_MASK_TABLE_CAPACITY - 1)]; + if (entry.task_id == task_id) { + return entry.mask; + } + if (entry.task_id == DUMP_TASK_MASK_EMPTY_TASK_ID) { + return TENSOR_DUMP_ARG_MASK_NONE; + } + } + return TENSOR_DUMP_ARG_MASK_NONE; +} + bool get_tensor_dump_role_from_direction(ArgDirection dir, TensorDumpRole *role) { switch (dir) { case ArgDirection::IN: @@ -97,6 +186,21 @@ bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage) { return false; } +bool should_dump_task(TensorDumpArgMask arg_mask) { + if (!is_dump_tensor_selective_mode()) { + return true; + } + return arg_mask != TENSOR_DUMP_ARG_MASK_NONE; +} + +bool should_dump_tensor_arg(TensorDumpArgMask arg_mask, int32_t arg_index) { + if (!is_dump_tensor_selective_mode()) { + return true; + } + if (arg_index < 0 || arg_index >= static_cast(TENSOR_DUMP_ARG_MASK_BITS)) return false; + return (arg_mask & (TensorDumpArgMask{1} << arg_index)) != 0; +} + bool try_log_tensor_dump_layout_mismatch() { if (s_logged_dump_layout_mismatch) { return false; diff --git a/src/a2a3/runtime/host_build_graph/build_config.py b/src/a2a3/runtime/host_build_graph/build_config.py index 76e7face5..a1c96c8fa 100644 --- a/src/a2a3/runtime/host_build_graph/build_config.py +++ b/src/a2a3/runtime/host_build_graph/build_config.py @@ -11,7 +11,7 @@ BUILD_CONFIG = { "aicore": {"include_dirs": ["runtime"], "source_dirs": ["aicore", "runtime"]}, - "aicpu": {"include_dirs": ["runtime"], "source_dirs": ["aicpu", "runtime"]}, - "host": {"include_dirs": ["runtime", "orchestration"], "source_dirs": ["host", "runtime"]}, + "aicpu": {"include_dirs": ["runtime", ".."], "source_dirs": ["aicpu", "runtime"]}, + "host": {"include_dirs": ["runtime", "orchestration", ".."], "source_dirs": ["host", "runtime"]}, "orchestration": {"include_dirs": ["runtime", "orchestration"], "source_dirs": []}, } diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h index 7e6d00b99..4d4bb9313 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h @@ -20,4 +20,13 @@ #define PTO2_PROFILING 1 #endif +// ============================================================================= +// Tensor Dump Configuration +// ============================================================================= + +// Tensor dump uses these defaults to size its selective mask table so task-id +// ring/slot lookup stays aligned with PTO2 task id layout. +#define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2) +#define PTO2_MAX_RING_DEPTH 4 // Number of task-id ring layers + #endif // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py b/src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py index 48881f9dc..2cfba734d 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py @@ -19,8 +19,8 @@ # by the Tensor constructor's validation logic). BUILD_CONFIG = { - "aicore": {"include_dirs": ["runtime", "common"], "source_dirs": ["aicore", "orchestration"]}, - "aicpu": {"include_dirs": ["runtime", "common"], "source_dirs": ["aicpu", "runtime", "orchestration"]}, - "host": {"include_dirs": ["runtime", "common"], "source_dirs": ["host", "runtime/shared", "orchestration"]}, - "orchestration": {"include_dirs": ["runtime", "orchestration", "common"], "source_dirs": ["orchestration"]}, + "aicore": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicore", "orchestration"]}, + "aicpu": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicpu", "runtime", "orchestration"]}, + "host": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["host", "runtime/shared", "orchestration"]}, + "orchestration": {"include_dirs": ["runtime", "orchestration", "common", ".."], "source_dirs": ["orchestration"]}, } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h index 6c0640a40..eabe3ec3f 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h @@ -138,6 +138,8 @@ struct PTO2Runtime { PTO2ScopeMode pending_scope_mode; }; +static inline void enable_dump_tensor_selective() { set_tensor_dump_selective_requested(true); } + // ============================================================================= // Inline Convenience Wrappers (call through ops table) // ============================================================================= diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 321d66d2b..813f1f846 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -36,6 +36,9 @@ #include "pto_types.h" #include "tensor.h" +extern "C" void set_dump_tensor_selective_mode(bool enable); +extern "C" void set_dump_tensor_task_mask(uint64_t task_id, uint64_t mask); + // Verify the captured Tensor blob size in DepGenRecord matches the runtime // Tensor layout. The platform header defines DEP_GEN_TENSOR_SIZE without // including runtime/tensor.h, so this check lives at the orch callsite. @@ -674,6 +677,14 @@ static TaskOutputTensors submit_task_common( } payload.init(args, result, prepared.alloc_result, layout); +#if PTO2_PROFILING + if (args.tensor_dump_selective_requested()) { + set_dump_tensor_selective_mode(true); + } + if (args.tensor_dump_arg_mask() != 0) { + set_dump_tensor_task_mask(task_id.raw, args.tensor_dump_arg_mask()); + } +#endif CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, AicpuPhaseId::ORCH_PARAMS, task_id.raw); #if PTO2_ORCH_PROFILING diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h index 22dbae0af..79743f89e 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h @@ -45,6 +45,15 @@ #define MAX_TENSOR_ARGS CORE_MAX_TENSOR_ARGS #define MAX_SCALAR_ARGS CORE_MAX_SCALAR_ARGS +inline bool &tensor_dump_selective_requested_ref() { + static bool requested = false; + return requested; +} + +inline void set_tensor_dump_selective_requested(bool enable) { tensor_dump_selective_requested_ref() = enable; } + +inline bool is_tensor_dump_selective_requested() { return tensor_dump_selective_requested_ref(); } + typedef enum { ASYNC_ENGINE_SDMA = 0, ASYNC_ENGINE_ROCE = 1, @@ -180,6 +189,8 @@ struct Arg : TaskArgsTpl + void dump(Args &&...args) { + static_assert(sizeof...(Args) >= 1, "dump: at least one tensor argument required"); + static_assert( + (std::is_lvalue_reference_v && ...), + "dump: temporaries are not allowed — pass tensors already added to this Arg" + ); + static_assert( + ((std::is_same_v, Tensor> || std::is_same_v, TensorCreateInfo>) && + ...), + "dump: all arguments must be Tensor or TensorCreateInfo" + ); + tensor_dump_selective_requested_ = is_tensor_dump_selective_requested(); + (mark_tensor_dump_arg(args), ...); + } + + uint64_t tensor_dump_arg_mask() const { return tensor_dump_arg_mask_; } + bool tensor_dump_selective_requested() const { return tensor_dump_selective_requested_; } + template void add_input(Args &&...args) { if (!check_add_tensor_valid(args...)) { @@ -354,9 +384,32 @@ struct Arg : TaskArgsTpl bool check_add_tensor_valid(Args &&...) { static_assert(sizeof...(Args) >= 1, "at least one argument required"); diff --git a/src/a5/platform/include/aicpu/tensor_dump_aicpu.h b/src/a5/platform/include/aicpu/tensor_dump_aicpu.h index 2afe1b410..5d124b082 100644 --- a/src/a5/platform/include/aicpu/tensor_dump_aicpu.h +++ b/src/a5/platform/include/aicpu/tensor_dump_aicpu.h @@ -20,7 +20,15 @@ #ifndef SRC_A5_PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_ #define SRC_A5_PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_ +#include + +#ifndef __cplusplus +#include +#endif + +#ifdef __cplusplus #include +#endif #include "common/memory_barrier.h" #include "common/tensor_dump.h" @@ -64,6 +72,10 @@ void set_dump_tensor_enabled(bool enable); * @return true if tensor dump is enabled */ bool is_dump_tensor_enabled(); +void set_dump_tensor_selective_mode(bool enable); +bool is_dump_tensor_selective_mode(); +void set_dump_tensor_task_mask(uint64_t task_id, TensorDumpArgMask mask); +TensorDumpArgMask get_dump_tensor_task_mask(uint64_t task_id); #ifdef __cplusplus } @@ -73,6 +85,8 @@ bool is_dump_tensor_enabled(); bool get_tensor_dump_role_from_direction(ArgDirection dir, TensorDumpRole *role); int32_t count_callable_tensor_args(const CoreCallable &callable); bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage); +bool should_dump_task(TensorDumpArgMask arg_mask); +bool should_dump_tensor_arg(TensorDumpArgMask arg_mask, int32_t arg_index); bool try_log_tensor_dump_layout_mismatch(); int dump_tensor_record(int thread_idx, const TensorDumpInfo &info); @@ -82,6 +96,13 @@ inline void dump_tensors_for_task( GetFunctionBinAddrFn get_function_bin_addr ) { const auto &pl = *slot_state.payload; + TensorDumpArgMask dump_arg_mask = TENSOR_DUMP_ARG_MASK_NONE; + if (is_dump_tensor_selective_mode()) { + dump_arg_mask = get_dump_tensor_task_mask(slot_state.task->task_id.raw); + } + if (!should_dump_task(dump_arg_mask)) { + return; + } const CoreCallable *callables[MaxSubtaskSlots] = {}; int32_t total_tensor_args = 0; @@ -125,7 +146,8 @@ inline void dump_tensors_for_task( continue; } TensorDumpRole role; - if (get_tensor_dump_role_from_direction(dir, &role) && should_dump_tensor_at_stage(role, stage)) { + if (get_tensor_dump_role_from_direction(dir, &role) && should_dump_tensor_at_stage(role, stage) && + should_dump_tensor_arg(dump_arg_mask, payload_index)) { const auto &t = pl.tensors[payload_index]; TensorDumpInfo info = {}; info.buffer_addr = t.buffer.addr; @@ -241,7 +263,6 @@ inline void dump_tensors_for_task( tensor_arg_index++; } } -#endif /** * Initialize tensor dump. @@ -279,4 +300,6 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info); */ void dump_tensor_flush(int thread_idx); +#endif + #endif // SRC_A5_PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_ diff --git a/src/a5/platform/include/common/tensor_dump.h b/src/a5/platform/include/common/tensor_dump.h index d774ecfe5..9b7c64433 100644 --- a/src/a5/platform/include/common/tensor_dump.h +++ b/src/a5/platform/include/common/tensor_dump.h @@ -48,6 +48,7 @@ #include #include "common/platform_config.h" +#include "host_build_graph/runtime/pto_runtime2_types.h" // ============================================================================= // Constants @@ -74,6 +75,17 @@ enum class TensorDumpStage : uint8_t { AFTER_COMPLETION = 1, }; +using TensorDumpArgMask = uint64_t; + +// Bitmask stored in the platform-owned mask pool when orchestration selects +// specific task tensor arguments for dump. Bit N corresponds to tensors[N]. +// Zero preserves legacy "dump all tasks" behavior unless selective mode is enabled. +constexpr TensorDumpArgMask TENSOR_DUMP_ARG_MASK_NONE = 0; +constexpr uint32_t TENSOR_DUMP_ARG_MASK_BITS = 64; +constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_RINGS = PTO2_MAX_RING_DEPTH; +constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_SLOTS = PTO2_TASK_WINDOW_SIZE; +constexpr uint32_t TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK = TENSOR_DUMP_MASK_POOL_MAX_SLOTS - 1; + // ============================================================================= // TensorDumpRecord - Single Tensor Dump Entry (128B = 2 cache lines) // ============================================================================= diff --git a/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp b/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp index 4aff271f4..33c04f783 100644 --- a/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp +++ b/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp @@ -21,6 +21,7 @@ #include "aicpu/tensor_dump_aicpu.h" +#include #include #include "common/memory_barrier.h" @@ -54,11 +55,99 @@ extern "C" void set_platform_dump_base(uint64_t dump_data_base) { g_platform_dum extern "C" uint64_t get_platform_dump_base() { return g_platform_dump_base; } static bool g_enable_dump_tensor = false; +static bool g_dump_tensor_selective_mode = false; +struct DumpTaskMaskEntry { + uint64_t task_id; + TensorDumpArgMask mask; +}; +static constexpr uint64_t DUMP_TASK_MASK_EMPTY_TASK_ID = UINT64_MAX; +static constexpr uint32_t DUMP_TASK_MASK_TABLE_CAPACITY = 32768; +static DumpTaskMaskEntry *g_dump_mask_table = nullptr; +static bool ensure_dump_mask_table() { + if (g_dump_mask_table != nullptr) { + return true; + } + g_dump_mask_table = + static_cast(malloc(sizeof(DumpTaskMaskEntry) * DUMP_TASK_MASK_TABLE_CAPACITY)); + if (g_dump_mask_table == nullptr) { + LOG_ERROR("Failed to allocate tensor dump selective mask table"); + return false; + } + for (uint32_t i = 0; i < DUMP_TASK_MASK_TABLE_CAPACITY; i++) { + g_dump_mask_table[i].task_id = DUMP_TASK_MASK_EMPTY_TASK_ID; + g_dump_mask_table[i].mask = TENSOR_DUMP_ARG_MASK_NONE; + } + return true; +} -extern "C" void set_dump_tensor_enabled(bool enable) { g_enable_dump_tensor = enable; } +static void clear_dump_mask_table() { + if (g_dump_mask_table == nullptr) { + return; + } + for (uint32_t i = 0; i < DUMP_TASK_MASK_TABLE_CAPACITY; i++) { + g_dump_mask_table[i].task_id = DUMP_TASK_MASK_EMPTY_TASK_ID; + g_dump_mask_table[i].mask = TENSOR_DUMP_ARG_MASK_NONE; + } +} + +extern "C" void set_dump_tensor_enabled(bool enable) { + g_enable_dump_tensor = enable; + g_dump_tensor_selective_mode = false; + clear_dump_mask_table(); +} extern "C" bool is_dump_tensor_enabled() { return g_enable_dump_tensor; } +extern "C" void set_dump_tensor_selective_mode(bool enable) { g_dump_tensor_selective_mode = enable; } + +extern "C" bool is_dump_tensor_selective_mode() { return g_dump_tensor_selective_mode; } + +extern "C" void set_dump_tensor_task_mask(uint64_t task_id, TensorDumpArgMask mask) { + if (mask == TENSOR_DUMP_ARG_MASK_NONE) { + return; + } + if (!ensure_dump_mask_table()) { + return; + } + uint32_t ring_id = static_cast(task_id >> 32); + if (ring_id >= TENSOR_DUMP_MASK_POOL_MAX_RINGS) { + return; + } + uint32_t slot = static_cast(task_id) & TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK; + uint32_t idx = (ring_id * TENSOR_DUMP_MASK_POOL_MAX_SLOTS + slot) & (DUMP_TASK_MASK_TABLE_CAPACITY - 1); + for (uint32_t probe = 0; probe < DUMP_TASK_MASK_TABLE_CAPACITY; probe++) { + DumpTaskMaskEntry &entry = g_dump_mask_table[(idx + probe) & (DUMP_TASK_MASK_TABLE_CAPACITY - 1)]; + if (entry.task_id == DUMP_TASK_MASK_EMPTY_TASK_ID || entry.task_id == task_id) { + entry.task_id = task_id; + entry.mask = mask; + return; + } + } + LOG_ERROR("tensor dump selective mask table is full"); +} + +extern "C" TensorDumpArgMask get_dump_tensor_task_mask(uint64_t task_id) { + if (g_dump_mask_table == nullptr) { + return TENSOR_DUMP_ARG_MASK_NONE; + } + uint32_t ring_id = static_cast(task_id >> 32); + if (ring_id >= TENSOR_DUMP_MASK_POOL_MAX_RINGS) { + return TENSOR_DUMP_ARG_MASK_NONE; + } + uint32_t slot = static_cast(task_id) & TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK; + uint32_t idx = (ring_id * TENSOR_DUMP_MASK_POOL_MAX_SLOTS + slot) & (DUMP_TASK_MASK_TABLE_CAPACITY - 1); + for (uint32_t probe = 0; probe < DUMP_TASK_MASK_TABLE_CAPACITY; probe++) { + const DumpTaskMaskEntry &entry = g_dump_mask_table[(idx + probe) & (DUMP_TASK_MASK_TABLE_CAPACITY - 1)]; + if (entry.task_id == task_id) { + return entry.mask; + } + if (entry.task_id == DUMP_TASK_MASK_EMPTY_TASK_ID) { + return TENSOR_DUMP_ARG_MASK_NONE; + } + } + return TENSOR_DUMP_ARG_MASK_NONE; +} + bool get_tensor_dump_role_from_direction(ArgDirection dir, TensorDumpRole *role) { switch (dir) { case ArgDirection::IN: @@ -98,6 +187,21 @@ bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage) { return false; } +bool should_dump_task(TensorDumpArgMask arg_mask) { + if (!is_dump_tensor_selective_mode()) { + return true; + } + return arg_mask != TENSOR_DUMP_ARG_MASK_NONE; +} + +bool should_dump_tensor_arg(TensorDumpArgMask arg_mask, int32_t arg_index) { + if (!is_dump_tensor_selective_mode()) { + return true; + } + if (arg_index < 0 || arg_index >= static_cast(TENSOR_DUMP_ARG_MASK_BITS)) return false; + return (arg_mask & (TensorDumpArgMask{1} << arg_index)) != 0; +} + bool try_log_tensor_dump_layout_mismatch() { if (s_logged_dump_layout_mismatch) { return false; diff --git a/src/a5/runtime/host_build_graph/build_config.py b/src/a5/runtime/host_build_graph/build_config.py index 76e7face5..a1c96c8fa 100644 --- a/src/a5/runtime/host_build_graph/build_config.py +++ b/src/a5/runtime/host_build_graph/build_config.py @@ -11,7 +11,7 @@ BUILD_CONFIG = { "aicore": {"include_dirs": ["runtime"], "source_dirs": ["aicore", "runtime"]}, - "aicpu": {"include_dirs": ["runtime"], "source_dirs": ["aicpu", "runtime"]}, - "host": {"include_dirs": ["runtime", "orchestration"], "source_dirs": ["host", "runtime"]}, + "aicpu": {"include_dirs": ["runtime", ".."], "source_dirs": ["aicpu", "runtime"]}, + "host": {"include_dirs": ["runtime", "orchestration", ".."], "source_dirs": ["host", "runtime"]}, "orchestration": {"include_dirs": ["runtime", "orchestration"], "source_dirs": []}, } diff --git a/src/a5/runtime/host_build_graph/runtime/pto_runtime2_types.h b/src/a5/runtime/host_build_graph/runtime/pto_runtime2_types.h new file mode 100644 index 000000000..5bf0057c6 --- /dev/null +++ b/src/a5/runtime/host_build_graph/runtime/pto_runtime2_types.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ +#define SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ + +// ============================================================================= +// Profiling Configuration +// ============================================================================= + +#ifndef PTO2_PROFILING +#define PTO2_PROFILING 1 +#endif + +// ============================================================================= +// Tensor Dump Configuration +// ============================================================================= + +// Tensor dump uses these defaults to size its selective mask table so task-id +// ring/slot lookup stays aligned with PTO2 task id layout. +#define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2) +#define PTO2_MAX_RING_DEPTH 4 // Number of task-id ring layers + +#endif // SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ diff --git a/src/a5/runtime/tensormap_and_ringbuffer/build_config.py b/src/a5/runtime/tensormap_and_ringbuffer/build_config.py index 48881f9dc..2cfba734d 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/build_config.py +++ b/src/a5/runtime/tensormap_and_ringbuffer/build_config.py @@ -19,8 +19,8 @@ # by the Tensor constructor's validation logic). BUILD_CONFIG = { - "aicore": {"include_dirs": ["runtime", "common"], "source_dirs": ["aicore", "orchestration"]}, - "aicpu": {"include_dirs": ["runtime", "common"], "source_dirs": ["aicpu", "runtime", "orchestration"]}, - "host": {"include_dirs": ["runtime", "common"], "source_dirs": ["host", "runtime/shared", "orchestration"]}, - "orchestration": {"include_dirs": ["runtime", "orchestration", "common"], "source_dirs": ["orchestration"]}, + "aicore": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicore", "orchestration"]}, + "aicpu": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicpu", "runtime", "orchestration"]}, + "host": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["host", "runtime/shared", "orchestration"]}, + "orchestration": {"include_dirs": ["runtime", "orchestration", "common", ".."], "source_dirs": ["orchestration"]}, } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h index 1f7116f67..f71766618 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h @@ -138,6 +138,8 @@ struct PTO2Runtime { PTO2ScopeMode pending_scope_mode; }; +static inline void enable_dump_tensor_selective() { set_tensor_dump_selective_requested(true); } + // ============================================================================= // Inline Convenience Wrappers (call through ops table) // ============================================================================= diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index c97e2efce..05ac105a8 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -34,6 +34,9 @@ #include "pto_types.h" #include "tensor.h" +extern "C" void set_dump_tensor_selective_mode(bool enable); +extern "C" void set_dump_tensor_task_mask(uint64_t task_id, uint64_t mask); + // ============================================================================= // Orchestrator Profiling (compile-time toggle) // ============================================================================= @@ -622,6 +625,14 @@ static TaskOutputTensors submit_task_common( } payload.init(args, result, prepared.alloc_result, layout); +#if PTO2_PROFILING + if (args.tensor_dump_selective_requested()) { + set_dump_tensor_selective_mode(true); + } + if (args.tensor_dump_arg_mask() != 0) { + set_dump_tensor_task_mask(task_id.raw, args.tensor_dump_arg_mask()); + } +#endif CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, AicpuPhaseId::ORCH_PARAMS, task_id.raw); #if PTO2_ORCH_PROFILING diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h index 31ca1c1fa..b0ab9bb55 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h @@ -47,6 +47,15 @@ #define MAX_TENSOR_ARGS CORE_MAX_TENSOR_ARGS #define MAX_SCALAR_ARGS CORE_MAX_SCALAR_ARGS +inline bool &tensor_dump_selective_requested_ref() { + static bool requested = false; + return requested; +} + +inline void set_tensor_dump_selective_requested(bool enable) { tensor_dump_selective_requested_ref() = enable; } + +inline bool is_tensor_dump_selective_requested() { return tensor_dump_selective_requested_ref(); } + typedef enum { ASYNC_ENGINE_SDMA = 0, ASYNC_ENGINE_ROCE = 1, @@ -178,6 +187,8 @@ struct Arg : TaskArgsTpl::clear(); + tensor_dump_arg_mask_ = 0; + tensor_dump_selective_requested_ = is_tensor_dump_selective_requested(); explicit_deps_ = nullptr; explicit_dep_count_ = 0; } @@ -195,6 +206,25 @@ struct Arg : TaskArgsTpl + void dump(Args &&...args) { + static_assert(sizeof...(Args) >= 1, "dump: at least one tensor argument required"); + static_assert( + (std::is_lvalue_reference_v && ...), + "dump: temporaries are not allowed — pass tensors already added to this Arg" + ); + static_assert( + ((std::is_same_v, Tensor> || std::is_same_v, TensorCreateInfo>) && + ...), + "dump: all arguments must be Tensor or TensorCreateInfo" + ); + tensor_dump_selective_requested_ = is_tensor_dump_selective_requested(); + (mark_tensor_dump_arg(args), ...); + } + + uint64_t tensor_dump_arg_mask() const { return tensor_dump_arg_mask_; } + bool tensor_dump_selective_requested() const { return tensor_dump_selective_requested_; } + template void add_input(Args &&...args) { if (!check_add_tensor_valid(std::forward(args)...)) { @@ -358,9 +388,32 @@ struct Arg : TaskArgsTpl bool check_add_tensor_valid(Args &&...) { static_assert(sizeof...(Args) >= 1, "at least one argument required"); diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/kernels/orchestration/partial_dump_orch.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/kernels/orchestration/partial_dump_orch.cpp new file mode 100644 index 000000000..0a60aaf1b --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/kernels/orchestration/partial_dump_orch.cpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include + +#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{ + .expected_arg_count = 3, + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) { + enable_dump_tensor_selective(); + + Tensor ext_a = from_tensor_arg(orch_args.tensor(0)); + Tensor ext_b = from_tensor_arg(orch_args.tensor(1)); + Tensor ext_f = from_tensor_arg(orch_args.tensor(2)); + + uint32_t size = orch_args.tensor(0).shapes[0]; + uint32_t inter_shapes[1] = {size}; + TensorCreateInfo inter_ci(inter_shapes, 1, DataType::FLOAT32); + + Arg params_t0; + params_t0.add_input(ext_a); + params_t0.add_input(ext_b); + params_t0.add_output(inter_ci); + TaskOutputTensors outs_t0 = rt_submit_aiv_task(0, params_t0); + const Tensor &c = outs_t0.get_ref(0); + + PTO2_SCOPE() { + Arg params_t1; + params_t1.add_input(c); + params_t1.add_output(inter_ci); + params_t1.add_scalar(1.0f); + params_t1.add_scalar(3u); + TaskOutputTensors outs_t1 = rt_submit_aiv_task(1, params_t1); + const Tensor &d = outs_t1.get_ref(0); + + Arg params_t2; + params_t2.add_input(c); + params_t2.add_output(inter_ci); + params_t2.add_scalar(2.0f); + params_t2.add_scalar(3u); + TaskOutputTensors outs_t2 = rt_submit_aiv_task(1, params_t2); + const Tensor &e = outs_t2.get_ref(0); + + Arg params_t3; + params_t3.add_input(d); + params_t3.add_input(e); + params_t3.add_output(inter_ci); + params_t3.add_scalar(3u); + TaskOutputTensors outs_t3 = rt_submit_aiv_task(2, params_t3); + const Tensor &g = outs_t3.get_ref(0); + + Arg params_t4; + params_t4.add_input(g); + params_t4.add_input(c); + params_t4.add_output(ext_f); + params_t4.dump(g, c, ext_f); + rt_submit_aiv_task(0, params_t4); + } +} + +} // extern "C" diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/test_tensor_dump.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/test_tensor_dump.py index ce5604a08..19b0f7155 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/test_tensor_dump.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/tensor_dump/test_tensor_dump.py @@ -122,5 +122,46 @@ def _validate_dump_artifact(self, case): ) +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestTensorDumpPartial(SceneTestCase): + """Vector example with one task explicitly selected for tensor dump.""" + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/partial_dump_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": TestTensorDump.CALLABLE["incores"], + } + + CASES = TestTensorDump.CASES + + def generate_args(self, params): + return TestTensorDump.generate_args(self, params) + + def compute_golden(self, args, params): + return TestTensorDump.compute_golden(self, args, params) + + def test_run(self, st_platform, st_worker, request): + super().test_run(st_platform, st_worker, request) + if not request.config.getoption("--dump-tensor", default=False): + return + safe_label = _sanitize_for_filename("TestTensorDumpPartial_default") + matches = sorted(_outputs_dir().glob(f"{safe_label}_*"), key=lambda p: p.stat().st_mtime) + assert matches, "partial tensor dump output directory missing" + dump_dir = matches[-1] / "tensor_dump" + manifest = dump_dir / "tensor_dump.json" + assert manifest.exists(), f"tensor_dump.json missing under {dump_dir}" + with manifest.open() as f: + data = json.load(f) + tensors = data.get("tensors", []) + assert len(tensors) == 3 + assert data.get("before_dispatch") == 2 + assert data.get("after_completion") == 1 + assert {t["task_id"] for t in tensors} == {"0x0000000100000003"} + assert [t["role"] for t in tensors] == ["input", "input", "output"] + + if __name__ == "__main__": SceneTestCase.run_module(__name__)