Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion docs/dfx/tensor-dump.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,44 @@ executors read the same handshake bit to insert a
`pipe_barrier(PIPE_ALL)` before FIN when dump is on, so
`AFTER_COMPLETION` snapshots see the kernel's final writes.

### 3.2 Output
### 3.2 Select Specific Task Tensors

By default, `--dump-tensor` dumps every task's tensor inputs and
outputs. Device-side orchestration can opt into tensor-argument selection
by enabling selective mode at the beginning of the orchestration and
marking the tensor arguments on each `Arg` before submission:

```cpp
enable_dump_tensor_selective();

Arg args;
args.add_input(x);
args.add_input(y);
args.add_output(z);
args.dump(x, y, z);
rt_submit_aiv_task(FUNC_ADD, args);
```

`dump(...)` selects tensor arguments from the current `Arg`; it does not
execute a dump immediately. The selected tensors must already belong to
that `Arg`. The runtime uses the argument direction already provided by
`add_input()`, `add_output()`, or `add_inout()` to decide when each
selected tensor is captured:

- input tensors are dumped before dispatch.
- output tensors are dumped after completion.
- inout tensors follow the existing inout dump behavior.

With `enable_dump_tensor_selective()`, tasks without any `dump(...)` marker
are skipped during AICPU collection. For marked tasks, only the selected
tensor arguments are dumped. The selection is recorded as a 64-bit mask per
task (bit N selects tensor argument N), so tensor arguments at index 64 or
above cannot be selected. Without `enable_dump_tensor_selective()`, the
legacy full-dump behavior is unchanged even if an `Arg` carries a
`dump(...)` marker.

`--dump-tensor` remains the top-level enable switch; `Arg::dump(...)`
only narrows what gets recorded after tensor dump is enabled.

### 3.3 Output

The dump artifacts land under the per-task output prefix
(`CallConfig::output_prefix`, set by
Expand Down
27 changes: 25 additions & 2 deletions src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,15 @@
#ifndef PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_
#define PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_

#include <stdint.h>

#ifndef __cplusplus
#include <stdbool.h>
#endif

#ifdef __cplusplus
#include <cinttypes>
#endif

#include "common/memory_barrier.h"
#include "common/tensor_dump.h"
Expand Down Expand Up @@ -64,6 +72,10 @@ void set_dump_tensor_enabled(bool enable);
* @return true if tensor dump is enabled
*/
bool is_dump_tensor_enabled();

/**
 * Enable or disable selective tensor dump mode.
 *
 * In selective mode only tasks that have a non-zero dump mask are dumped,
 * and only the tensor arguments whose bit is set in that mask.
 *
 * @param enable true to enable selective mode
 */
void set_dump_tensor_selective_mode(bool enable);

/**
 * Check whether selective tensor dump mode is enabled.
 *
 * @return true if selective mode is enabled
 */
bool is_dump_tensor_selective_mode();

/**
 * Record the tensor-argument dump mask for a task.
 *
 * A mask of TENSOR_DUMP_ARG_MASK_NONE is ignored.
 *
 * @param task_id Raw task id the mask belongs to
 * @param mask Bitmask of selected tensor arguments (bit N = tensor N)
 */
void set_dump_tensor_task_mask(uint64_t task_id, TensorDumpArgMask mask);

/**
 * Look up the tensor-argument dump mask recorded for a task.
 *
 * @param task_id Raw task id to look up
 * @return The recorded mask, or TENSOR_DUMP_ARG_MASK_NONE if none is recorded
 */
TensorDumpArgMask get_dump_tensor_task_mask(uint64_t task_id);

#ifdef __cplusplus
}
Expand All @@ -73,6 +85,8 @@ bool is_dump_tensor_enabled();
bool get_tensor_dump_role_from_direction(ArgDirection dir, TensorDumpRole *role);
int32_t count_callable_tensor_args(const CoreCallable &callable);
bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage);
// Returns true when the task should be dumped: always outside selective mode,
// otherwise only when arg_mask is not TENSOR_DUMP_ARG_MASK_NONE.
bool should_dump_task(TensorDumpArgMask arg_mask);
// Returns true when the tensor argument at arg_index should be dumped: always
// outside selective mode, otherwise only when bit arg_index is set in arg_mask.
bool should_dump_tensor_arg(TensorDumpArgMask arg_mask, int32_t arg_index);
bool try_log_tensor_dump_layout_mismatch();
int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);

Expand All @@ -82,6 +96,13 @@ inline void dump_tensors_for_task(
GetFunctionBinAddrFn get_function_bin_addr
) {
const auto &pl = *slot_state.payload;
TensorDumpArgMask dump_arg_mask = TENSOR_DUMP_ARG_MASK_NONE;
if (is_dump_tensor_selective_mode()) {
dump_arg_mask = get_dump_tensor_task_mask(slot_state.task->task_id.raw);
}
if (!should_dump_task(dump_arg_mask)) {
return;
}
const CoreCallable *callables[MaxSubtaskSlots] = {};
int32_t total_tensor_args = 0;

Expand Down Expand Up @@ -125,7 +146,8 @@ inline void dump_tensors_for_task(
continue;
}
TensorDumpRole role;
if (get_tensor_dump_role_from_direction(dir, &role) && should_dump_tensor_at_stage(role, stage)) {
if (get_tensor_dump_role_from_direction(dir, &role) && should_dump_tensor_at_stage(role, stage) &&
should_dump_tensor_arg(dump_arg_mask, payload_index)) {
const auto &t = pl.tensors[payload_index];
TensorDumpInfo info = {};
info.buffer_addr = t.buffer.addr;
Expand Down Expand Up @@ -241,7 +263,6 @@ inline void dump_tensors_for_task(
tensor_arg_index++;
}
}
#endif

/**
* Initialize tensor dump.
Expand Down Expand Up @@ -279,4 +300,6 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);
*/
void dump_tensor_flush(int thread_idx);

#endif

#endif // PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_
12 changes: 12 additions & 0 deletions src/a2a3/platform/include/common/tensor_dump.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#include <cstdint>

#include "common/platform_config.h"
#include "host_build_graph/runtime/pto_runtime2_types.h"

// =============================================================================
// Constants
Expand All @@ -70,6 +71,17 @@ enum class TensorDumpStage : uint8_t {
AFTER_COMPLETION = 1,
};

using TensorDumpArgMask = uint64_t;

// Bitmask stored in the platform-owned mask pool when orchestration selects
// specific task tensor arguments for dump. Bit N corresponds to tensors[N].
// Zero preserves legacy "dump all tasks" behavior unless selective mode is enabled.
constexpr TensorDumpArgMask TENSOR_DUMP_ARG_MASK_NONE = 0;
constexpr uint32_t TENSOR_DUMP_ARG_MASK_BITS = 64;
constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_RINGS = PTO2_MAX_RING_DEPTH;
constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_SLOTS = PTO2_TASK_WINDOW_SIZE;
constexpr uint32_t TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK = TENSOR_DUMP_MASK_POOL_MAX_SLOTS - 1;

// =============================================================================
// TensorDumpRecord - Single Tensor Dump Entry (128B = 2 cache lines)
// =============================================================================
Expand Down
106 changes: 105 additions & 1 deletion src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include "aicpu/tensor_dump_aicpu.h"

#include <cstdlib>
#include <cstring>

#include "common/memory_barrier.h"
Expand Down Expand Up @@ -53,11 +54,99 @@ extern "C" void set_platform_dump_base(uint64_t dump_data_base) { g_platform_dum
extern "C" uint64_t get_platform_dump_base() { return g_platform_dump_base; }

static bool g_enable_dump_tensor = false;
// Selective-mode flag; reset to false whenever set_dump_tensor_enabled() is called.
static bool g_dump_tensor_selective_mode = false;
// One slot of the selective-dump mask table: the owning task id and the
// tensor-argument mask recorded for it.
struct DumpTaskMaskEntry {
uint64_t task_id;
TensorDumpArgMask mask;
};
// Sentinel task id marking a table slot that holds no entry.
static constexpr uint64_t DUMP_TASK_MASK_EMPTY_TASK_ID = UINT64_MAX;
// Number of slots in the mask table; probe indices wrap with `& (capacity - 1)`.
static constexpr uint32_t DUMP_TASK_MASK_TABLE_CAPACITY = 32768;
// Mask table storage; stays nullptr until ensure_dump_mask_table() allocates it.
static DumpTaskMaskEntry *g_dump_mask_table = nullptr;
static bool ensure_dump_mask_table() {
if (g_dump_mask_table != nullptr) {
return true;
}
g_dump_mask_table =
static_cast<DumpTaskMaskEntry *>(malloc(sizeof(DumpTaskMaskEntry) * DUMP_TASK_MASK_TABLE_CAPACITY));
if (g_dump_mask_table == nullptr) {
LOG_ERROR("Failed to allocate tensor dump selective mask table");
return false;
}
for (uint32_t i = 0; i < DUMP_TASK_MASK_TABLE_CAPACITY; i++) {
g_dump_mask_table[i].task_id = DUMP_TASK_MASK_EMPTY_TASK_ID;
g_dump_mask_table[i].mask = TENSOR_DUMP_ARG_MASK_NONE;
}
return true;
}

// Set the global tensor-dump enable flag.
extern "C" void set_dump_tensor_enabled(bool enable) { g_enable_dump_tensor = enable; }
static void clear_dump_mask_table() {
if (g_dump_mask_table == nullptr) {
return;
}
for (uint32_t i = 0; i < DUMP_TASK_MASK_TABLE_CAPACITY; i++) {
g_dump_mask_table[i].task_id = DUMP_TASK_MASK_EMPTY_TASK_ID;
g_dump_mask_table[i].mask = TENSOR_DUMP_ARG_MASK_NONE;
}
}

/**
 * Set the global tensor-dump enable flag.
 *
 * Every call also turns selective mode off and empties the selective mask
 * table, so selection state does not carry over across enable/disable calls.
 *
 * @param enable true to enable tensor dump
 */
extern "C" void set_dump_tensor_enabled(bool enable)
{
    g_enable_dump_tensor = enable;

    // Drop any selective-mode state left from earlier use.
    g_dump_tensor_selective_mode = false;
    clear_dump_mask_table();
}

/** Report whether tensor dump is currently enabled. */
extern "C" bool is_dump_tensor_enabled()
{
    return g_enable_dump_tensor;
}

/** Turn selective tensor dump mode on or off. */
extern "C" void set_dump_tensor_selective_mode(bool enable)
{
    g_dump_tensor_selective_mode = enable;
}

/** Report whether selective tensor dump mode is currently on. */
extern "C" bool is_dump_tensor_selective_mode()
{
    return g_dump_tensor_selective_mode;
}

// Probe indices wrap with `& (DUMP_TASK_MASK_TABLE_CAPACITY - 1)`, which only
// behaves as a modulo when the capacity is a non-zero power of two.
static_assert(
    DUMP_TASK_MASK_TABLE_CAPACITY != 0 &&
        (DUMP_TASK_MASK_TABLE_CAPACITY & (DUMP_TASK_MASK_TABLE_CAPACITY - 1)) == 0,
    "DUMP_TASK_MASK_TABLE_CAPACITY must be a non-zero power of two"
);

/**
 * Compute the first probe position in the mask table for a task id.
 *
 * The ring id is taken from the upper 32 bits of the task id and the slot from
 * the lower 32 bits masked with TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK. Shared
 * by the setter and the getter so both always probe from the same position.
 *
 * @param task_id Raw task id
 * @param home_idx Receives the first probe index; untouched on failure
 * @return false if the ring id is >= TENSOR_DUMP_MASK_POOL_MAX_RINGS
 */
static bool dump_mask_home_index(uint64_t task_id, uint32_t *home_idx) {
    const uint32_t ring_id = static_cast<uint32_t>(task_id >> 32);
    if (ring_id >= TENSOR_DUMP_MASK_POOL_MAX_RINGS) {
        return false;
    }
    const uint32_t slot = static_cast<uint32_t>(task_id) & TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK;
    *home_idx = (ring_id * TENSOR_DUMP_MASK_POOL_MAX_SLOTS + slot) & (DUMP_TASK_MASK_TABLE_CAPACITY - 1);
    return true;
}

/**
 * Record the tensor-argument dump mask for a task.
 *
 * The entry is stored by linear probing from the task's home index; an existing
 * entry for the same task id is overwritten. Nothing is stored when the mask is
 * TENSOR_DUMP_ARG_MASK_NONE, when the table cannot be allocated, or when the
 * ring id encoded in task_id is out of range. An error is logged if no free
 * slot is found.
 *
 * NOTE(review): entries are only removed by clear_dump_mask_table() in the code
 * visible here, and no synchronization is applied to table accesses — confirm
 * both are acceptable for the callers' task counts and threading model.
 *
 * @param task_id Raw task id the mask belongs to
 * @param mask Bitmask of selected tensor arguments
 */
extern "C" void set_dump_tensor_task_mask(uint64_t task_id, TensorDumpArgMask mask) {
    if (mask == TENSOR_DUMP_ARG_MASK_NONE) {
        return;
    }
    if (!ensure_dump_mask_table()) {
        return;
    }
    uint32_t home_idx = 0;
    if (!dump_mask_home_index(task_id, &home_idx)) {
        return;
    }
    for (uint32_t probe = 0; probe < DUMP_TASK_MASK_TABLE_CAPACITY; probe++) {
        DumpTaskMaskEntry &entry = g_dump_mask_table[(home_idx + probe) & (DUMP_TASK_MASK_TABLE_CAPACITY - 1)];
        // Claim the first empty slot, or update the slot already owned by this task.
        if (entry.task_id == DUMP_TASK_MASK_EMPTY_TASK_ID || entry.task_id == task_id) {
            entry.task_id = task_id;
            entry.mask = mask;
            return;
        }
    }
    LOG_ERROR("tensor dump selective mask table is full");
}

/**
 * Look up the tensor-argument dump mask recorded for a task.
 *
 * Probes linearly from the task's home index and stops at the first empty slot.
 *
 * @param task_id Raw task id to look up
 * @return The recorded mask, or TENSOR_DUMP_ARG_MASK_NONE when the table is not
 *         allocated, the ring id is out of range, or no entry exists
 */
extern "C" TensorDumpArgMask get_dump_tensor_task_mask(uint64_t task_id) {
    if (g_dump_mask_table == nullptr) {
        return TENSOR_DUMP_ARG_MASK_NONE;
    }
    uint32_t home_idx = 0;
    if (!dump_mask_home_index(task_id, &home_idx)) {
        return TENSOR_DUMP_ARG_MASK_NONE;
    }
    for (uint32_t probe = 0; probe < DUMP_TASK_MASK_TABLE_CAPACITY; probe++) {
        const DumpTaskMaskEntry &entry = g_dump_mask_table[(home_idx + probe) & (DUMP_TASK_MASK_TABLE_CAPACITY - 1)];
        if (entry.task_id == task_id) {
            return entry.mask;
        }
        // An empty slot ends the probe chain: the task was never recorded.
        if (entry.task_id == DUMP_TASK_MASK_EMPTY_TASK_ID) {
            return TENSOR_DUMP_ARG_MASK_NONE;
        }
    }
    return TENSOR_DUMP_ARG_MASK_NONE;
}

bool get_tensor_dump_role_from_direction(ArgDirection dir, TensorDumpRole *role) {
switch (dir) {
case ArgDirection::IN:
Expand Down Expand Up @@ -97,6 +186,21 @@ bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage) {
return false;
}

/**
 * Decide whether a task takes part in tensor dump.
 *
 * Outside selective mode every task is dumped; in selective mode only tasks
 * with a non-empty argument mask are.
 */
bool should_dump_task(TensorDumpArgMask arg_mask) {
    const bool selective = is_dump_tensor_selective_mode();
    return !selective || arg_mask != TENSOR_DUMP_ARG_MASK_NONE;
}

/**
 * Decide whether one tensor argument of a task is dumped.
 *
 * Outside selective mode every argument is dumped. In selective mode the
 * argument is dumped only if arg_index lies in [0, TENSOR_DUMP_ARG_MASK_BITS)
 * and the corresponding bit of arg_mask is set.
 */
bool should_dump_tensor_arg(TensorDumpArgMask arg_mask, int32_t arg_index) {
    if (is_dump_tensor_selective_mode()) {
        const bool index_in_mask =
            arg_index >= 0 && arg_index < static_cast<int32_t>(TENSOR_DUMP_ARG_MASK_BITS);
        if (!index_in_mask) {
            return false;
        }
        // The range check above keeps the shift amount within the mask width.
        const TensorDumpArgMask selected_bit = TensorDumpArgMask{1} << arg_index;
        return (arg_mask & selected_bit) != 0;
    }
    return true;
}

bool try_log_tensor_dump_layout_mismatch() {
if (s_logged_dump_layout_mismatch) {
return false;
Expand Down
4 changes: 2 additions & 2 deletions src/a2a3/runtime/host_build_graph/build_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

# Per-target include and source directories for this runtime.
# NOTE(review): ".." presumably puts the parent runtime directory on the include
# path so headers can be included as "host_build_graph/runtime/..." - confirm
# against the build scripts.
BUILD_CONFIG = {
"aicore": {"include_dirs": ["runtime"], "source_dirs": ["aicore", "runtime"]},
"aicpu": {"include_dirs": ["runtime"], "source_dirs": ["aicpu", "runtime"]},
"host": {"include_dirs": ["runtime", "orchestration"], "source_dirs": ["host", "runtime"]},
"aicpu": {"include_dirs": ["runtime", ".."], "source_dirs": ["aicpu", "runtime"]},
"host": {"include_dirs": ["runtime", "orchestration", ".."], "source_dirs": ["host", "runtime"]},
"orchestration": {"include_dirs": ["runtime", "orchestration"], "source_dirs": []},
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,13 @@
#define PTO2_PROFILING 1
#endif

// =============================================================================
// Tensor Dump Configuration
// =============================================================================

// Tensor dump uses these defaults to size its selective mask table so task-id
// ring/slot lookup stays aligned with PTO2 task id layout.
#define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2)
#define PTO2_MAX_RING_DEPTH 4 // Number of task-id ring layers

#endif // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
8 changes: 4 additions & 4 deletions src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
# by the Tensor constructor's validation logic).

# Per-target include and source directories for this runtime.
# NOTE(review): ".." presumably puts the parent runtime directory on the include
# path so headers from sibling runtime directories can be included - confirm
# against the build scripts.
BUILD_CONFIG = {
"aicore": {"include_dirs": ["runtime", "common"], "source_dirs": ["aicore", "orchestration"]},
"aicpu": {"include_dirs": ["runtime", "common"], "source_dirs": ["aicpu", "runtime", "orchestration"]},
"host": {"include_dirs": ["runtime", "common"], "source_dirs": ["host", "runtime/shared", "orchestration"]},
"orchestration": {"include_dirs": ["runtime", "orchestration", "common"], "source_dirs": ["orchestration"]},
"aicore": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicore", "orchestration"]},
"aicpu": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicpu", "runtime", "orchestration"]},
"host": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["host", "runtime/shared", "orchestration"]},
"orchestration": {"include_dirs": ["runtime", "orchestration", "common", ".."], "source_dirs": ["orchestration"]},
}
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ struct PTO2Runtime {
PTO2ScopeMode pending_scope_mode;
};

/**
 * Opt this orchestration into selective tensor dump.
 *
 * Forwards to set_tensor_dump_selective_requested(true).
 */
static inline void enable_dump_tensor_selective()
{
    set_tensor_dump_selective_requested(true);
}

// =============================================================================
// Inline Convenience Wrappers (call through ops table)
// =============================================================================
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@
#include "pto_types.h"
#include "tensor.h"

// Tensor-dump hooks declared locally rather than through a header.
// NOTE(review): these signatures must match the definitions they link against;
// confirm they stay in sync if the mask type ever changes from uint64_t.
extern "C" void set_dump_tensor_selective_mode(bool enable);
extern "C" void set_dump_tensor_task_mask(uint64_t task_id, uint64_t mask);

// Verify the captured Tensor blob size in DepGenRecord matches the runtime
// Tensor layout. The platform header defines DEP_GEN_TENSOR_SIZE without
// including runtime/tensor.h, so this check lives at the orch callsite.
Expand Down Expand Up @@ -674,6 +677,14 @@ static TaskOutputTensors submit_task_common(
}

payload.init(args, result, prepared.alloc_result, layout);
#if PTO2_PROFILING
if (args.tensor_dump_selective_requested()) {
set_dump_tensor_selective_mode(true);
}
if (args.tensor_dump_arg_mask() != 0) {
set_dump_tensor_task_mask(task_id.raw, args.tensor_dump_arg_mask());
}
#endif

CYCLE_COUNT_LAP_RECORD(g_orch_args_cycle, AicpuPhaseId::ORCH_PARAMS, task_id.raw);
#if PTO2_ORCH_PROFILING
Expand Down
Loading
Loading