Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions docs/dfx/tensor-dump.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ execution, and the host exports a JSON manifest plus a binary payload.
The result is a stable, replayable record of every tensor a kernel
saw, without the timing distortion of inline printing.

The same device-to-host dump channel also carries task argument
descriptors. When tensor dump is enabled, AICPU records the runtime
args it sees at dispatch time: tensor buffer descriptors plus scalar
values. These `args` entries share the tensor dump lifecycle, queues,
arena, and output directory so they can be correlated with swimlane
and PMU task ids without opening a second DFX path.

## 2. Overview

- **Per-task input/output capture.** Inputs snapshotted before
Expand All @@ -26,6 +33,9 @@ saw, without the timing distortion of inline printing.
- **Manifest + binary payload.** A single JSON manifest plus one
`.bin` payload per run; each manifest entry has `bin_offset` /
`bin_size` into the payload.
- **Args descriptors.** The manifest also includes an `args` array
with per-dispatch tensor descriptors and scalar values observed on
device.
- **Cross-architecture.** Same `--dump-tensor` flag, same on-disk
format on `a2a3` and `a5`. Both runtimes are wired through.

Expand Down Expand Up @@ -90,6 +100,7 @@ Example manifest (one input tensor captured before dispatch):
"byte_order": "little_endian"
},
"total_tensors": 1,
"total_args": 1,
"before_dispatch": 1,
"after_completion": 0,
"input_tensors": 1,
Expand All @@ -99,6 +110,32 @@ Example manifest (one input tensor captured before dispatch):
"dropped_records": 0,
"dropped_overwrite": 0,
"bin_file": "tensor_dump.bin",
"args": [
{
"task_id": "0x0000000200000a00",
"subtask_id": 0,
"func_id": 0,
"stage": "before_dispatch",
"tensor_count": 1,
"scalar_count": 1,
"payload_size": 128,
"overwritten": false,
"tensors": [
{
"arg_index": 0,
"buffer_addr": "0x100000",
"buffer_size": 65536,
"dtype": "float32",
"shape": [16384],
"raw_shape": [16384],
"offsets": [0],
"is_contiguous": true,
"is_all_offset_zero": true
}
],
"scalars": ["0x40"]
}
],
"tensors": [
{
"task_id": "0x0000000200000a00",
Expand Down Expand Up @@ -155,6 +192,9 @@ python -m simpler_setup.tools.dump_viewer --func 0 --stage before --role input -
# Export one specific entry by its manifest index
python -m simpler_setup.tools.dump_viewer --index 42

# List dumped task args in the latest run
python -m simpler_setup.tools.dump_viewer --args

# Pin to a specific dump directory
python -m simpler_setup.tools.dump_viewer outputs/<case>_<ts>/tensor_dump \
--task 0x0000000200000a00 --export
Expand Down
24 changes: 24 additions & 0 deletions simpler_setup/tools/dump_viewer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
--stage Filter by stage (before / after)
--role Filter by role (input / output / inout)
--arg Filter by arg_index (int)
--args List dumped task args instead of tensors

With no filters: lists all tensors.
With filters: lists matching tensors. Add --export to save them to txt.
Expand Down Expand Up @@ -194,6 +195,20 @@ def list_tensors(tensors: list):
)


def list_args(args_records: list):
    """Print a fixed-width table summarising the manifest's args records.

    One header line and one separator line are always printed, followed by
    one row per record. Each record must provide ``task_id``, ``subtask_id``,
    ``stage``, ``func_id``, ``tensor_count`` and ``scalar_count``; the
    ``overwritten`` key is optional and is shown as ``False`` when absent.
    """
    header = (
        f"{'idx':>6} {'task_id':>18} {'s':>1} {'stage':>15} {'func':>4}"
        f" {'tensors':>7} {'scalars':>7} {'overwritten':>11}"
    )
    print(header)
    print("-" * 92)

    for position, record in enumerate(args_records):
        # Left half: where the record sits and which task/stage produced it.
        identity_cols = (
            f"{position:>6} {record['task_id']:>18} {record['subtask_id']:>1} {record['stage']:>15}"
        )
        # Right half: kernel function id and the argument counts.
        count_cols = f" {record['func_id']:>4} {record['tensor_count']:>7} {record['scalar_count']:>7}"
        overwritten_text = str(record.get('overwritten', False))
        print(f"{identity_cols}{count_cols} {overwritten_text:>11}")


def _resolve_dump_dir(dump_dir_arg: str | None) -> Path:
if dump_dir_arg is not None:
return Path(dump_dir_arg)
Expand Down Expand Up @@ -267,6 +282,7 @@ def main():
parser.add_argument("--stage", "-s", help="Filter by stage (before / after)")
parser.add_argument("--role", "-r", help="Filter by role (input / output / inout)")
parser.add_argument("--arg", "-a", type=int, help="Filter by arg_index")
parser.add_argument("--args", action="store_true", help="List dumped task args instead of tensors")
parser.add_argument("--index", "-i", type=int, help="Select tensor by index in manifest")
parser.add_argument("--export", "-e", action="store_true", help="Export filtered tensors to txt")
args = parser.parse_args()
Expand All @@ -283,6 +299,14 @@ def main():
bin_path = dump_dir / manifest.get("bin_file", "tensors.bin")
tensors = manifest["tensors"]

if args.args:
args_records = manifest.get("args", [])
if not args_records:
print("No args records found in manifest.", file=sys.stderr)
sys.exit(1)
list_args(args_records)
return

filtered = _apply_filters(tensors, args)

# --- Select by index ---
Expand Down
60 changes: 60 additions & 0 deletions src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,58 @@ int32_t count_callable_tensor_args(const CoreCallable &callable);
bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage);
bool try_log_tensor_dump_layout_mismatch();
int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);
int dump_args_record(int thread_idx, const ArgsDumpInfo &info);

/**
 * Snapshot the argument descriptors of one task payload and pass them to
 * dump_args_record().
 *
 * @tparam MaxTensorArgs  Upper bound applied to payload.tensor_count; also the
 *                        capacity of the local descriptor array.
 * @tparam MaxScalarArgs  Upper bound applied to payload.scalar_count.
 * @tparam PayloadT       Payload type exposing tensor_count, scalar_count,
 *                        tensors[] and scalars.
 *
 * @param thread_idx  Forwarded unchanged to dump_args_record().
 * @param task_id     Stored in ArgsDumpInfo::task_id.
 * @param subtask_id  Stored in ArgsDumpInfo::subtask_id.
 * @param func_id     Stored in ArgsDumpInfo::func_id after a cast to uint32_t.
 * @param payload     Source of the tensor descriptors and scalar values.
 * @param stage       Stored in ArgsDumpInfo::stage.
 *
 * The status returned by dump_args_record() is discarded; this function
 * returns nothing.
 */
template <int MaxTensorArgs, int MaxScalarArgs, typename PayloadT>
inline void dump_args_for_payload(
    int32_t thread_idx, uint64_t task_id, uint8_t subtask_id, int32_t func_id, const PayloadT &payload,
    TensorDumpStage stage
) {
    // Clamp both counts into [0, Max*Args] so an out-of-range count in the
    // payload can never index past tensor_entries or exceed the caller's limit.
    int32_t tensor_count = payload.tensor_count;
    if (tensor_count < 0) {
        tensor_count = 0;
    }
    if (tensor_count > MaxTensorArgs) {
        tensor_count = MaxTensorArgs;
    }
    int32_t scalar_count = payload.scalar_count;
    if (scalar_count < 0) {
        scalar_count = 0;
    }
    if (scalar_count > MaxScalarArgs) {
        scalar_count = MaxScalarArgs;
    }

    // Zero-initialised so unused entries and unused dimension slots read as 0.
    ArgsDumpTensorEntry tensor_entries[MaxTensorArgs] = {};
    const uint32_t max_dims = static_cast<uint32_t>(PLATFORM_DUMP_MAX_DIMS);

    for (int32_t i = 0; i < tensor_count; i++) {
        const auto &t = payload.tensors[i];
        ArgsDumpTensorEntry &entry = tensor_entries[i];
        entry.buffer_addr = t.buffer.addr;
        entry.buffer_size = t.buffer.size;
        entry.owner_task_id = t.owner_task_id.raw;
        entry.dtype = static_cast<uint8_t>(t.dtype);
        entry.is_contiguous = t.is_raw_eq_shapes ? 1 : 0;
        entry.is_all_offset_zero = t.is_all_offset_zero ? 1 : 0;

        // Record the number of dimensions actually copied below. Storing the
        // unclamped source value would let a reader that trusts `ndims` index
        // past the PLATFORM_DUMP_MAX_DIMS-sized shape/offset arrays.
        const uint32_t src_ndims = static_cast<uint32_t>(t.ndims);
        const uint32_t ndims = src_ndims < max_dims ? src_ndims : max_dims;
        entry.ndims = ndims;

        const uint32_t *raw_shapes = t.get_raw_shapes();
        for (uint32_t d = 0; d < ndims; d++) {
            entry.shapes[d] = t.shapes[d];
            // When the tensor is flagged all-offset-zero, t.offsets is not read
            // and 0 is recorded instead.
            entry.offsets[d] = t.is_all_offset_zero ? 0 : t.offsets[d];
            entry.raw_shapes[d] = raw_shapes[d];
        }
    }

    ArgsDumpInfo info = {};
    info.task_id = task_id;
    info.subtask_id = subtask_id;
    info.stage = stage;
    info.func_id = static_cast<uint32_t>(func_id);
    info.tensor_count = static_cast<uint32_t>(tensor_count);
    info.scalar_count = static_cast<uint32_t>(scalar_count);
    // tensor_entries lives on this stack frame, so the pointer handed out here
    // is only valid for the duration of the dump_args_record() call.
    info.tensors = tensor_entries;
    info.scalars = payload.scalars;
    dump_args_record(thread_idx, info);
}

template <int MaxSubtaskSlots, typename SlotStateT, typename IsSubtaskActiveFn, typename GetFunctionBinAddrFn>
inline void dump_tensors_for_task(
Expand Down Expand Up @@ -261,6 +313,14 @@ void dump_tensor_init(int num_dump_threads);
*/
int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);

/**
* Record a task's runtime argument descriptors.
*
* Copies an ArgsDumpPayloadHeader, ArgsDumpTensorEntry array, and raw scalar
* values into the existing dump arena, then appends an ARGS metadata record.
*/
int dump_args_record(int thread_idx, const ArgsDumpInfo &info);

/**
* Flush remaining tensor dump data for a thread.
*
Expand Down
51 changes: 50 additions & 1 deletion src/a2a3/platform/include/common/tensor_dump.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,55 @@ enum class TensorDumpStage : uint8_t {
AFTER_COMPLETION = 1,
};

// =============================================================================
// DumpRecordKind - Logical record type carried by the dump channel
// =============================================================================

// Discriminates the logical content of a record sent over the dump channel.
// The underlying type is uint8_t, matching the one-byte `kind` field of
// TensorDumpRecord, whose comment says it defaults to tensor for legacy records.
enum class DumpRecordKind : uint8_t {
// Record describes a dumped tensor; value 0.
TENSOR = 0,
// Record describes a task's argument descriptors (see dump_args_record()).
ARGS = 1,
};

// =============================================================================
// Args dump payload schema
// =============================================================================

// Schema version of the args dump payload.
// NOTE(review): presumably the value written to ArgsDumpPayloadHeader::version
// and bumped on layout changes — confirm against the writer and host parser.
constexpr uint32_t ARGS_DUMP_PAYLOAD_VERSION = 1;

// Header of an args dump payload. Per the dump_args_record() documentation the
// payload consists of this header, an ArgsDumpTensorEntry array, and the raw
// scalar values. Six uint32_t fields, i.e. 24 bytes.
struct ArgsDumpPayloadHeader {
// Payload schema version (presumably ARGS_DUMP_PAYLOAD_VERSION — confirm).
uint32_t version;
// Number of ArgsDumpTensorEntry elements in the payload.
uint32_t tensor_count;
// Number of scalar values in the payload.
uint32_t scalar_count;
// Size in bytes of one tensor entry (presumably sizeof(ArgsDumpTensorEntry) — confirm).
uint32_t tensor_entry_size;
// Size in bytes of one scalar (presumably sizeof(uint64_t), the scalar type
// used by ArgsDumpInfo — confirm).
uint32_t scalar_entry_size;
// Unused by anything visible in this header.
uint32_t reserved;
};

// Descriptor of one tensor argument as recorded in an args dump payload.
// It holds metadata only (buffer address/size, shapes, offsets, dtype) and has
// no field for tensor contents. dump_args_for_payload() fills one entry per
// tensor argument from the task payload.
struct ArgsDumpTensorEntry {
// Address of the tensor's backing buffer (copied from tensor.buffer.addr).
uint64_t buffer_addr;
// Size of the backing buffer (copied from tensor.buffer.size).
uint64_t buffer_size;
// Raw id of the task that owns the tensor (copied from owner_task_id.raw).
uint64_t owner_task_id;
// Per-dimension arrays; the filler writes at most PLATFORM_DUMP_MAX_DIMS
// leading slots and leaves the remainder zero-initialised.
uint32_t shapes[PLATFORM_DUMP_MAX_DIMS];
uint32_t raw_shapes[PLATFORM_DUMP_MAX_DIMS];
// Recorded as 0 for every dimension when is_all_offset_zero is set.
uint32_t offsets[PLATFORM_DUMP_MAX_DIMS];
// Dimension count taken from the source tensor.
uint32_t ndims;
// Raw DataType enum value narrowed to one byte.
uint8_t dtype;
// 1 when the source tensor's is_raw_eq_shapes flag is set, else 0.
uint8_t is_contiguous;
// 1 when the source tensor's is_all_offset_zero flag is set, else 0.
uint8_t is_all_offset_zero;
// Not written by dump_args_for_payload(); stays 0 from zero-initialisation there.
uint8_t reserved;
};
Comment on lines +97 to +109
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The ArgsDumpTensorEntry struct should be 64-byte aligned to ensure optimal cache performance and prevent regressions. Please adjust the layout or add padding to make the struct size a multiple of 64, and include a static_assert to verify the alignment of critical members.

References
  1. Ensure critical struct layout alignments (especially for cache performance) are 64-byte aligned and protected by static_assert to prevent regressions.


// Input descriptor passed by const reference to dump_args_record().
// It does not own the arrays it points at: dump_args_for_payload() points
// `tensors` at a stack-local array, so the pointers are only guaranteed to be
// valid for the duration of that call.
struct ArgsDumpInfo {
// Id of the task whose arguments are being recorded.
uint64_t task_id;
// Subtask slot within the task.
uint8_t subtask_id;
// Stage at which the arguments were captured.
TensorDumpStage stage;
// Kernel function id (cast from int32_t by dump_args_for_payload()).
uint32_t func_id;
// Number of elements readable through `tensors`.
uint32_t tensor_count;
// Number of elements readable through `scalars`.
uint32_t scalar_count;
// Array of tensor descriptors; not owned.
const ArgsDumpTensorEntry *tensors;
// Array of raw 64-bit scalar argument values; not owned.
const uint64_t *scalars;
};

// =============================================================================
// TensorDumpRecord - Single Tensor Dump Entry (128B = 2 cache lines)
// =============================================================================
Expand All @@ -92,7 +141,7 @@ struct alignas(64) TensorDumpRecord {
uint8_t dtype; // DataType raw enum value
uint8_t truncated; // 1 if payload was truncated (tensor > arena capacity)
uint8_t is_contiguous; // 1 when source view is already contiguous
uint8_t pad0_align; // Explicit alignment before 64-bit payload offsets
uint8_t kind; // DumpRecordKind; defaults to tensor for legacy records
uint64_t payload_offset; // Monotonic byte offset into thread arena
uint64_t payload_size; // Bytes actually copied (may be < full tensor bytes)
uint8_t pad0[24]; // Preserve 64B cache-line layout
Expand Down
13 changes: 13 additions & 0 deletions src/a2a3/platform/include/host/tensor_dump_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,18 @@ struct DumpedTensor {
std::vector<uint8_t> bytes;
};

// Host-side copy of one args record, stored in
// TensorDumpCollector::collected_args_.
// NOTE(review): field meanings below are read from the names and from the
// `args` entries of the documented manifest — confirm against the collector
// implementation.
struct DumpedArgs {
// Id of the task the args belong to.
uint64_t task_id;
// Subtask slot within the task.
uint8_t subtask_id;
// Kernel function id.
uint32_t func_id;
// Stage at which the args were captured.
TensorDumpStage stage;
// Number of tensor descriptors in the payload.
uint32_t tensor_count;
// Number of scalar values in the payload.
uint32_t scalar_count;
// Payload size in bytes as reported by the record.
uint64_t payload_size;
// Presumably true when the payload was overwritten before the host read it.
bool overwritten;
// Payload bytes held by this record.
std::vector<uint8_t> bytes;
};

class TensorDumpCollector : public profiling_common::ProfilerBase<TensorDumpCollector, DumpModule> {
public:
TensorDumpCollector() = default;
Expand Down Expand Up @@ -283,6 +295,7 @@ class TensorDumpCollector : public profiling_common::ProfilerBase<TensorDumpColl

// Collected dump tensors
std::vector<DumpedTensor> collected_;
std::vector<DumpedArgs> collected_args_;
std::mutex collected_mutex_;

// Stats
Expand Down
Loading