diff --git a/docs/dfx/tensor-dump.md b/docs/dfx/tensor-dump.md index 3bcb14320..cfdf0f49c 100644 --- a/docs/dfx/tensor-dump.md +++ b/docs/dfx/tensor-dump.md @@ -15,6 +15,13 @@ execution, and the host exports a JSON manifest plus a binary payload. The result is a stable, replayable record of every tensor a kernel saw, without the timing distortion of inline printing. +The same device-to-host dump channel also carries task argument +descriptors. When tensor dump is enabled, AICPU records the runtime +args it sees at dispatch time: tensor buffer descriptors plus scalar +values. These `args` entries share the tensor dump lifecycle, queues, +arena, and output directory so they can be correlated with swimlane +and PMU task ids without opening a second DFX path. + ## 2. Overview - **Per-task input/output capture.** Inputs snapshotted before @@ -26,6 +33,9 @@ saw, without the timing distortion of inline printing. - **Manifest + binary payload.** A single JSON manifest plus one `.bin` payload per run; each manifest entry has `bin_offset` / `bin_size` into the payload. +- **Args descriptors.** The manifest also includes an `args` array + with per-dispatch tensor descriptors and scalar values observed on + device. - **Cross-architecture.** Same `--dump-tensor` flag, same on-disk format on `a2a3` and `a5`. Both runtimes are wired through. 
@@ -90,6 +100,7 @@ Example manifest (one input tensor captured before dispatch): "byte_order": "little_endian" }, "total_tensors": 1, + "total_args": 1, "before_dispatch": 1, "after_completion": 0, "input_tensors": 1, @@ -99,6 +110,33 @@ Example manifest (one input tensor captured before dispatch): "dropped_records": 0, "dropped_overwrite": 0, "bin_file": "tensor_dump.bin", + "args": [ + { + "task_id": "0x0000000200000a00", + "subtask_id": 0, + "func_id": 0, + "stage": "before_dispatch", + "tensor_count": 1, + "scalar_count": 1, + "payload_size": 128, + "overwritten": false, + "tensors": [ + { + "arg_index": 0, + "buffer_addr": "0x100000", + "buffer_size": 65536, + "owner_task_id": "0xffffffffffffffff", + "dtype": "float32", + "shape": [16384], + "raw_shape": [16384], + "offsets": [0], + "is_contiguous": true, + "is_all_offset_zero": true + } + ], + "scalars": ["0x40"] + } + ], "tensors": [ { "task_id": "0x0000000200000a00", @@ -155,6 +193,9 @@ python -m simpler_setup.tools.dump_viewer --func 0 --stage before --role input - # Export one specific entry by its manifest index python -m simpler_setup.tools.dump_viewer --index 42 +# List dumped task args in the latest run +python -m simpler_setup.tools.dump_viewer --args + # Pin to a specific dump directory python -m simpler_setup.tools.dump_viewer outputs/_/tensor_dump \ --task 0x0000000200000a00 --export diff --git a/simpler_setup/tools/dump_viewer.py b/simpler_setup/tools/dump_viewer.py index f4f02187c..f347ce966 100644 --- a/simpler_setup/tools/dump_viewer.py +++ b/simpler_setup/tools/dump_viewer.py @@ -15,6 +15,7 @@ --stage Filter by stage (before / after) --role Filter by role (input / output / inout) --arg Filter by arg_index (int) + --args List dumped task args instead of tensors With no filters: lists all tensors. With filters: lists matching tensors. Add --export to save them to txt. 
@@ -194,6 +195,20 @@ def list_tensors(tensors: list): ) +def list_args(args_records: list): + print( + f"{'idx':>6} {'task_id':>18} {'s':>1} {'stage':>15} {'func':>4}" + f" {'tensors':>7} {'scalars':>7} {'overwritten':>11}" + ) + print("-" * 92) + for i, rec in enumerate(args_records): + print( + f"{i:>6} {rec['task_id']:>18} {rec['subtask_id']:>1} {rec['stage']:>15}" + f" {rec['func_id']:>4} {rec['tensor_count']:>7} {rec['scalar_count']:>7}" + f" {str(rec.get('overwritten', False)):>11}" + ) + + def _resolve_dump_dir(dump_dir_arg: str | None) -> Path: if dump_dir_arg is not None: return Path(dump_dir_arg) @@ -267,6 +282,7 @@ def main(): parser.add_argument("--stage", "-s", help="Filter by stage (before / after)") parser.add_argument("--role", "-r", help="Filter by role (input / output / inout)") parser.add_argument("--arg", "-a", type=int, help="Filter by arg_index") + parser.add_argument("--args", action="store_true", help="List dumped task args instead of tensors") parser.add_argument("--index", "-i", type=int, help="Select tensor by index in manifest") parser.add_argument("--export", "-e", action="store_true", help="Export filtered tensors to txt") args = parser.parse_args() @@ -283,6 +299,14 @@ def main(): bin_path = dump_dir / manifest.get("bin_file", "tensors.bin") tensors = manifest["tensors"] + if args.args: + args_records = manifest.get("args", []) + if not args_records: + print("No args records found in manifest.", file=sys.stderr) + sys.exit(1) + list_args(args_records) + return + filtered = _apply_filters(tensors, args) # --- Select by index --- diff --git a/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h b/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h index 193719668..c00640d73 100644 --- a/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h +++ b/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h @@ -75,6 +75,58 @@ int32_t count_callable_tensor_args(const CoreCallable &callable); bool should_dump_tensor_at_stage(TensorDumpRole role, 
TensorDumpStage stage); bool try_log_tensor_dump_layout_mismatch(); int dump_tensor_record(int thread_idx, const TensorDumpInfo &info); +int dump_args_record(int thread_idx, const ArgsDumpInfo &info); + +template +inline void dump_args_for_payload( + int32_t thread_idx, uint64_t task_id, uint8_t subtask_id, int32_t func_id, const PayloadT &payload, + TensorDumpStage stage +) { + ArgsDumpTensorEntry tensor_entries[MaxTensorArgs] = {}; + int32_t tensor_count = payload.tensor_count; + if (tensor_count < 0) { + tensor_count = 0; + } + if (tensor_count > MaxTensorArgs) { + tensor_count = MaxTensorArgs; + } + int32_t scalar_count = payload.scalar_count; + if (scalar_count < 0) { + scalar_count = 0; + } + if (scalar_count > MaxScalarArgs) { + scalar_count = MaxScalarArgs; + } + + for (int32_t i = 0; i < tensor_count; i++) { + const auto &t = payload.tensors[i]; + ArgsDumpTensorEntry &entry = tensor_entries[i]; + entry.buffer_addr = t.buffer.addr; + entry.buffer_size = t.buffer.size; + entry.owner_task_id = t.owner_task_id.raw; + entry.ndims = t.ndims; + entry.dtype = static_cast(t.dtype); + entry.is_contiguous = t.is_raw_eq_shapes ? 1 : 0; + entry.is_all_offset_zero = t.is_all_offset_zero ? 1 : 0; + const uint32_t *raw_shapes = t.get_raw_shapes(); + for (uint32_t d = 0; d < t.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { + entry.shapes[d] = t.shapes[d]; + entry.offsets[d] = t.is_all_offset_zero ? 
0 : t.offsets[d]; + entry.raw_shapes[d] = raw_shapes[d]; + } + } + + ArgsDumpInfo info = {}; + info.task_id = task_id; + info.subtask_id = subtask_id; + info.stage = stage; + info.func_id = static_cast(func_id); + info.tensor_count = static_cast(tensor_count); + info.scalar_count = static_cast(scalar_count); + info.tensors = tensor_entries; + info.scalars = payload.scalars; + dump_args_record(thread_idx, info); +} template inline void dump_tensors_for_task( @@ -261,6 +313,14 @@ void dump_tensor_init(int num_dump_threads); */ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info); +/** + * Record a task's runtime argument descriptors. + * + * Copies an ArgsDumpPayloadHeader, ArgsDumpTensorEntry array, and raw scalar + * values into the existing dump arena, then appends an ARGS metadata record. + */ +int dump_args_record(int thread_idx, const ArgsDumpInfo &info); + /** * Flush remaining tensor dump data for a thread. * diff --git a/src/a2a3/platform/include/common/tensor_dump.h b/src/a2a3/platform/include/common/tensor_dump.h index eccc2e665..b56e62daf 100644 --- a/src/a2a3/platform/include/common/tensor_dump.h +++ b/src/a2a3/platform/include/common/tensor_dump.h @@ -70,6 +70,55 @@ enum class TensorDumpStage : uint8_t { AFTER_COMPLETION = 1, }; +// ============================================================================= +// DumpRecordKind - Logical record type carried by the dump channel +// ============================================================================= + +enum class DumpRecordKind : uint8_t { + TENSOR = 0, + ARGS = 1, +}; + +// ============================================================================= +// Args dump payload schema +// ============================================================================= + +constexpr uint32_t ARGS_DUMP_PAYLOAD_VERSION = 1; + +struct ArgsDumpPayloadHeader { + uint32_t version; + uint32_t tensor_count; + uint32_t scalar_count; + uint32_t tensor_entry_size; + uint32_t scalar_entry_size; + 
uint32_t reserved; +}; + +struct ArgsDumpTensorEntry { + uint64_t buffer_addr; + uint64_t buffer_size; + uint64_t owner_task_id; + uint32_t shapes[PLATFORM_DUMP_MAX_DIMS]; + uint32_t raw_shapes[PLATFORM_DUMP_MAX_DIMS]; + uint32_t offsets[PLATFORM_DUMP_MAX_DIMS]; + uint32_t ndims; + uint8_t dtype; + uint8_t is_contiguous; + uint8_t is_all_offset_zero; + uint8_t reserved; +}; + +struct ArgsDumpInfo { + uint64_t task_id; + uint8_t subtask_id; + TensorDumpStage stage; + uint32_t func_id; + uint32_t tensor_count; + uint32_t scalar_count; + const ArgsDumpTensorEntry *tensors; + const uint64_t *scalars; +}; + // ============================================================================= // TensorDumpRecord - Single Tensor Dump Entry (128B = 2 cache lines) // ============================================================================= @@ -92,7 +141,7 @@ struct alignas(64) TensorDumpRecord { uint8_t dtype; // DataType raw enum value uint8_t truncated; // 1 if payload was truncated (tensor > arena capacity) uint8_t is_contiguous; // 1 when source view is already contiguous - uint8_t pad0_align; // Explicit alignment before 64-bit payload offsets + uint8_t kind; // DumpRecordKind; defaults to tensor for legacy records uint64_t payload_offset; // Monotonic byte offset into thread arena uint64_t payload_size; // Bytes actually copied (may be < full tensor bytes) uint8_t pad0[24]; // Preserve 64B cache-line layout diff --git a/src/a2a3/platform/include/host/tensor_dump_collector.h b/src/a2a3/platform/include/host/tensor_dump_collector.h index 7fd5cd14e..806252509 100644 --- a/src/a2a3/platform/include/host/tensor_dump_collector.h +++ b/src/a2a3/platform/include/host/tensor_dump_collector.h @@ -176,6 +176,18 @@ struct DumpedTensor { std::vector bytes; }; +struct DumpedArgs { + uint64_t task_id; + uint8_t subtask_id; + uint32_t func_id; + TensorDumpStage stage; + uint32_t tensor_count; + uint32_t scalar_count; + uint64_t payload_size; + bool overwritten; + std::vector bytes; 
+}; + class TensorDumpCollector : public profiling_common::ProfilerBase { public: TensorDumpCollector() = default; @@ -283,6 +295,7 @@ class TensorDumpCollector : public profiling_common::ProfilerBase collected_; + std::vector collected_args_; std::mutex collected_mutex_; // Stats diff --git a/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp b/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp index 7f485cf1d..8c0196f65 100644 --- a/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp +++ b/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp @@ -326,6 +326,23 @@ static inline void write_tensor_dump_logical_prefix( gather_tensor_dump_dim(writer, info, elem_sz, 0, 0, &remaining_bytes); } +static DumpMetaBuffer *ensure_dump_record_slot(int thread_idx) { + if (s_dump_header == nullptr || thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) { + return nullptr; + } + DumpMetaBuffer *buf = s_current_dump_buf[thread_idx]; + if (buf == nullptr) { + return nullptr; + } + if (buf->count >= PLATFORM_DUMP_RECORDS_PER_BUFFER) { + if (switch_dump_meta_buffer(thread_idx) != 0) { + return nullptr; + } + buf = s_current_dump_buf[thread_idx]; + } + return buf; +} + void dump_tensor_init(int num_dump_threads) { void *dump_base = reinterpret_cast(get_platform_dump_base()); if (dump_base == nullptr) { @@ -373,29 +390,12 @@ void dump_tensor_init(int num_dump_threads) { } int dump_tensor_record(int thread_idx, const TensorDumpInfo &info) { - if (s_dump_header == nullptr) { + DumpMetaBuffer *buf = ensure_dump_record_slot(thread_idx); + DumpBufferState *state = + (thread_idx >= 0 && thread_idx < PLATFORM_MAX_AICPU_THREADS) ? 
s_dump_states[thread_idx] : nullptr; + if (state == nullptr || buf == nullptr) { return -1; } - if (thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) { - return -1; - } - - DumpBufferState *state = s_dump_states[thread_idx]; - DumpMetaBuffer *buf = s_current_dump_buf[thread_idx]; - if (buf == nullptr) { - return -1; - } - - // Switch metadata buffer if full - if (buf->count >= PLATFORM_DUMP_RECORDS_PER_BUFFER) { - if (switch_dump_meta_buffer(thread_idx) != 0) { - return -1; // No free buffer - } - buf = s_current_dump_buf[thread_idx]; - if (buf == nullptr) { - return -1; - } - } // Reserve space in arena // Compute actual tensor data size from shape (not buffer.size which may include padding) @@ -435,6 +435,7 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info) { rec->ndims = info.ndims; rec->dtype = info.dtype; rec->truncated = truncated ? 1 : 0; + rec->kind = static_cast(DumpRecordKind::TENSOR); rec->payload_offset = offset; rec->payload_size = copy_bytes; for (int d = 0; d < info.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { @@ -450,6 +451,61 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info) { return 0; } +int dump_args_record(int thread_idx, const ArgsDumpInfo &info) { + DumpMetaBuffer *buf = ensure_dump_record_slot(thread_idx); + DumpBufferState *state = + (thread_idx >= 0 && thread_idx < PLATFORM_MAX_AICPU_THREADS) ? 
s_dump_states[thread_idx] : nullptr; + if (state == nullptr || buf == nullptr) { + return -1; + } + + uint64_t payload_size = sizeof(ArgsDumpPayloadHeader) + + static_cast(info.tensor_count) * sizeof(ArgsDumpTensorEntry) + + static_cast(info.scalar_count) * sizeof(uint64_t); + if (payload_size > state->arena_size) { + account_dropped_records(state, 1); + return 0; + } + + uint64_t offset = state->arena_write_offset; + state->arena_write_offset = offset + payload_size; + + char *arena = reinterpret_cast(state->arena_base); + CircularArenaWriter writer = {arena, state->arena_size, offset, 0}; + ArgsDumpPayloadHeader header = {}; + header.version = ARGS_DUMP_PAYLOAD_VERSION; + header.tensor_count = info.tensor_count; + header.scalar_count = info.scalar_count; + header.tensor_entry_size = sizeof(ArgsDumpTensorEntry); + header.scalar_entry_size = sizeof(uint64_t); + writer.write(&header, sizeof(header)); + if (info.tensor_count > 0 && info.tensors != nullptr) { + writer.write(info.tensors, static_cast(info.tensor_count) * sizeof(ArgsDumpTensorEntry)); + } + if (info.scalar_count > 0 && info.scalars != nullptr) { + writer.write(info.scalars, static_cast(info.scalar_count) * sizeof(uint64_t)); + } + wmb(); + + uint32_t idx = buf->count; + TensorDumpRecord *rec = &buf->records[idx]; + memset(rec, 0, sizeof(*rec)); + rec->task_id = info.task_id; + rec->subtask_id = info.subtask_id; + rec->func_id = info.func_id; + rec->stage = static_cast(info.stage); + rec->kind = static_cast(DumpRecordKind::ARGS); + rec->payload_offset = offset; + rec->payload_size = payload_size; + rec->shapes[0] = info.tensor_count; + rec->shapes[1] = info.scalar_count; + buf->count = idx + 1; + wmb(); + + s_records_written[thread_idx]++; + return 0; +} + void dump_tensor_flush(int thread_idx) { if (s_dump_header == nullptr) { return; diff --git a/src/a2a3/platform/src/host/tensor_dump_collector.cpp b/src/a2a3/platform/src/host/tensor_dump_collector.cpp index f1e5806d2..d458b3147 100644 --- 
a/src/a2a3/platform/src/host/tensor_dump_collector.cpp +++ b/src/a2a3/platform/src/host/tensor_dump_collector.cpp @@ -169,6 +169,53 @@ void TensorDumpCollector::process_dump_buffer(const DumpReadyBufferInfo &info) { for (uint32_t i = 0; i < count; i++) { const TensorDumpRecord &rec = buf->records[i]; + if (static_cast(rec.kind) == DumpRecordKind::ARGS) { + DumpedArgs da; + da.task_id = rec.task_id; + da.subtask_id = rec.subtask_id; + da.func_id = rec.func_id; + da.stage = static_cast(rec.stage); + da.tensor_count = rec.shapes[0]; + da.scalar_count = rec.shapes[1]; + da.payload_size = rec.payload_size; + da.overwritten = false; + + int thread_idx = static_cast(info.thread_index); + if (thread_idx < static_cast(arenas_.size())) { + ArenaInfo &ai = arenas_[thread_idx]; + char *arena_host = reinterpret_cast(ai.host_ptr); + uint64_t arena_sz = ai.size; + uint64_t high_water = ai.high_water; + if (high_water > arena_sz && rec.payload_offset < high_water - arena_sz) { + da.overwritten = true; + if (++total_overwrite_count_ == 1) { + LOG_WARN( + "Dump args overwrite detected: host drain was slower than arena reuse. " + "Increase PLATFORM_DUMP_BUFFERS_PER_THREAD." 
+ ); + } + } + if (!da.overwritten && rec.payload_size > 0) { + da.bytes.resize(rec.payload_size); + uint64_t pos = rec.payload_offset % arena_sz; + if (pos + rec.payload_size <= arena_sz) { + memcpy(da.bytes.data(), arena_host + pos, rec.payload_size); + } else { + uint64_t first = arena_sz - pos; + memcpy(da.bytes.data(), arena_host + pos, first); + memcpy(da.bytes.data() + first, arena_host, rec.payload_size - first); + } + } + uint64_t end_offset = rec.payload_offset + rec.payload_size; + if (end_offset > ai.high_water) { + ai.high_water = end_offset; + } + } + + std::lock_guard lock(collected_mutex_); + collected_args_.push_back(std::move(da)); + continue; + } DumpedTensor dt; dt.task_id = rec.task_id; @@ -292,6 +339,40 @@ static std::string dims_to_string(const uint32_t dims[], int ndims) { return ss.str(); } +static ArgsDumpPayloadHeader read_args_payload_header(const DumpedArgs &da) { + ArgsDumpPayloadHeader header = {}; + if (da.bytes.size() >= sizeof(header)) { + memcpy(&header, da.bytes.data(), sizeof(header)); + } + return header; +} + +static bool read_args_tensor_entry(const DumpedArgs &da, const ArgsDumpPayloadHeader &header, uint32_t index, ArgsDumpTensorEntry *out) { + if (header.tensor_entry_size != sizeof(ArgsDumpTensorEntry)) { + return false; + } + size_t offset = sizeof(ArgsDumpPayloadHeader) + static_cast(index) * sizeof(ArgsDumpTensorEntry); + if (offset + sizeof(ArgsDumpTensorEntry) > da.bytes.size()) { + return false; + } + memcpy(out, da.bytes.data() + offset, sizeof(ArgsDumpTensorEntry)); + return true; +} + +static bool read_args_scalar(const DumpedArgs &da, const ArgsDumpPayloadHeader &header, uint32_t index, uint64_t *out) { + if (header.tensor_entry_size != sizeof(ArgsDumpTensorEntry) || header.scalar_entry_size != sizeof(uint64_t)) { + return false; + } + size_t offset = sizeof(ArgsDumpPayloadHeader) + + static_cast(header.tensor_count) * sizeof(ArgsDumpTensorEntry) + + static_cast(index) * sizeof(uint64_t); + if (offset + 
sizeof(uint64_t) > da.bytes.size()) { + return false; + } + memcpy(out, da.bytes.data() + offset, sizeof(uint64_t)); + return true; +} + void TensorDumpCollector::start_writer_thread_once() { if (writer_started_) return; writer_started_ = true; @@ -449,8 +530,8 @@ int TensorDumpCollector::export_dump_files() { ); } - if (collected_.empty()) { - LOG_WARN("No tensor dump data to export"); + if (collected_.empty() && collected_args_.empty()) { + LOG_WARN("No dump data to export"); writer_started_ = false; return 0; } @@ -465,8 +546,14 @@ int TensorDumpCollector::export_dump_files() { if (a.arg_index != b.arg_index) return a.arg_index < b.arg_index; return static_cast(a.role) < static_cast(b.role); }); + std::sort(collected_args_.begin(), collected_args_.end(), [](const DumpedArgs &a, const DumpedArgs &b) { + if (a.task_id != b.task_id) return a.task_id < b.task_id; + if (a.subtask_id != b.subtask_id) return a.subtask_id < b.subtask_id; + if (a.func_id != b.func_id) return a.func_id < b.func_id; + return static_cast(a.stage) < static_cast(b.stage); + }); - LOG_INFO_V0("Writing JSON manifest for %zu tensors...", collected_.size()); + LOG_INFO_V0("Writing JSON manifest for %zu tensors and %zu args records...", collected_.size(), collected_args_.size()); uint32_t num_before_dispatch = 0; uint32_t num_after_completion = 0; @@ -502,6 +589,7 @@ int TensorDumpCollector::export_dump_files() { json << " \"byte_order\": \"little_endian\"\n"; json << " },\n"; json << " \"total_tensors\": " << collected_.size() << ",\n"; + json << " \"total_args\": " << collected_args_.size() << ",\n"; json << " \"before_dispatch\": " << num_before_dispatch << ",\n"; json << " \"after_completion\": " << num_after_completion << ",\n"; json << " \"input_tensors\": " << num_input_tensors << ",\n"; @@ -538,6 +626,46 @@ int TensorDumpCollector::export_dump_files() { << ", \"overwritten\": " << (dt.overwritten ? 
"true" : "false") << "}"; } + json << "\n ],\n"; + json << " \"args\": [\n"; + + for (size_t i = 0; i < collected_args_.size(); i++) { + const DumpedArgs &da = collected_args_[i]; + ArgsDumpPayloadHeader header = read_args_payload_header(da); + if (i > 0) json << ",\n"; + json << " {\"task_id\": \"0x" << std::hex << std::setfill('0') << std::setw(16) << da.task_id << std::dec + << "\", \"subtask_id\": " << static_cast(da.subtask_id) << ", \"func_id\": " << da.func_id + << ", \"stage\": \"" << tensor_dump_stage_name(da.stage) << "\", \"tensor_count\": " << da.tensor_count + << ", \"scalar_count\": " << da.scalar_count << ", \"payload_size\": " << da.payload_size + << ", \"overwritten\": " << (da.overwritten ? "true" : "false") << ", \"tensors\": ["; + for (uint32_t t = 0; t < header.tensor_count; t++) { + ArgsDumpTensorEntry entry = {}; + if (!read_args_tensor_entry(da, header, t, &entry)) { + break; + } + if (t > 0) json << ", "; + json << "{\"arg_index\": " << t << ", \"buffer_addr\": \"0x" << std::hex << entry.buffer_addr << std::dec + << "\", \"buffer_size\": " << entry.buffer_size << ", \"owner_task_id\": \"0x" << std::hex + << entry.owner_task_id << std::dec << "\", \"dtype\": \"" + << get_dtype_name_from_raw(entry.dtype) << "\", \"shape\": " + << dims_to_string(entry.shapes, static_cast(entry.ndims)) << ", \"raw_shape\": " + << dims_to_string(entry.raw_shapes, static_cast(entry.ndims)) << ", \"offsets\": " + << dims_to_string(entry.offsets, static_cast(entry.ndims)) << ", \"is_contiguous\": " + << (entry.is_contiguous ? "true" : "false") << ", \"is_all_offset_zero\": " + << (entry.is_all_offset_zero ? 
"true" : "false") << "}"; + } + json << "], \"scalars\": ["; + for (uint32_t s = 0; s < header.scalar_count; s++) { + uint64_t value = 0; + if (!read_args_scalar(da, header, s, &value)) { + break; + } + if (s > 0) json << ", "; + json << "\"0x" << std::hex << value << std::dec << "\""; + } + json << "]}"; + } + json << "\n ]\n}\n"; json.close(); @@ -554,6 +682,7 @@ int TensorDumpCollector::export_dump_files() { // Clear state so subsequent runs don't accumulate data from previous runs collected_.clear(); + collected_args_.clear(); total_dropped_record_count_ = 0; total_truncated_count_ = 0; total_overwrite_count_ = 0; diff --git a/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp index 4b0c22cae..dc95f0c8f 100644 --- a/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp @@ -162,6 +162,68 @@ collect_task_tensor_buffer_addrs(const Runtime &runtime, const Task &task, uint6 } return found; } + +static uint64_t tensor_info_nbytes(const TensorInfo &info) { + uint64_t elements = 1; + for (uint32_t d = 0; d < info.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { + elements *= info.shapes[d]; + } + return elements * get_element_size(info.dtype); +} + +static void dump_task_args_record( + int thread_idx, const Task &task, const CoreCallable &callable, const TensorInfo *tensor_info, + int tensor_info_count, const uint64_t *buffer_addrs, int buffer_count +) { + ArgsDumpTensorEntry tensor_entries[RUNTIME_MAX_ARGS] = {}; + uint64_t scalar_values[RUNTIME_MAX_ARGS] = {}; + int tensor_arg_index = 0; + int recorded_tensor_count = 0; + int scalar_count = 0; + int sig_count = callable.sig_count(); + for (int32_t sig_idx = 0; sig_idx < sig_count && sig_idx < task.num_args; sig_idx++) { + ArgDirection dir = callable.sig(sig_idx); + if (dir == ArgDirection::SCALAR) { + scalar_values[scalar_count++] = task.args[sig_idx]; + continue; + } + if 
(tensor_arg_index < tensor_info_count && tensor_arg_index < buffer_count && tensor_arg_index < RUNTIME_MAX_ARGS) { + const TensorInfo &src = tensor_info[tensor_arg_index]; + ArgsDumpTensorEntry &entry = tensor_entries[tensor_arg_index]; + entry.buffer_addr = buffer_addrs[tensor_arg_index]; + entry.buffer_size = tensor_info_nbytes(src); + entry.owner_task_id = UINT64_MAX; + entry.ndims = src.ndims; + entry.dtype = static_cast(src.dtype); + entry.is_contiguous = 1; + entry.is_all_offset_zero = 1; + for (uint32_t d = 0; d < src.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { + entry.shapes[d] = src.shapes[d]; + entry.raw_shapes[d] = src.raw_shapes[d]; + entry.offsets[d] = src.offsets[d]; + if (src.shapes[d] != src.raw_shapes[d]) { + entry.is_contiguous = 0; + } + if (src.offsets[d] != 0) { + entry.is_all_offset_zero = 0; + } + } + recorded_tensor_count++; + } + tensor_arg_index++; + } + + ArgsDumpInfo info = {}; + info.task_id = static_cast(task.task_id); + info.subtask_id = 0; + info.stage = TensorDumpStage::BEFORE_DISPATCH; + info.func_id = static_cast(task.func_id); + info.tensor_count = static_cast(recorded_tensor_count); + info.scalar_count = static_cast(scalar_count); + info.tensors = tensor_entries; + info.scalars = scalar_values; + dump_args_record(thread_idx, info); +} #endif // ===== Helper Function Implementations ===== @@ -274,6 +336,10 @@ inline bool AicpuExecutor::try_dispatch_task( uint64_t tensor_buffer_addrs[RUNTIME_MAX_ARGS] = {}; int tensor_buffer_count = collect_task_tensor_buffer_addrs(runtime, *task, tensor_buffer_addrs, RUNTIME_MAX_ARGS); + dump_task_args_record( + thread_idx, *task, *callable, tensor_info, tensor_info_count, tensor_buffer_addrs, + tensor_buffer_count + ); dump_tensors_for_task( thread_idx, static_cast(task_id), 0, task->num_args, task->func_id, *callable, tensor_info, tensor_info_count, tensor_buffer_addrs, tensor_buffer_count, diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp 
b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index a028fd138..fabba2b6e 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -38,7 +38,17 @@ namespace { inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256; + +int32_t first_active_subtask_slot(const PTO2TaskSlotState &slot_state) { + for (int32_t slot = 0; slot < PTO2_SUBTASK_SLOT_COUNT; slot++) { + if (slot_state.active_mask.subtask_active(static_cast(slot)) && + slot_state.task->kernel_id[slot] != INVALID_KERNEL_ID) { + return slot; + } + } + return -1; } +} // namespace const char *SchedulerContext::shape_name(PTO2ResourceShape shape) { switch (shape) { @@ -212,6 +222,13 @@ void SchedulerContext::dispatch_block( ) { #if PTO2_PROFILING if (is_dump_tensor_enabled()) { + int32_t args_slot = first_active_subtask_slot(slot_state); + if (args_slot >= 0) { + dump_args_for_payload( + thread_idx, slot_state.task->task_id.raw, static_cast(args_slot), + slot_state.task->kernel_id[args_slot], *slot_state.payload, TensorDumpStage::BEFORE_DISPATCH + ); + } dump_tensors_for_task( thread_idx, slot_state, TensorDumpStage::BEFORE_DISPATCH, [](ActiveMask active_mask, int raw_subtask_id) { diff --git a/src/a5/platform/include/aicpu/tensor_dump_aicpu.h b/src/a5/platform/include/aicpu/tensor_dump_aicpu.h index 497154c1c..790afc00e 100644 --- a/src/a5/platform/include/aicpu/tensor_dump_aicpu.h +++ b/src/a5/platform/include/aicpu/tensor_dump_aicpu.h @@ -75,6 +75,58 @@ int32_t count_callable_tensor_args(const CoreCallable &callable); bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage); bool try_log_tensor_dump_layout_mismatch(); int dump_tensor_record(int thread_idx, const TensorDumpInfo &info); +int dump_args_record(int thread_idx, const ArgsDumpInfo &info); + +template +inline void dump_args_for_payload( + int32_t thread_idx, uint64_t 
task_id, uint8_t subtask_id, int32_t func_id, const PayloadT &payload, + TensorDumpStage stage +) { + ArgsDumpTensorEntry tensor_entries[MaxTensorArgs] = {}; + int32_t tensor_count = payload.tensor_count; + if (tensor_count < 0) { + tensor_count = 0; + } + if (tensor_count > MaxTensorArgs) { + tensor_count = MaxTensorArgs; + } + int32_t scalar_count = payload.scalar_count; + if (scalar_count < 0) { + scalar_count = 0; + } + if (scalar_count > MaxScalarArgs) { + scalar_count = MaxScalarArgs; + } + + for (int32_t i = 0; i < tensor_count; i++) { + const auto &t = payload.tensors[i]; + ArgsDumpTensorEntry &entry = tensor_entries[i]; + entry.buffer_addr = t.buffer.addr; + entry.buffer_size = t.buffer.size; + entry.owner_task_id = t.owner_task_id.raw; + entry.ndims = t.ndims; + entry.dtype = static_cast(t.dtype); + entry.is_contiguous = t.is_raw_eq_shapes ? 1 : 0; + entry.is_all_offset_zero = t.is_all_offset_zero ? 1 : 0; + const uint32_t *raw_shapes = t.get_raw_shapes(); + for (uint32_t d = 0; d < t.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { + entry.shapes[d] = t.shapes[d]; + entry.offsets[d] = t.is_all_offset_zero ? 0 : t.offsets[d]; + entry.raw_shapes[d] = raw_shapes[d]; + } + } + + ArgsDumpInfo info = {}; + info.task_id = task_id; + info.subtask_id = subtask_id; + info.stage = stage; + info.func_id = static_cast(func_id); + info.tensor_count = static_cast(tensor_count); + info.scalar_count = static_cast(scalar_count); + info.tensors = tensor_entries; + info.scalars = payload.scalars; + dump_args_record(thread_idx, info); +} template inline void dump_tensors_for_task( @@ -261,6 +313,14 @@ void dump_tensor_init(int num_dump_threads); */ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info); +/** + * Record a task's runtime argument descriptors. + * + * Copies an ArgsDumpPayloadHeader, ArgsDumpTensorEntry array, and raw scalar + * values into the existing dump arena, then appends an ARGS metadata record. 
+ */ +int dump_args_record(int thread_idx, const ArgsDumpInfo &info); + /** * Flush remaining tensor dump data for a thread. * diff --git a/src/a5/platform/include/common/tensor_dump.h b/src/a5/platform/include/common/tensor_dump.h index 80ab8d5f4..d983edf8f 100644 --- a/src/a5/platform/include/common/tensor_dump.h +++ b/src/a5/platform/include/common/tensor_dump.h @@ -74,6 +74,55 @@ enum class TensorDumpStage : uint8_t { AFTER_COMPLETION = 1, }; +// ============================================================================= +// DumpRecordKind - Logical record type carried by the dump channel +// ============================================================================= + +enum class DumpRecordKind : uint8_t { + TENSOR = 0, + ARGS = 1, +}; + +// ============================================================================= +// Args dump payload schema +// ============================================================================= + +constexpr uint32_t ARGS_DUMP_PAYLOAD_VERSION = 1; + +struct ArgsDumpPayloadHeader { + uint32_t version; + uint32_t tensor_count; + uint32_t scalar_count; + uint32_t tensor_entry_size; + uint32_t scalar_entry_size; + uint32_t reserved; +}; + +struct ArgsDumpTensorEntry { + uint64_t buffer_addr; + uint64_t buffer_size; + uint64_t owner_task_id; + uint32_t shapes[PLATFORM_DUMP_MAX_DIMS]; + uint32_t raw_shapes[PLATFORM_DUMP_MAX_DIMS]; + uint32_t offsets[PLATFORM_DUMP_MAX_DIMS]; + uint32_t ndims; + uint8_t dtype; + uint8_t is_contiguous; + uint8_t is_all_offset_zero; + uint8_t reserved; +}; + +struct ArgsDumpInfo { + uint64_t task_id; + uint8_t subtask_id; + TensorDumpStage stage; + uint32_t func_id; + uint32_t tensor_count; + uint32_t scalar_count; + const ArgsDumpTensorEntry *tensors; + const uint64_t *scalars; +}; + // ============================================================================= // TensorDumpRecord - Single Tensor Dump Entry (128B = 2 cache lines) // 
============================================================================= @@ -96,7 +145,7 @@ struct alignas(64) TensorDumpRecord { uint8_t dtype; // DataType raw enum value uint8_t truncated; // 1 if payload was truncated (tensor > arena capacity) uint8_t is_contiguous; // 1 when source view is already contiguous - uint8_t pad0_align; // Explicit alignment before 64-bit payload offsets + uint8_t kind; // DumpRecordKind; defaults to tensor for legacy records uint64_t payload_offset; // Monotonic byte offset into thread arena uint64_t payload_size; // Bytes actually copied (may be < full tensor bytes) uint8_t pad0[24]; // Preserve 64B cache-line layout diff --git a/src/a5/platform/include/host/tensor_dump_collector.h b/src/a5/platform/include/host/tensor_dump_collector.h index 57c3fa45b..538d79e4c 100644 --- a/src/a5/platform/include/host/tensor_dump_collector.h +++ b/src/a5/platform/include/host/tensor_dump_collector.h @@ -182,6 +182,18 @@ struct DumpedTensor { std::vector bytes; }; +struct DumpedArgs { + uint64_t task_id; + uint8_t subtask_id; + uint32_t func_id; + TensorDumpStage stage; + uint32_t tensor_count; + uint32_t scalar_count; + uint64_t payload_size; + bool overwritten; + std::vector bytes; +}; + class TensorDumpCollector : public profiling_common::ProfilerBase { public: TensorDumpCollector() = default; @@ -289,6 +301,7 @@ class TensorDumpCollector : public profiling_common::ProfilerBase collected_; + std::vector collected_args_; std::mutex collected_mutex_; // Stats diff --git a/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp b/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp index 9fee131fd..de6b08084 100644 --- a/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp +++ b/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp @@ -327,6 +327,23 @@ static inline void write_tensor_dump_logical_prefix( gather_tensor_dump_dim(writer, info, elem_sz, 0, 0, &remaining_bytes); } +static DumpMetaBuffer *ensure_dump_record_slot(int thread_idx) { + if (s_dump_header == 
nullptr || thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) { + return nullptr; + } + DumpMetaBuffer *buf = s_current_dump_buf[thread_idx]; + if (buf == nullptr) { + return nullptr; + } + if (buf->count >= PLATFORM_DUMP_RECORDS_PER_BUFFER) { + if (switch_dump_meta_buffer(thread_idx) != 0) { + return nullptr; + } + buf = s_current_dump_buf[thread_idx]; + } + return buf; +} + void dump_tensor_init(int num_dump_threads) { void *dump_base = reinterpret_cast(get_platform_dump_base()); if (dump_base == nullptr) { @@ -374,29 +391,12 @@ void dump_tensor_init(int num_dump_threads) { } int dump_tensor_record(int thread_idx, const TensorDumpInfo &info) { - if (s_dump_header == nullptr) { + DumpMetaBuffer *buf = ensure_dump_record_slot(thread_idx); + DumpBufferState *state = + (thread_idx >= 0 && thread_idx < PLATFORM_MAX_AICPU_THREADS) ? s_dump_states[thread_idx] : nullptr; + if (state == nullptr || buf == nullptr) { return -1; } - if (thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) { - return -1; - } - - DumpBufferState *state = s_dump_states[thread_idx]; - DumpMetaBuffer *buf = s_current_dump_buf[thread_idx]; - if (buf == nullptr) { - return -1; - } - - // Switch metadata buffer if full - if (buf->count >= PLATFORM_DUMP_RECORDS_PER_BUFFER) { - if (switch_dump_meta_buffer(thread_idx) != 0) { - return -1; // No free buffer - } - buf = s_current_dump_buf[thread_idx]; - if (buf == nullptr) { - return -1; - } - } // Reserve space in arena // Compute actual tensor data size from shape (not buffer.size which may include padding) @@ -436,6 +436,7 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info) { rec->ndims = info.ndims; rec->dtype = info.dtype; rec->truncated = truncated ? 
1 : 0; + rec->kind = static_cast(DumpRecordKind::TENSOR); rec->payload_offset = offset; rec->payload_size = copy_bytes; for (int d = 0; d < info.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { @@ -451,6 +452,61 @@ int dump_tensor_record(int thread_idx, const TensorDumpInfo &info) { return 0; } +int dump_args_record(int thread_idx, const ArgsDumpInfo &info) { + DumpMetaBuffer *buf = ensure_dump_record_slot(thread_idx); + DumpBufferState *state = + (thread_idx >= 0 && thread_idx < PLATFORM_MAX_AICPU_THREADS) ? s_dump_states[thread_idx] : nullptr; + if (state == nullptr || buf == nullptr) { + return -1; + } + + uint64_t payload_size = sizeof(ArgsDumpPayloadHeader) + + static_cast(info.tensor_count) * sizeof(ArgsDumpTensorEntry) + + static_cast(info.scalar_count) * sizeof(uint64_t); + if (payload_size > state->arena_size) { + account_dropped_records(state, 1); + return 0; + } + + uint64_t offset = state->arena_write_offset; + state->arena_write_offset = offset + payload_size; + + char *arena = reinterpret_cast(state->arena_base); + CircularArenaWriter writer = {arena, state->arena_size, offset, 0}; + ArgsDumpPayloadHeader header = {}; + header.version = ARGS_DUMP_PAYLOAD_VERSION; + header.tensor_count = info.tensor_count; + header.scalar_count = info.scalar_count; + header.tensor_entry_size = sizeof(ArgsDumpTensorEntry); + header.scalar_entry_size = sizeof(uint64_t); + writer.write(&header, sizeof(header)); + if (info.tensor_count > 0 && info.tensors != nullptr) { + writer.write(info.tensors, static_cast(info.tensor_count) * sizeof(ArgsDumpTensorEntry)); + } + if (info.scalar_count > 0 && info.scalars != nullptr) { + writer.write(info.scalars, static_cast(info.scalar_count) * sizeof(uint64_t)); + } + wmb(); + + uint32_t idx = buf->count; + TensorDumpRecord *rec = &buf->records[idx]; + memset(rec, 0, sizeof(*rec)); + rec->task_id = info.task_id; + rec->subtask_id = info.subtask_id; + rec->func_id = info.func_id; + rec->stage = static_cast(info.stage); + rec->kind = 
static_cast(DumpRecordKind::ARGS); + rec->payload_offset = offset; + rec->payload_size = payload_size; + rec->shapes[0] = info.tensor_count; + rec->shapes[1] = info.scalar_count; + buf->count = idx + 1; + wmb(); + + s_records_written[thread_idx]++; + return 0; +} + void dump_tensor_flush(int thread_idx) { if (s_dump_header == nullptr) { return; diff --git a/src/a5/platform/src/host/tensor_dump_collector.cpp b/src/a5/platform/src/host/tensor_dump_collector.cpp index 2e1698177..739a6fae7 100644 --- a/src/a5/platform/src/host/tensor_dump_collector.cpp +++ b/src/a5/platform/src/host/tensor_dump_collector.cpp @@ -236,6 +236,52 @@ void TensorDumpCollector::process_dump_buffer(const DumpReadyBufferInfo &info) { for (uint32_t i = 0; i < count; i++) { const TensorDumpRecord &rec = buf->records[i]; + if (static_cast(rec.kind) == DumpRecordKind::ARGS) { + DumpedArgs da; + da.task_id = rec.task_id; + da.subtask_id = rec.subtask_id; + da.func_id = rec.func_id; + da.stage = static_cast(rec.stage); + da.tensor_count = rec.shapes[0]; + da.scalar_count = rec.shapes[1]; + da.payload_size = rec.payload_size; + da.overwritten = false; + + if (thread_idx < static_cast(arenas_.size())) { + ArenaInfo &ai = arenas_[thread_idx]; + char *arena_host = reinterpret_cast(ai.host_ptr); + uint64_t arena_sz = ai.size; + uint64_t high_water = ai.high_water; + if (high_water > arena_sz && rec.payload_offset < high_water - arena_sz) { + da.overwritten = true; + if (++total_overwrite_count_ == 1) { + LOG_WARN( + "Dump args overwrite detected: host drain was slower than arena reuse. " + "Increase PLATFORM_DUMP_BUFFERS_PER_THREAD." 
+ ); + } + } + if (!da.overwritten && rec.payload_size > 0) { + da.bytes.resize(rec.payload_size); + uint64_t pos = rec.payload_offset % arena_sz; + if (pos + rec.payload_size <= arena_sz) { + memcpy(da.bytes.data(), arena_host + pos, rec.payload_size); + } else { + uint64_t first = arena_sz - pos; + memcpy(da.bytes.data(), arena_host + pos, first); + memcpy(da.bytes.data() + first, arena_host, rec.payload_size - first); + } + } + uint64_t end_offset = rec.payload_offset + rec.payload_size; + if (end_offset > ai.high_water) { + ai.high_water = end_offset; + } + } + + std::lock_guard lock(collected_mutex_); + collected_args_.push_back(std::move(da)); + continue; + } DumpedTensor dt; dt.task_id = rec.task_id; @@ -428,6 +474,40 @@ static std::string dims_to_string(const uint32_t dims[], int ndims) { return ss.str(); } +static ArgsDumpPayloadHeader read_args_payload_header(const DumpedArgs &da) { + ArgsDumpPayloadHeader header = {}; + if (da.bytes.size() >= sizeof(header)) { + memcpy(&header, da.bytes.data(), sizeof(header)); + } + return header; +} + +static bool read_args_tensor_entry(const DumpedArgs &da, const ArgsDumpPayloadHeader &header, uint32_t index, ArgsDumpTensorEntry *out) { + if (header.tensor_entry_size != sizeof(ArgsDumpTensorEntry)) { + return false; + } + size_t offset = sizeof(ArgsDumpPayloadHeader) + static_cast(index) * sizeof(ArgsDumpTensorEntry); + if (offset + sizeof(ArgsDumpTensorEntry) > da.bytes.size()) { + return false; + } + memcpy(out, da.bytes.data() + offset, sizeof(ArgsDumpTensorEntry)); + return true; +} + +static bool read_args_scalar(const DumpedArgs &da, const ArgsDumpPayloadHeader &header, uint32_t index, uint64_t *out) { + if (header.tensor_entry_size != sizeof(ArgsDumpTensorEntry) || header.scalar_entry_size != sizeof(uint64_t)) { + return false; + } + size_t offset = sizeof(ArgsDumpPayloadHeader) + + static_cast(header.tensor_count) * sizeof(ArgsDumpTensorEntry) + + static_cast(index) * sizeof(uint64_t); + if (offset + 
sizeof(uint64_t) > da.bytes.size()) { + return false; + } + memcpy(out, da.bytes.data() + offset, sizeof(uint64_t)); + return true; +} + static std::string get_dtype_name_from_raw(uint8_t dtype) { return get_dtype_name(static_cast(dtype)); } static uint64_t get_num_elements(const DumpedTensor &dt) { @@ -496,8 +576,8 @@ int TensorDumpCollector::export_dump_files() { ); } - if (collected_.empty()) { - LOG_WARN("No tensor dump data to export"); + if (collected_.empty() && collected_args_.empty()) { + LOG_WARN("No dump data to export"); writer_started_ = false; return 0; } @@ -511,8 +591,14 @@ int TensorDumpCollector::export_dump_files() { if (a.arg_index != b.arg_index) return a.arg_index < b.arg_index; return static_cast(a.role) < static_cast(b.role); }); + std::sort(collected_args_.begin(), collected_args_.end(), [](const DumpedArgs &a, const DumpedArgs &b) { + if (a.task_id != b.task_id) return a.task_id < b.task_id; + if (a.subtask_id != b.subtask_id) return a.subtask_id < b.subtask_id; + if (a.func_id != b.func_id) return a.func_id < b.func_id; + return static_cast(a.stage) < static_cast(b.stage); + }); - LOG_INFO_V0("Writing JSON manifest for %zu tensors...", collected_.size()); + LOG_INFO_V0("Writing JSON manifest for %zu tensors and %zu args records...", collected_.size(), collected_args_.size()); uint32_t num_before_dispatch = 0; uint32_t num_after_completion = 0; @@ -547,6 +633,7 @@ int TensorDumpCollector::export_dump_files() { json << " \"byte_order\": \"little_endian\"\n"; json << " },\n"; json << " \"total_tensors\": " << collected_.size() << ",\n"; + json << " \"total_args\": " << collected_args_.size() << ",\n"; json << " \"before_dispatch\": " << num_before_dispatch << ",\n"; json << " \"after_completion\": " << num_after_completion << ",\n"; json << " \"input_tensors\": " << num_input_tensors << ",\n"; @@ -583,6 +670,46 @@ int TensorDumpCollector::export_dump_files() { << ", \"overwritten\": " << (dt.overwritten ? 
"true" : "false") << "}"; } + json << "\n ],\n"; + json << " \"args\": [\n"; + + for (size_t i = 0; i < collected_args_.size(); i++) { + const DumpedArgs &da = collected_args_[i]; + ArgsDumpPayloadHeader header = read_args_payload_header(da); + if (i > 0) json << ",\n"; + json << " {\"task_id\": \"0x" << std::hex << std::setfill('0') << std::setw(16) << da.task_id << std::dec + << "\", \"subtask_id\": " << static_cast(da.subtask_id) << ", \"func_id\": " << da.func_id + << ", \"stage\": \"" << tensor_dump_stage_name(da.stage) << "\", \"tensor_count\": " << da.tensor_count + << ", \"scalar_count\": " << da.scalar_count << ", \"payload_size\": " << da.payload_size + << ", \"overwritten\": " << (da.overwritten ? "true" : "false") << ", \"tensors\": ["; + for (uint32_t t = 0; t < header.tensor_count; t++) { + ArgsDumpTensorEntry entry = {}; + if (!read_args_tensor_entry(da, header, t, &entry)) { + break; + } + if (t > 0) json << ", "; + json << "{\"arg_index\": " << t << ", \"buffer_addr\": \"0x" << std::hex << entry.buffer_addr << std::dec + << "\", \"buffer_size\": " << entry.buffer_size << ", \"owner_task_id\": \"0x" << std::hex + << entry.owner_task_id << std::dec << "\", \"dtype\": \"" + << get_dtype_name_from_raw(entry.dtype) << "\", \"shape\": " + << dims_to_string(entry.shapes, static_cast(entry.ndims)) << ", \"raw_shape\": " + << dims_to_string(entry.raw_shapes, static_cast(entry.ndims)) << ", \"offsets\": " + << dims_to_string(entry.offsets, static_cast(entry.ndims)) << ", \"is_contiguous\": " + << (entry.is_contiguous ? "true" : "false") << ", \"is_all_offset_zero\": " + << (entry.is_all_offset_zero ? 
"true" : "false") << "}"; + } + json << "], \"scalars\": ["; + for (uint32_t s = 0; s < header.scalar_count; s++) { + uint64_t value = 0; + if (!read_args_scalar(da, header, s, &value)) { + break; + } + if (s > 0) json << ", "; + json << "\"0x" << std::hex << value << std::dec << "\""; + } + json << "]}"; + } + json << "\n ]\n}\n"; json.close(); @@ -599,6 +726,8 @@ int TensorDumpCollector::export_dump_files() { // Clear state so subsequent runs don't accumulate data from previous runs collected_.clear(); + collected_args_.clear(); + processed_buffers_.clear(); total_dropped_record_count_ = 0; total_truncated_count_ = 0; total_overwrite_count_ = 0; @@ -683,6 +812,7 @@ int TensorDumpCollector::finalize(DumpUnregisterCallback unregister_cb, DumpFree // Reset state num_dump_threads_ = 0; collected_.clear(); + collected_args_.clear(); total_dropped_record_count_ = 0; total_truncated_count_ = 0; total_overwrite_count_ = 0; diff --git a/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp index 3341c556d..d633a871d 100644 --- a/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp @@ -163,6 +163,68 @@ collect_task_tensor_buffer_addrs(const Runtime &runtime, const Task &task, uint6 } return found; } + +static uint64_t tensor_info_nbytes(const TensorInfo &info) { + uint64_t elements = 1; + for (uint32_t d = 0; d < info.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { + elements *= info.shapes[d]; + } + return elements * get_element_size(info.dtype); +} + +static void dump_task_args_record( + int thread_idx, const Task &task, const CoreCallable &callable, const TensorInfo *tensor_info, + int tensor_info_count, const uint64_t *buffer_addrs, int buffer_count +) { + ArgsDumpTensorEntry tensor_entries[RUNTIME_MAX_ARGS] = {}; + uint64_t scalar_values[RUNTIME_MAX_ARGS] = {}; + int tensor_arg_index = 0; + int recorded_tensor_count = 0; + int scalar_count = 0; 
+ int sig_count = callable.sig_count(); + for (int32_t sig_idx = 0; sig_idx < sig_count && sig_idx < task.num_args; sig_idx++) { + ArgDirection dir = callable.sig(sig_idx); + if (dir == ArgDirection::SCALAR) { + scalar_values[scalar_count++] = task.args[sig_idx]; + continue; + } + if (tensor_arg_index < tensor_info_count && tensor_arg_index < buffer_count && tensor_arg_index < RUNTIME_MAX_ARGS) { + const TensorInfo &src = tensor_info[tensor_arg_index]; + ArgsDumpTensorEntry &entry = tensor_entries[tensor_arg_index]; + entry.buffer_addr = buffer_addrs[tensor_arg_index]; + entry.buffer_size = tensor_info_nbytes(src); + entry.owner_task_id = UINT64_MAX; + entry.ndims = src.ndims; + entry.dtype = static_cast(src.dtype); + entry.is_contiguous = 1; + entry.is_all_offset_zero = 1; + for (uint32_t d = 0; d < src.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { + entry.shapes[d] = src.shapes[d]; + entry.raw_shapes[d] = src.raw_shapes[d]; + entry.offsets[d] = src.offsets[d]; + if (src.shapes[d] != src.raw_shapes[d]) { + entry.is_contiguous = 0; + } + if (src.offsets[d] != 0) { + entry.is_all_offset_zero = 0; + } + } + recorded_tensor_count++; + } + tensor_arg_index++; + } + + ArgsDumpInfo info = {}; + info.task_id = static_cast(task.task_id); + info.subtask_id = 0; + info.stage = TensorDumpStage::BEFORE_DISPATCH; + info.func_id = static_cast(task.func_id); + info.tensor_count = static_cast(recorded_tensor_count); + info.scalar_count = static_cast(scalar_count); + info.tensors = tensor_entries; + info.scalars = scalar_values; + dump_args_record(thread_idx, info); +} #endif // ===== Helper Function Implementations ===== @@ -272,6 +334,10 @@ inline bool AicpuExecutor::try_dispatch_task( uint64_t tensor_buffer_addrs[RUNTIME_MAX_ARGS] = {}; int tensor_buffer_count = collect_task_tensor_buffer_addrs(*runtime_, *task, tensor_buffer_addrs, RUNTIME_MAX_ARGS); + dump_task_args_record( + thread_idx, *task, *callable, tensor_info, tensor_info_count, tensor_buffer_addrs, + tensor_buffer_count 
+ ); dump_tensors_for_task( thread_idx, static_cast(task_id), 0, task->num_args, task->func_id, *callable, tensor_info, tensor_info_count, tensor_buffer_addrs, tensor_buffer_count, diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index 9e3564ca5..e09ea7da8 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -34,7 +34,17 @@ namespace { inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256; + +int32_t first_active_subtask_slot(const PTO2TaskSlotState &slot_state) { + for (int32_t slot = 0; slot < PTO2_SUBTASK_SLOT_COUNT; slot++) { + if (slot_state.active_mask.subtask_active(static_cast(slot)) && + slot_state.task->kernel_id[slot] != INVALID_KERNEL_ID) { + return slot; + } + } + return -1; } +} // namespace const char *SchedulerContext::shape_name(PTO2ResourceShape shape) { switch (shape) { @@ -216,6 +226,13 @@ void SchedulerContext::dispatch_block( ) { #if PTO2_PROFILING if (is_dump_tensor_enabled()) { + int32_t args_slot = first_active_subtask_slot(slot_state); + if (args_slot >= 0) { + dump_args_for_payload( + thread_idx, slot_state.task->task_id.raw, static_cast(args_slot), + slot_state.task->kernel_id[args_slot], *slot_state.payload, TensorDumpStage::BEFORE_DISPATCH + ); + } dump_tensors_for_task( thread_idx, slot_state, TensorDumpStage::BEFORE_DISPATCH, [](ActiveMask active_mask, int raw_subtask_id) {