diff --git a/conftest.py b/conftest.py index bdefdc418..e3c25e285 100644 --- a/conftest.py +++ b/conftest.py @@ -163,6 +163,12 @@ def pytest_addoption(parser): help="Enable PMU collection. Bare flag = PIPE_UTILIZATION(2). " "Pass event type to override (e.g. --enable-pmu 4)", ) + parser.addoption( + "--enable-scope-stats", + action="store_true", + default=False, + help="Enable per-scope peak collection and emit /scope_stats.json (per-scope ring-fill peaks).", + ) parser.addoption("--build", action="store_true", default=False, help="Compile runtime from source") parser.addoption( "--pto-isa-commit", diff --git a/docs/dfx/scope-stats.md b/docs/dfx/scope-stats.md new file mode 100644 index 000000000..c418db54e --- /dev/null +++ b/docs/dfx/scope-stats.md @@ -0,0 +1,264 @@ +# Scope Stats — Per-scope Resource Usage Peaks + +## 1. Background & Motivation + +When a model runs out of task windows, heap, or dep/fanin pool entries, +the failure message tells you *which* resource is exhausted but not +*which scope* caused the peak. Without per-scope attribution, debugging +requires binary-searching the orchestration code to find the offending +scope — slow and error-prone. + +Scope stats captures the peak resource usage (heap bytes, task +in-flight, dep/fanin pool entries, tensormap entries) for every +`PTO2_SCOPE` region, so the output directly tells you which scope drove +each resource to its high-water mark. + +## 2. Overview + +- **One row per scope exit.** Peaks are sampled continuously inside the + scope and flushed to a shared buffer on `scope_end`. +- **Per-ring breakdown.** Each ring's task allocator heap/task-window + and dep/fanin pool are tracked independently. +- **JSON output.** A `scope_stats.json` lands under the per-task output + prefix with capacities in the header and per-scope records. +- **Runtime-gated.** Controlled by `--enable-scope-stats` (bit 4 of + `enable_profiling_flag`). When off, every probe is a single bool + load — no measurement overhead. 
+- **T&R runtime only.** See §6 for why. + +Enable in one line: + +```bash +python tests/st/<case>/test_<name>.py -p a2a3 -d 0 --enable-scope-stats +``` + +## 3. Architecture + +### 3.1 Layering + +Scope stats uses a clean platform-provides / runtime-calls pattern: + +```text +platform/include/aicpu/scope_stats_collector_aicpu.h + Pure-value API declarations. No runtime types cross this boundary. + +platform/src/aicpu/scope_stats_collector_aicpu.cpp + Owns all collector state (depth stack, peak arrays, shared buffer). + Implements scope lifecycle (on_begin/on_end), peak comparison logic, + capacity registration, and shared buffer record writes. + +runtime (pto_orchestrator.cpp, pto_scheduler.h) + Calls platform APIs at instrumentation points, passing extracted + values (ring_id, heap_bytes, tasks_in_flight, etc.) as plain + integers. No scope_stats source files in the runtime directory. +``` + +### 3.2 Platform API + +Header: +[`src/a2a3/platform/include/aicpu/scope_stats_collector_aicpu.h`](../../src/a2a3/platform/include/aicpu/scope_stats_collector_aicpu.h) + +All entry points are `extern "C"` and take primitive types only — no +runtime structs cross the boundary, so the same collector links into +any runtime that wants to wire it up. Symbol resolution is unconditional +(see §3.4), so callers do not need to guard the call sites. + +Single-producer contract: all `*_peaks` updates use non-atomic +read-max-write and assume the orchestrator thread is the only writer. +Concurrent callers may lose peaks silently — that is acceptable for +diagnostic data and saves an atomic on the hot path. + +#### Setter symbols (host → AICPU init) + +```cpp +void set_scope_stats_enabled(bool enable); +void set_platform_scope_stats_base(uint64_t scope_stats_data_base); +``` + +`kernel.cpp` calls both at kernel entry from `KernelArgs`. 
`enable` +mirrors the host's `--enable-scope-stats` flag; `scope_stats_data_base` +is the device-visible address of a `ScopeStatsBuffer` that the host allocates +during `init_scope_stats()`. When `enable=false` every probe early-returns +after one bool load — that is the off-cost. + +#### Capacity registration (runtime → AICPU init) + +```cpp +void scope_stats_set_ring_capacity(int ring_id, int32_t window_cap, + uint64_t heap_cap, int32_t dep_cap); +void scope_stats_set_tensormap_capacity(int32_t cap); +``` + +Called once per ring at orchestrator init / scheduler attach. Caps are +copied verbatim into the buffer header so the host JSON can render +`used/cap` ratios without a second device→host query. `ring_id` outside +`[0, PTO2_SCOPE_STATS_MAX_RING_DEPTH)` is silently dropped. + +#### Scope lifecycle (runtime → AICPU per-scope) + +```cpp +void scope_stats_set_pending_site(const char *file, int line); +void scope_stats_on_begin(); +void scope_stats_on_end(); +void scope_stats_on_fatal(); +``` + +`PTO2_SCOPE()` expansion calls `set_pending_site(__FILE__, __LINE__)` +immediately before `on_begin()` so the next `on_end()` can stamp the +record with the originating source location — the basename copy +(`copy_basename`) keeps the JSON readable without forcing the host to chase +a device pointer into the orchestration `.so`'s string table. `on_fatal` +sets `header.fatal_latched`, which surfaces as `"fatal": true` in the +JSON; the host treats that as "the run was diagnostic-only past this +point" but still emits whatever records made it. + +#### Peak updates (runtime → AICPU on resource touch) + +```cpp +void scope_stats_update_allocator_peaks(int ring_id, uint64_t heap_bytes, + int32_t tasks_in_flight); +void scope_stats_update_tensormap_peak(int32_t tensormap_used); +void scope_stats_update_pool_peaks(int ring_id, int32_t fanin_used, + int32_t dep_used); +``` + +Called after each allocation or pool change. 
Each update walks +`d ∈ [0, scope_stats_depth]` so peaks bubble up: an alloc spike inside +an inner scope shows in both the inner and outer record. `ring_id` out +of range is dropped silently (same clamp as capacity registration). + +#### Initial-sample callback (optional) + +```cpp +typedef void (*ScopeStatsInitialSampleFn)(int32_t depth); +void scope_stats_register_initial_sampler(ScopeStatsInitialSampleFn fn); +``` + +If the runtime registers a sampler, `on_begin` calls it instead of +zero-clearing the new depth's peak arrays. Used to inherit the current +in-flight resource snapshot at scope entry so the inner-scope peak is +relative to "what was already in flight when we entered", not zero. +Without it, the per-depth peak arrays are zeroed at each `on_begin`. + +### 3.3 Comparison with other profiling subsystems + +| Feature | Layer | Runtime scope | Why | +| ------- | ----- | ------------- | --- | +| PMU | platform only | all runtimes | reads hardware registers (platform) | +| L2 swimlane | platform only | all runtimes | reads AICore ring buffers (platform) | +| dep_gen | platform only | all runtimes | traces `submit_task` (runtime-agnostic) | +| tensor dump | platform only | all runtimes | dumps tensor data (platform) | +| **scope stats** | **platform API + runtime call sites** | **T&R only** | runtime extracts values, platform tracks peaks | + +### 3.4 Symbol resolution flow + +```text +kernel.cpp (platform, shared by all runtimes) + ├── set_scope_stats_enabled(flag) + └── set_platform_scope_stats_base(addr) + +For host_build_graph AICPU .so: + kernel.cpp ──links──> platform collector + → symbols resolve, .so loads, scope_stats is enabled but + no runtime call sites invoke update APIs → no records + +For tensormap_and_ringbuffer AICPU .so: + kernel.cpp ──links──> platform collector + runtime call sites invoke update/capacity APIs + → full peak tracking active +``` + +## 4. 
Data Flow + +```text +Host AICPU (T&R runtime) +───── ───────────────────── +ScopeStatsCollector platform scope_stats_collector.cpp + allocate ScopeStatsBuffer set_platform_scope_stats_base(addr) + set kernel_args fields set_scope_stats_enabled(true) + launch kernel runtime: scope_stats_set_ring_capacity() + │ runtime: scope_stats_set_tensormap_capacity() + │ │ + │ on PTO2_SCOPE begin: + │ scope_stats_on_begin() + │ runtime: scope_stats_update_*_peaks() + │ on alloc/pool change: + │ runtime: scope_stats_update_*_peaks() + │ on PTO2_SCOPE end: + │ scope_stats_on_end() + │ └─ write record to ScopeStatsBuffer + │ │ + stream sync kernel exit + read ScopeStatsBuffer + emit scope_stats.json +``` + +## 5. Output: `scope_stats.json` + +The host emits `/scope_stats.json` at finalize, after +the device stream is synced. Schema (version 2): + +```json +{ + "version": 2, + "fatal": false, + "write_count": 2, + "cap": 16384, + "dropped": 0, + "records": [ + {"site": "example_orchestration.cpp:77", "depth": 1, + "task_window": ["0/16384", "4/16384", "0/16384", "0/16384"], + "heap": ["0/268435456", "8192/268435456", "0/268435456", "0/268435456"], + "dep": ["0/16384", "5/16384", "0/16384", "0/16384"], + "fanin_used": [0, 3, 0, 0], + "tensormap": "5/65536"}, + {"site": "kernel.cpp:80", "depth": 0, + "task_window": ["1/16384", "4/16384", "0/16384", "0/16384"], + "heap": ["4096/268435456", "8192/268435456", "0/268435456", "0/268435456"], + "dep": ["1/16384", "5/16384", "0/16384", "0/16384"], + "fanin_used": [0, 3, 0, 0], + "tensormap": "5/65536"} + ] +} +``` + +Top-level fields: + +| Field | Type | Meaning | +| ----- | ---- | ------- | +| `version` | int | Always `2` | +| `fatal` | bool | `true` iff `scope_stats_on_fatal()` fired during the run | +| `write_count` | uint64 | Total `scope_end` events observed (incl. 
dropped) | +| `cap` | uint32 | Ring capacity, `PTO2_SCOPE_STATS_LOG_CAP` (16384) | +| `dropped` | uint64 | `max(write_count - cap, 0)` — overflow count | +| `records` | array | Up to `min(cap, write_count)` records, oldest-first | + +Per-record fields: + +| Field | Type | Description | +| ----- | ---- | ----------- | +| `site` | `"basename:line"` | Source location of the `PTO2_SCOPE()` call | +| `depth` | int | Nesting depth (0 = root scope inside the executor) | +| `task_window[ring]` | `"used/cap"` | Peak task-window slots in use | +| `heap[ring]` | `"used/cap"` | Peak per-ring heap bytes in use | +| `dep[ring]` | `"used/cap"` | Peak dep-pool entries in use | +| `fanin_used[ring]` | int32 | Peak fanin-pool entries in use (capacity isn't currently carried — fanin reservation is implicit in dep accounting) | +| `tensormap` | `"used/cap"` | Peak tensormap entries in use | + +The `cap` denominators come from `scope_stats_set_ring_capacity` / +`scope_stats_set_tensormap_capacity` snapshots, so they always reflect +the values the runtime actually configured for that run. + +A worked example is in +[`tests/st/a2a3/tensormap_and_ringbuffer/dfx/scope_stats/test_scope_stats.py`](../../tests/st/a2a3/tensormap_and_ringbuffer/dfx/scope_stats/test_scope_stats.py) +— it runs the `vector_example` orchestration with `--enable-scope-stats` +and asserts the resulting JSON for the depth=0 / depth=1 records the +outer-executor + inner `PTO2_SCOPE` produce. + +## 6. Future: Cross-runtime Support + +If host_build_graph adds scope-like concepts in the future, extending +scope_stats only requires adding the same platform API call sites in +the HBG runtime — no platform changes needed. The platform collector +is already runtime-agnostic; it accepts plain values and has no +knowledge of T&R types. 
diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index 7aa251db2..360c51989 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -618,6 +618,15 @@ NB_MODULE(_task_interface, m) { c.enable_dep_gen = v ? 1 : 0; } ) + .def_prop_rw( + "enable_scope_stats", + [](const CallConfig &c) { + return static_cast(c.enable_scope_stats); + }, + [](CallConfig &c, bool v) { + c.enable_scope_stats = v ? 1 : 0; + } + ) .def_prop_rw( "output_prefix", [](const CallConfig &c) -> std::string { @@ -639,7 +648,8 @@ NB_MODULE(_task_interface, m) { os << "CallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num << ", enable_l2_swimlane=" << self.enable_l2_swimlane << ", enable_dump_tensor=" << (self.enable_dump_tensor ? "True" : "False") - << ", enable_pmu=" << self.enable_pmu << ", enable_dep_gen=" << (self.enable_dep_gen ? "True" : "False"); + << ", enable_pmu=" << self.enable_pmu << ", enable_dep_gen=" << (self.enable_dep_gen ? "True" : "False") + << ", enable_scope_stats=" << (self.enable_scope_stats ? "True" : "False"); if (self.output_prefix_set()) { os << ", output_prefix='" << self.output_prefix << "'"; } diff --git a/python/simpler/worker.py b/python/simpler/worker.py index e4956e708..64274d35d 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -112,11 +112,11 @@ def my_l4_orch(orch, args, config): _OFF_CALLABLE = 8 _OFF_CONFIG = 16 # Packed CallConfig wire layout — must match call_config.h byte for byte: -# 6 int32 (block_dim, aicpu_thread_num, enable_l2_swimlane, enable_dump_tensor, -# enable_pmu, enable_dep_gen) + 1024-byte NUL-terminated output_prefix. Log -# config travels separately via ChipWorker.init(log_level, log_info_v) — not -# on per-task wire. 
-_CFG_FMT = struct.Struct("=iiiiii1024s") +# 7 int32 (block_dim, aicpu_thread_num, enable_l2_swimlane, enable_dump_tensor, +# enable_pmu, enable_dep_gen, enable_scope_stats) + 1024-byte NUL-terminated +# output_prefix. Log config travels separately via ChipWorker.init(log_level, +# log_info_v) — not on per-task wire. +_CFG_FMT = struct.Struct("=iiiiiii1024s") # Args region starts after CONFIG, rounded up to 8 bytes so the first # ContinuousTensor.data (uint64_t at OFF_ARGS+8) is 8-byte aligned, avoiding # SIGBUS on strict-alignment platforms (aarch64 atomics, some ARM cores). @@ -718,7 +718,7 @@ def _chip_process_loop( def _read_config_from_mailbox(buf: memoryview) -> "CallConfig": """Reconstruct a CallConfig from the unified mailbox layout.""" - block_dim, aicpu_tn, swl, dt, pmu, dep_gen, prefix_bytes = _CFG_FMT.unpack_from(buf, _OFF_CONFIG) + block_dim, aicpu_tn, swl, dt, pmu, dep_gen, scope_stats, prefix_bytes = _CFG_FMT.unpack_from(buf, _OFF_CONFIG) cfg = CallConfig() cfg.block_dim = block_dim cfg.aicpu_thread_num = aicpu_tn @@ -726,6 +726,7 @@ def _read_config_from_mailbox(buf: memoryview) -> "CallConfig": cfg.enable_dump_tensor = bool(dt) cfg.enable_pmu = pmu cfg.enable_dep_gen = bool(dep_gen) + cfg.enable_scope_stats = bool(scope_stats) # NUL-terminated C string in a 1024-byte field. cfg.output_prefix = prefix_bytes.split(b"\x00", 1)[0].decode("utf-8") return cfg diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py index f179b5cba..04ec7ed1e 100644 --- a/simpler_setup/scene_test.py +++ b/simpler_setup/scene_test.py @@ -652,6 +652,7 @@ def run_class_cases( # noqa: PLR0913 -- shared layer-5 entry; kwargs mirror CLI enable_dump_tensor, enable_pmu, enable_dep_gen, + enable_scope_stats, ): """Execute a pre-filtered list of cases for one class (layers 5-6). 
@@ -661,11 +662,14 @@ def run_class_cases( # noqa: PLR0913 -- shared layer-5 entry; kwargs mirror CLI """ cls_name = type(cls_inst).__name__ callable_spec = getattr(type(cls_inst), "CALLABLE", None) - diagnostics_on = enable_l2_swimlane or enable_dump_tensor or enable_pmu or enable_dep_gen + diagnostics_on = enable_l2_swimlane or enable_dump_tensor or enable_pmu or enable_dep_gen or enable_scope_stats for case in cases: case_label = f"{cls_name}_{case['name']}" # Per-case directory the runtime writes into. Required (non-empty) when # any diagnostic flag is on; CallConfig::validate() throws otherwise. + # scope_stats now writes /scope_stats.json (sibling of + # l2_perf_records.json / deps.json), so it pulls output_prefix the + # same way the other DFX flags do. prefix = _build_output_prefix(case_label) if diagnostics_on else Path("") try: cls_inst._run_and_validate( @@ -679,6 +683,7 @@ def run_class_cases( # noqa: PLR0913 -- shared layer-5 entry; kwargs mirror CLI enable_dump_tensor=enable_dump_tensor, enable_pmu=enable_pmu, enable_dep_gen=enable_dep_gen, + enable_scope_stats=enable_scope_stats, output_prefix=str(prefix) if diagnostics_on else "", ) finally: @@ -851,6 +856,7 @@ def _build_config( enable_dump_tensor=False, enable_pmu=0, enable_dep_gen=False, + enable_scope_stats=False, *, output_prefix="", ): @@ -867,6 +873,7 @@ def _build_config( config.enable_dump_tensor = enable_dump_tensor config.enable_pmu = enable_pmu # 0=disabled, >0=enabled with event type config.enable_dep_gen = enable_dep_gen + config.enable_scope_stats = enable_scope_stats # `output_prefix` is required by CallConfig::validate() whenever any # diagnostic flag is enabled. Caller threads it down from the per-case # directory built by _build_output_prefix(). 
@@ -903,6 +910,7 @@ def _run_and_validate( # noqa: PLR0913 -- threads CLI diagnostic flags + case c enable_dump_tensor=False, enable_pmu=0, enable_dep_gen=False, + enable_scope_stats=False, output_prefix="", ): if self._st_level == 2: @@ -916,6 +924,7 @@ def _run_and_validate( # noqa: PLR0913 -- threads CLI diagnostic flags + case c enable_dump_tensor=enable_dump_tensor, enable_pmu=enable_pmu, enable_dep_gen=enable_dep_gen, + enable_scope_stats=enable_scope_stats, output_prefix=output_prefix, ) elif self._st_level == 3: @@ -930,10 +939,11 @@ def _run_and_validate( # noqa: PLR0913 -- threads CLI diagnostic flags + case c enable_dump_tensor=enable_dump_tensor, enable_pmu=enable_pmu, enable_dep_gen=enable_dep_gen, + enable_scope_stats=enable_scope_stats, output_prefix=output_prefix, ) - def _run_and_validate_l2( + def _run_and_validate_l2( # noqa: PLR0913 -- threads CLI diagnostic flags + case context self, worker, callable_obj, @@ -944,6 +954,7 @@ def _run_and_validate_l2( enable_dump_tensor=False, enable_pmu=0, enable_dep_gen=False, + enable_scope_stats=False, output_prefix="", ): params = case.get("params", {}) @@ -993,6 +1004,7 @@ def _run_and_validate_l2( enable_dump_tensor=enable_dump_tensor, enable_pmu=enable_pmu, enable_dep_gen=enable_dep_gen, + enable_scope_stats=enable_scope_stats, output_prefix=output_prefix, ) @@ -1019,6 +1031,7 @@ def _run_and_validate_l3( # noqa: PLR0913 -- threads CLI diagnostic flags + L3 enable_dump_tensor=False, enable_pmu=0, enable_dep_gen=False, + enable_scope_stats=False, output_prefix="", ): # Defensive belt-and-braces: the pytest dispatcher and run_module both @@ -1073,6 +1086,7 @@ def _run_and_validate_l3( # noqa: PLR0913 -- threads CLI diagnostic flags + L3 enable_dump_tensor=enable_dump_tensor, enable_pmu=enable_pmu, enable_dep_gen=enable_dep_gen, + enable_scope_stats=enable_scope_stats, output_prefix=output_prefix, ) @@ -1126,6 +1140,7 @@ def test_run(self, st_platform, st_worker, request): enable_dump_tensor = 
request.config.getoption("--dump-tensor", default=False) enable_pmu = request.config.getoption("--enable-pmu", default=0) enable_dep_gen = self._effective_enable_dep_gen(request, warn=True) + enable_scope_stats = request.config.getoption("--enable-scope-stats", default=False) if rounds > 1: if enable_l2_swimlane: logger.warning("Profiling disabled: --rounds > 1") @@ -1176,6 +1191,7 @@ def test_run(self, st_platform, st_worker, request): enable_dump_tensor=enable_dump_tensor, enable_pmu=enable_pmu, enable_dep_gen=enable_dep_gen, + enable_scope_stats=enable_scope_stats, ) # ------------------------------------------------------------------ @@ -1243,6 +1259,13 @@ def run_module(module_name): # noqa: PLR0912, PLR0915 -- CLI parsing + dispatch help="Enable PMU collection. Bare flag = PIPE_UTILIZATION(2). " "Pass event type to override (e.g. --enable-pmu 4)", ) + parser.add_argument( + "--enable-scope-stats", + action="store_true", + default=False, + help="Enable per-scope peak collection and emit /scope_stats.json " + "(per-scope ring-fill peaks).", + ) parser.add_argument("--build", action="store_true", help="Compile runtime from source") parser.add_argument( "--runtime", @@ -1433,6 +1456,7 @@ def run_module(module_name): # noqa: PLR0912, PLR0915 -- CLI parsing + dispatch enable_dump_tensor=args.dump_tensor, enable_pmu=args.enable_pmu, enable_dep_gen=args.enable_dep_gen, + enable_scope_stats=args.enable_scope_stats, ) print("PASSED") except Exception as e: # noqa: BLE001 @@ -1472,6 +1496,8 @@ def _dispatch_test_phases_standalone(module_name, selected_by_cls, args): # noq common.append("--dump-tensor") if args.enable_dep_gen: common.append("--enable-dep-gen") + if args.enable_scope_stats: + common.append("--enable-scope-stats") if args.build: common.append("--build") diff --git a/src/a2a3/platform/include/aicpu/scope_stats_collector_aicpu.h b/src/a2a3/platform/include/aicpu/scope_stats_collector_aicpu.h new file mode 100644 index 000000000..f3d944165 --- /dev/null +++ 
b/src/a2a3/platform/include/aicpu/scope_stats_collector_aicpu.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#pragma once + +#include + +#include "common/scope_stats_buffer.h" + +// Scope-stats collector — platform-owned, runtime-agnostic. +// +// Platform owns all collector state and peak-tracking logic. Runtime calls +// pure-value APIs to report resource usage; no runtime types cross the +// boundary. +// +// Setter symbols (set_scope_stats_enabled, set_platform_scope_stats_base) +// are exported unconditionally so the host-side sim DeviceRunner's dlsym +// always resolves. 
+ +extern "C" { + +// --- Scope lifecycle probes (called by orchestrator begin_scope/end_scope) --- + +void scope_stats_on_begin(); +void scope_stats_on_end(); +void scope_stats_on_fatal(); + +// --- Site tracking --- + +void scope_stats_set_pending_site(const char *file, int line); + +// --- Setter symbols (always exported) --- + +void set_scope_stats_enabled(bool enable); +void set_platform_scope_stats_base(uint64_t scope_stats_data_base); + +// --- Initial sampling callback --- + +typedef void (*ScopeStatsInitialSampleFn)(int32_t depth); +void scope_stats_register_initial_sampler(ScopeStatsInitialSampleFn fn); + +// --- Pure-value peak update APIs (called by runtime at instrumentation points) --- +// Single-producer assumption: peak updates use non-atomic read-max-write. +// Safe when the orchestrator is single-threaded; concurrent callers may +// lose peaks silently (acceptable for diagnostic data). + +void scope_stats_update_allocator_peaks(int ring_id, uint64_t heap_bytes, int32_t tasks_in_flight); +void scope_stats_update_tensormap_peak(int32_t tensormap_used); +void scope_stats_update_pool_peaks(int ring_id, int32_t fanin_used, int32_t dep_used); + +// --- Capacity registration (called by runtime at init) --- + +void scope_stats_set_ring_capacity(int ring_id, int32_t window_cap, uint64_t heap_cap, int32_t dep_cap); +void scope_stats_set_tensormap_capacity(int32_t cap); + +} // extern "C" diff --git a/src/a2a3/platform/include/common/kernel_args.h b/src/a2a3/platform/include/common/kernel_args.h index 99e72fdea..59c1d68be 100644 --- a/src/a2a3/platform/include/common/kernel_args.h +++ b/src/a2a3/platform/include/common/kernel_args.h @@ -88,12 +88,15 @@ struct KernelArgs { uint64_t pmu_data_base{0}; // PMU shared memory base address; use explicit flags to detect enablement uint64_t pmu_reg_addrs{0}; // Per-core PMU MMIO register base address array (onboard only; 0 on sim) uint64_t dep_gen_data_base{0}; // dep_gen shared memory base address; use explicit flags 
to detect enablement + uint64_t scope_stats_data_base{0}; // ScopeStatsBuffer shared memory base; 0 when scope_stats is off. + // Allocated by host's ScopeStatsCollector, read+written by AICPU's + // scope_stats_collector via set_platform_scope_stats_base. uint64_t aicore_ring_addr{0}; // Device ptr to a uint64_t[num_aicore] table holding each core's // L2PerfAicoreRing address. AICore kernel entry indexes by block_idx // and forwards into platform set/get state. 0 when L2 swimlane is off. uint32_t log_level{1}; // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL uint32_t log_info_v{5}; // INFO verbosity threshold (0..9); default V5 - uint32_t enable_profiling_flag{0}; // Profiling umbrella bitmask; bit0=dump_tensor, bit1=l2_swimlane, bit2=pmu + uint32_t enable_profiling_flag{0}; // Profiling umbrella bitmask; dump_tensor|l2_swimlane|pmu|dep_gen|scope_stats uint32_t _pad{0}; // Alignment padding // Device pointer to an 8-byte buffer that the platform AICPU entry writes diff --git a/src/a2a3/platform/include/common/platform_config.h b/src/a2a3/platform/include/common/platform_config.h index 2bb14cb9c..53858e89b 100644 --- a/src/a2a3/platform/include/common/platform_config.h +++ b/src/a2a3/platform/include/common/platform_config.h @@ -177,6 +177,7 @@ inline double cycles_to_us(uint64_t cycles) { #define PROFILING_FLAG_L2_SWIMLANE (1u << 1) #define PROFILING_FLAG_PMU (1u << 2) #define PROFILING_FLAG_DEP_GEN (1u << 3) +#define PROFILING_FLAG_SCOPE_STATS (1u << 4) #define GET_PROFILING_FLAG(flags, bit) ((((uint32_t)(flags)) & ((uint32_t)(bit))) != 0u) #define SET_PROFILING_FLAG(flags, bit) ((flags) |= (uint32_t)(bit)) #define CLEAR_PROFILING_FLAG(flags, bit) ((flags) &= ~((uint32_t)(bit))) diff --git a/src/a2a3/platform/include/common/scope_stats_buffer.h b/src/a2a3/platform/include/common/scope_stats_buffer.h new file mode 100644 index 000000000..7780fc333 --- /dev/null +++ b/src/a2a3/platform/include/common/scope_stats_buffer.h @@ -0,0 +1,94 @@ +/* + * 
Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef PLATFORM_COMMON_SCOPE_STATS_BUFFER_H_ +#define PLATFORM_COMMON_SCOPE_STATS_BUFFER_H_ + +#include + +// Layout shared between AICPU writer (scope_stats_collector_aicpu.cpp on +// device) and the host reader (ScopeStatsHostBuffer on host). The whole block lives in a +// host-allocated device-visible memory region; AICPU mutates `header.write_count` +// and `records[i]` during the run, host snapshots both after the run to write +// `<output_prefix>/scope_stats.json`. +// +// Hot-path semantics: AICPU appends one record per scope_end into the ring +// using `idx = header.write_count % header.cap`, then increments +// `write_count`. No locking — single-producer (orchestrator thread) / +// single-consumer (host post-run). Host never reads while AICPU writes. +// +// Capacity (PTO2_SCOPE_STATS_LOG_CAP) is fixed at build time so the layout is +// stable across host/device builds. 16 384 records × 136 B ≈ 2.2 MB; the +// host opts in via `--enable-scope-stats` and the allocation is skipped when +// the flag is off. + +#define PTO2_SCOPE_STATS_LOG_CAP 16384u +#define PTO2_SCOPE_STATS_MAX_RING_DEPTH 4 +#define PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH 64 + +#ifdef __cplusplus +extern "C" { +#endif + +// One record per scope_end. 
Layout MUST stay in sync with the device-side +// writer in platform/src/aicpu/scope_stats_collector_aicpu.cpp. +struct ScopeStatsRecord { + uint64_t site_file_addr; // device-side const char *; for diagnostics the host + // only logs the raw pointer (string table lives in + // the orchestration .so, not in shared memory). + // AICPU also writes a basename copy into site_file_basename. + char site_file_basename[32]; // NUL-terminated basename of site_file, captured + // at append time so the host JSON contains a + // human-readable path without dereferencing a + // device pointer. + int32_t site_line; + int16_t depth; + int16_t _pad0; + uint64_t heap_bytes[PTO2_SCOPE_STATS_MAX_RING_DEPTH]; + int32_t task_in_flight[PTO2_SCOPE_STATS_MAX_RING_DEPTH]; + int32_t dep_used[PTO2_SCOPE_STATS_MAX_RING_DEPTH]; + int32_t fanin_used[PTO2_SCOPE_STATS_MAX_RING_DEPTH]; + int32_t tensormap_used; + int32_t _pad1; +}; + +struct ScopeStatsHeader { + uint64_t write_count; // Total append count. write_count > cap means the + // ring wrapped; host reports `dropped = write_count - cap` + // and emits `min(cap, write_count)` records starting + // from `(write_count - kept) % cap`. + uint32_t cap; // Fixed at PTO2_SCOPE_STATS_LOG_CAP; copied in by host + // at init so device and host see the same value + // without needing a separate sync. + uint32_t fatal_latched; // AICPU sets to 1 on first fatal. Host uses this + // to stamp the JSON `fatal` field — no separate + // device→host channel needed. + // Per-ring capacities — snapshotted by AICPU once at scope_stats_bind + // (constant for the run, so writing them once is fine). Host needs them + // to render the "used/cap" ratio in JSON without re-introducing a + // separate device→host query. 
+ int32_t task_window_cap[PTO2_SCOPE_STATS_MAX_RING_DEPTH]; + uint64_t heap_cap[PTO2_SCOPE_STATS_MAX_RING_DEPTH]; + int32_t dep_cap[PTO2_SCOPE_STATS_MAX_RING_DEPTH]; + int32_t tensormap_cap; + int32_t _pad; +}; + +struct ScopeStatsBuffer { + ScopeStatsHeader header; + ScopeStatsRecord records[PTO2_SCOPE_STATS_LOG_CAP]; +}; + +#ifdef __cplusplus +} +#endif + +#endif // PLATFORM_COMMON_SCOPE_STATS_BUFFER_H_ diff --git a/src/a2a3/platform/include/host/scope_stats_dump.h b/src/a2a3/platform/include/host/scope_stats_dump.h new file mode 100644 index 000000000..af586b83b --- /dev/null +++ b/src/a2a3/platform/include/host/scope_stats_dump.h @@ -0,0 +1,305 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_A2A3_PLATFORM_INCLUDE_HOST_SCOPE_STATS_DUMP_H_ +#define SRC_A2A3_PLATFORM_INCLUDE_HOST_SCOPE_STATS_DUMP_H_ + +#include +#include +#include +#include +#include +#include + +#include "common/scope_stats_buffer.h" +#include "common/unified_log.h" +#include "host/profiling_common/profiler_base.h" + +// Header-only host-side helper for scope_stats. 
Intentionally NOT modeled on +// L2PerfCollector / DepGenCollector — scope_stats is a single end-of-run +// snapshot (no streaming, no mgmt thread, no reconcile), so this whole +// feature collapses into one class with three short methods. Keeping every +// host-side scope_stats line in one file isolates the feature from the +// general device_runner flow: hooking the feature on adds three call sites +// (init / dump / finalize) and zero protocol details to device_runner. +// +// AICPU side is symmetric: layout in `scope_stats_buffer.h`, device probes / +// writer in `platform/src/aicpu/scope_stats_collector_aicpu.cpp`. The shared +// layout header is the only file both sides include. + +// Memory callbacks — thin aliases for the canonical profiling_common shapes +// (same pattern as dep_gen_collector / l2_perf_collector / pmu_collector / +// tensor_dump_collector). Three init modes: +// - a2a3 onboard: pass register_cb (halHostRegister maps device→host). +// - a5 onboard: pass copy_from_device_cb (no halHostRegister — host +// shadow is allocated separately and refreshed at dump +// time via rtMemcpy DEVICE_TO_HOST). +// - sim: pass neither; sim is single-address-space, host_ptr +// aliases device_ptr directly. +using ScopeStatsAllocCallback = profiling_common::ProfAllocCallback; +using ScopeStatsRegisterCallback = profiling_common::ProfRegisterCallback; +using ScopeStatsUnregisterCallback = profiling_common::ProfUnregisterCallback; +using ScopeStatsFreeCallback = profiling_common::ProfFreeCallback; +using ScopeStatsCopyFromDeviceCallback = std::function; + +// BufferPoolManager template stub. ScopeStatsHostBuffer drives the manager +// purely for its single-buffer alloc_and_register / free_buffer methods (the +// canonical dev↔host shared-memory setup path on a2a3); the streaming +// ready/done queue side is never touched, so the layout-trait aliases below +// are unused placeholders required only by the manager's static checks. 
+struct ScopeStatsModule { + using DataHeader = int; + using ReadyEntry = int; + using ReadyBufferInfo = int; + using FreeQueue = int; + static constexpr int kBufferKinds = 1; +}; + +class ScopeStatsHostBuffer { +public: + // Allocate the device-side buffer and set up the host-side view. Returns + // 0 on success; on failure the object stays uninitialized and every + // other method is a no-op, so callers can chain without guarding. + // + // Exactly one of `register_cb` / `copy_from_device_cb` should be set on + // hardware (a2a3 vs a5 respectively); pass both null on sim. + int init( + const ScopeStatsAllocCallback &alloc_cb, ScopeStatsRegisterCallback register_cb, + const ScopeStatsFreeCallback &free_cb, const ScopeStatsCopyFromDeviceCallback ©_from_device_cb, + int device_id + ) { + device_id_ = device_id; + copy_from_device_cb_ = copy_from_device_cb; + const std::size_t bytes = sizeof(ScopeStatsBuffer); + + if (copy_from_device_cb_) { + // a5 onboard — no halHostRegister; allocate device + host shadow + // separately. ``dump()`` refreshes the shadow via rtMemcpy + // DEVICE_TO_HOST. The AICPU writer zeros the device header itself + // in set_platform_scope_stats_base. + dev_ptr_ = alloc_cb(bytes); + if (dev_ptr_ == nullptr) { + LOG_ERROR("scope_stats: failed to alloc %zu bytes", bytes); + return -1; + } + host_ptr_ = std::malloc(bytes); + if (host_ptr_ == nullptr) { + LOG_ERROR("scope_stats: failed to alloc host shadow %zu bytes", bytes); + if (free_cb) free_cb(dev_ptr_); + dev_ptr_ = nullptr; + return -1; + } + std::memset(host_ptr_, 0, bytes); + host_shadow_owned_ = true; + } else { + // a2a3 onboard (halHostRegister) and sim (identity) both go + // through profiling_common::BufferPoolManager so this collector's + // dev↔host setup matches every pool-based collector's. Manager is + // used only for alloc_and_register / free_buffer — its streaming + // pool/queue side stays untouched. 
+ profiling_common::MemoryOps ops; + ops.alloc = alloc_cb; + ops.free_ = free_cb; + if (register_cb != nullptr) { + ops.reg = register_cb; + } else { + // Sim — single address space; install an identity wrapper so + // alloc_and_register has a uniform code path (mirrors what + // ProfilerBase::start does for sim collectors). + ops.reg = [](void *dev, std::size_t /*size*/, int /*device_id*/, void **host_ptr_out) { + *host_ptr_out = dev; + return 0; + }; + } + manager_.set_memory_context(std::move(ops), /*shared_mem_host=*/nullptr, device_id); + dev_ptr_ = manager_.alloc_and_register(bytes, &host_ptr_); + if (dev_ptr_ == nullptr) { + return -1; + } + std::memset(host_ptr_, 0, bytes); + host_shadow_owned_ = false; + } + initialized_ = true; + return 0; + } + + bool is_initialized() const { return initialized_; } + + void *device_ptr() const { return dev_ptr_; } + + // Snapshot the shared region as JSON at /scope_stats.json. + // Assumes the device stream has already been synced (matches dep_gen / + // l2_perf export ordering), so AICPU writes are fully visible. + int dump(const std::string &output_dir) const { + if (!initialized_ || host_ptr_ == nullptr) return 0; + if (host_shadow_owned_ && copy_from_device_cb_) { + // a5 onboard: refresh the host shadow before reading. + int rc = copy_from_device_cb_(host_ptr_, dev_ptr_, sizeof(ScopeStatsBuffer)); + if (rc != 0) { + LOG_ERROR("scope_stats: copy_from_device failed: %d", rc); + return rc; + } + } + const std::string path = make_path(output_dir); + const auto *buf = static_cast(host_ptr_); + return write_json(buf, path); + } + + void finalize(ScopeStatsUnregisterCallback unregister_cb, const ScopeStatsFreeCallback &free_cb) { + if (!initialized_) return; + if (host_shadow_owned_) { + // a5 path — manual cleanup; the BufferPoolManager was never set up + // because halHostRegister isn't available on a5. 
+ if (dev_ptr_ != nullptr && free_cb) free_cb(dev_ptr_); + if (host_ptr_ != nullptr) std::free(host_ptr_); + } else { + // a2a3 / sim — undo manager_.alloc_and_register: unregister the + // halHostRegister mapping (BufferPoolManager does not own that + // side), then let the manager free the device buffer and erase + // its dev→host map entry. + if (dev_ptr_ != nullptr && unregister_cb != nullptr) { + int rc = unregister_cb(dev_ptr_, device_id_); + if (rc != 0) { + LOG_ERROR("scope_stats: halHostUnregister failed: %d", rc); + } + } + manager_.free_buffer(dev_ptr_); + } + dev_ptr_ = nullptr; + host_ptr_ = nullptr; + host_shadow_owned_ = false; + copy_from_device_cb_ = nullptr; + initialized_ = false; + } + +private: + static std::string make_path(const std::string &output_dir) { + std::filesystem::path dir(output_dir); + std::error_code ec; + std::filesystem::create_directories(dir, ec); + if (ec) { + LOG_WARN("scope_stats: failed to create output dir %s: %s", output_dir.c_str(), ec.message().c_str()); + } + return (dir / "scope_stats.json").string(); + } + + // Schema (version 2) — flat, not Chrome-trace, because scope_stats is a + // list of per-scope_end snapshots, not a timeline. Each metric is rendered + // as a `"used/cap"` string so the JSON reads the same as the original + // `[ScopeStats]` log line: + // { "version": 2, "fatal": bool, + // "write_count": uint, "cap": uint, "dropped": uint, + // "records": [ + // { "site": "file:line", "depth": int, + // "task_window": ["used/cap", "used/cap", "used/cap", "used/cap"], + // "heap": ["used/cap", "used/cap", "used/cap", "used/cap"], + // "dep": ["used/cap", "used/cap", "used/cap", "used/cap"], + // "fanin_used": [int, int, int, int], + // "tensormap": "used/cap" }, + // ... 
+ // ] } + static int write_json(const ScopeStatsBuffer *buf, const std::string &path) { + if (buf == nullptr) { + LOG_ERROR("scope_stats: null buffer"); + return -1; + } + std::FILE *fp = std::fopen(path.c_str(), "w"); + if (fp == nullptr) { + LOG_ERROR("scope_stats: failed to open %s", path.c_str()); + return -1; + } + + const std::uint64_t write_count = buf->header.write_count; + const std::uint32_t cap = buf->header.cap; + const std::uint64_t kept = write_count > cap ? cap : write_count; + const std::uint64_t dropped = write_count > cap ? (write_count - cap) : 0; + const std::uint64_t start = write_count - kept; + + std::fprintf(fp, "{\n"); + std::fprintf(fp, " \"version\": 2,\n"); + std::fprintf(fp, " \"fatal\": %s,\n", buf->header.fatal_latched ? "true" : "false"); + std::fprintf(fp, " \"write_count\": %" PRIu64 ",\n", write_count); + std::fprintf(fp, " \"cap\": %u,\n", cap); + std::fprintf(fp, " \"dropped\": %" PRIu64 ",\n", dropped); + + std::fprintf(fp, " \"records\": ["); + for (std::uint64_t i = 0; i < kept; i++) { + const ScopeStatsRecord &rec = buf->records[(start + i) % cap]; + if (i) std::fputc(',', fp); + std::fprintf(fp, "\n {"); + // Bound the print to the on-wire field size in case a future + // writer change drops the NUL terminator. 
+ const std::size_t site_len = strnlen(rec.site_file_basename, sizeof(rec.site_file_basename)); + std::fprintf( + fp, "\"site\": \"%.*s:%d\", ", static_cast(site_len), rec.site_file_basename, rec.site_line + ); + std::fprintf(fp, "\"depth\": %d, ", rec.depth); + std::fprintf(fp, "\"task_window\": "); + write_i32_over_i32_array( + fp, rec.task_in_flight, buf->header.task_window_cap, PTO2_SCOPE_STATS_MAX_RING_DEPTH + ); + std::fprintf(fp, ", \"heap\": "); + write_u64_over_u64_array(fp, rec.heap_bytes, buf->header.heap_cap, PTO2_SCOPE_STATS_MAX_RING_DEPTH); + std::fprintf(fp, ", \"dep\": "); + write_i32_over_i32_array(fp, rec.dep_used, buf->header.dep_cap, PTO2_SCOPE_STATS_MAX_RING_DEPTH); + std::fprintf(fp, ", \"fanin_used\": "); + write_i32_array(fp, rec.fanin_used, PTO2_SCOPE_STATS_MAX_RING_DEPTH); + std::fprintf(fp, ", \"tensormap\": \"%d/%d\"}", rec.tensormap_used, buf->header.tensormap_cap); + } + std::fprintf(fp, "\n ]\n}\n"); + std::fclose(fp); + + LOG_INFO_V1("scope_stats: wrote %" PRIu64 " records (dropped=%" PRIu64 ") to %s", kept, dropped, path.c_str()); + return 0; + } + + static void write_i32_array(std::FILE *fp, const std::int32_t *arr, std::size_t n) { + std::fputc('[', fp); + for (std::size_t i = 0; i < n; i++) { + if (i) std::fputc(',', fp); + std::fprintf(fp, "%d", arr[i]); + } + std::fputc(']', fp); + } + + static void + write_i32_over_i32_array(std::FILE *fp, const std::int32_t *used, const std::int32_t *cap, std::size_t n) { + std::fputc('[', fp); + for (std::size_t i = 0; i < n; i++) { + if (i) std::fputc(',', fp); + std::fprintf(fp, "\"%d/%d\"", used[i], cap[i]); + } + std::fputc(']', fp); + } + + static void + write_u64_over_u64_array(std::FILE *fp, const std::uint64_t *used, const std::uint64_t *cap, std::size_t n) { + std::fputc('[', fp); + for (std::size_t i = 0; i < n; i++) { + if (i) std::fputc(',', fp); + std::fprintf(fp, "\"%" PRIu64 "/%" PRIu64 "\"", used[i], cap[i]); + } + std::fputc(']', fp); + } + + bool initialized_ = false; + int 
device_id_ = -1; + void *dev_ptr_ = nullptr; + void *host_ptr_ = nullptr; + bool host_shadow_owned_ = false; + ScopeStatsCopyFromDeviceCallback copy_from_device_cb_; + // Owns the dev↔host shared-memory setup for the a2a3 / sim path. Stays + // unconfigured (no set_memory_context call) when init() takes the a5 + // host-shadow path; finalize() routes accordingly via host_shadow_owned_. + profiling_common::BufferPoolManager manager_; +}; + +#endif // SRC_A2A3_PLATFORM_INCLUDE_HOST_SCOPE_STATS_DUMP_H_ diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp index 2e28997e8..9776b423f 100644 --- a/src/a2a3/platform/onboard/aicpu/kernel.cpp +++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp @@ -20,6 +20,7 @@ #include "aicpu/pmu_collector_aicpu.h" #include "aicpu/platform_regs.h" #include "aicpu/platform_aicpu_affinity.h" +#include "aicpu/scope_stats_collector_aicpu.h" #include "aicpu/tensor_dump_aicpu.h" #include "runtime.h" @@ -113,6 +114,8 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *a set_pmu_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_PMU)); set_platform_dep_gen_base(k_args->dep_gen_data_base); set_dep_gen_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_DEP_GEN)); + set_scope_stats_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_SCOPE_STATS)); + set_platform_scope_stats_base(k_args->scope_stats_data_base); // Affinity gate: drop excess threads before entering runtime if (!platform_aicpu_affinity_gate(runtime->aicpu_thread_num, PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH)) { diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index cf6ddea88..8172a1d78 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -701,6 
+702,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { if (enable_dep_gen_) { SET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_DEP_GEN); } + if (enable_scope_stats_) { + SET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_SCOPE_STATS); + } kernel_args_.args.enable_profiling_flag = enable_profiling_flag; for (int i = 0; i < num_aicore; i++) { @@ -765,6 +769,14 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { } } + if (enable_scope_stats_) { + rc = init_scope_stats(device_id_); + if (rc != 0) { + LOG_ERROR("init_scope_stats failed: %d", rc); + return rc; + } + } + // On any exit from run() — success or early error — release the diagnostics // collectors' shared memory. They are only re-initialized per run(), so a // Worker reused across runs (e.g. a pytest session-scoped worker pool) would @@ -931,6 +943,13 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { } } + if (enable_scope_stats_ && scope_stats_buf_.is_initialized()) { + // Device stream sync has already completed by this point (matches + // l2_perf / dep_gen export ordering), so AICPU writes to the shared + // region are fully visible. 
+ scope_stats_buf_.dump(output_prefix_); + } + // Print handshake results (reads from device memory, must be before free) print_handshake_results(); @@ -1508,6 +1527,37 @@ int DeviceRunner::init_dep_gen(int num_threads, int device_id) { return 0; } +int DeviceRunner::init_scope_stats(int device_id) { + auto alloc_cb = [this](size_t size) -> void * { + return mem_alloc_.alloc(size); + }; + + auto register_cb = +[](void *dev_ptr, size_t size, int device_id, void **host_ptr) -> int { + if (load_hal_if_needed() != 0) { + LOG_ERROR("Failed to load ascend_hal for scope_stats: %s", dlerror()); + return -1; + } + HalHostRegisterFn fn = get_halHostRegister(); + if (fn == nullptr) { + LOG_ERROR("halHostRegister symbol not found: %s", dlerror()); + return -1; + } + return fn(dev_ptr, size, DEV_SVM_MAP_HOST, device_id, host_ptr); + }; + + auto free_cb = [this](void *dev_ptr) -> int { + return mem_alloc_.free(dev_ptr); + }; + + int rc = scope_stats_buf_.init(alloc_cb, register_cb, free_cb, /*copy_from_device_cb=*/nullptr, device_id); + if (rc != 0) { + return rc; + } + + kernel_args_.args.scope_stats_data_base = reinterpret_cast(scope_stats_buf_.device_ptr()); + return 0; +} + void DeviceRunner::finalize_collectors() { auto unregister_cb = [](void *dev_ptr, int device_id) -> int { HalHostUnregisterFn fn = get_halHostUnregister(); @@ -1532,4 +1582,8 @@ void DeviceRunner::finalize_collectors() { if (dep_gen_collector_.is_initialized()) { dep_gen_collector_.finalize(unregister_cb, free_cb); } + if (scope_stats_buf_.is_initialized()) { + scope_stats_buf_.finalize(unregister_cb, free_cb); + kernel_args_.args.scope_stats_data_base = 0; + } } diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 53fb6555f..d1b27e5f4 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -52,6 +52,7 @@ #include "host/pmu_collector.h" #include "host/dep_gen_collector.h" #include 
"load_aicpu_op.h" +#include "host/scope_stats_dump.h" #include "runtime.h" /** @@ -313,6 +314,7 @@ class DeviceRunner { pmu_event_type_ = resolve_pmu_event_type(enable_pmu); } void set_dep_gen_enabled(bool enable) { enable_dep_gen_ = enable; } + void set_scope_stats_enabled(bool enable) { enable_scope_stats_ = enable; } // Directory under which all diagnostic artifacts (l2_perf_records.json / // tensor_dump/ / pmu.csv) land. Required (non-empty) when any diagnostic // is enabled; CallConfig::validate() enforces this contract upstream. @@ -830,6 +832,7 @@ class DeviceRunner { * @return 0 on success, error code on failure */ int init_dep_gen(int num_threads, int device_id); + int init_scope_stats(int device_id); /** * Finalize whichever diagnostics collectors are currently initialized, @@ -850,6 +853,12 @@ class DeviceRunner { bool enable_dump_tensor_{false}; bool enable_pmu_{false}; bool enable_dep_gen_{false}; + bool enable_scope_stats_{false}; + // scope_stats has no dedicated collector class — its data is a single + // end-of-run snapshot (no streaming, no mgmt thread). All scope_stats + // host logic lives in ScopeStatsHostBuffer; this member is the only hook + // device_runner needs. 
+ ScopeStatsHostBuffer scope_stats_buf_; L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED}; // resolved from set_l2_swimlane_enabled() PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION}; // resolved from set_pmu_enabled() std::string output_prefix_{}; // diagnostic artifact root directory diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index 744b7291c..ad44c6bfc 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -333,7 +333,7 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c int run_prepared( DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen, - const char *output_prefix, PtoRunTiming *out_timing + int enable_scope_stats, const char *output_prefix, PtoRunTiming *out_timing ) { if (out_timing != NULL) { out_timing->host_wall_ns = 0; @@ -400,6 +400,7 @@ int run_prepared( runner->set_dump_tensor_enabled(enable_dump_tensor != 0); runner->set_pmu_enabled(enable_pmu); runner->set_dep_gen_enabled(enable_dep_gen != 0); + runner->set_scope_stats_enabled(enable_scope_stats != 0); runner->set_output_prefix(output_prefix); rc = runner->run(*r, block_dim, aicpu_thread_num); diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 1635f3a7a..6b3684cfa 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -285,6 +285,20 @@ int DeviceRunner::ensure_binaries_loaded() { return -1; } + set_scope_stats_enabled_func_ = + reinterpret_cast(dlsym(aicpu_so_handle_, "set_scope_stats_enabled")); + if (set_scope_stats_enabled_func_ == nullptr) { + LOG_ERROR("dlsym failed for set_scope_stats_enabled: %s", dlerror()); + return -1; + } + + 
set_platform_scope_stats_base_func_ = + reinterpret_cast(dlsym(aicpu_so_handle_, "set_platform_scope_stats_base")); + if (set_platform_scope_stats_base_func_ == nullptr) { + LOG_ERROR("dlsym failed for set_platform_scope_stats_base: %s", dlerror()); + return -1; + } + // Log config travels via the RTLD_GLOBAL HostLogger singleton in // libsimpler_log.so — already seeded by simpler_log_init() before the // AICPU sim SO was dlopen'd, so no per-SO setter forwarding is needed. @@ -436,6 +450,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { if (enable_dep_gen_) { SET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_DEP_GEN); } + if (enable_scope_stats_) { + SET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_SCOPE_STATS); + } kernel_args_.enable_profiling_flag = enable_profiling_flag; for (int i = 0; i < num_aicore; i++) { @@ -503,6 +520,14 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { } } + if (enable_scope_stats_) { + rc = init_scope_stats(); + if (rc != 0) { + LOG_ERROR("init_scope_stats failed: %d", rc); + return rc; + } + } + // On any exit from run() — success or early error — release the diagnostics // collectors' shared memory. They are only re-initialized per run(), so a // Worker reused across runs (e.g. a pytest session-scoped worker pool) would @@ -589,7 +614,8 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { set_platform_dump_base_func_ == nullptr || set_dump_tensor_enabled_func_ == nullptr || set_platform_pmu_base_func_ == nullptr || set_platform_pmu_reg_addrs_func_ == nullptr || set_pmu_enabled_func_ == nullptr || set_platform_dep_gen_base_func_ == nullptr || - set_dep_gen_enabled_func_ == nullptr) { + set_dep_gen_enabled_func_ == nullptr || set_scope_stats_enabled_func_ == nullptr || + set_platform_scope_stats_base_func_ == nullptr) { LOG_ERROR("Executor functions not loaded. 
Call ensure_binaries_loaded first."); return -1; } @@ -604,6 +630,8 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { set_pmu_enabled_func_(enable_pmu_); set_platform_dep_gen_base_func_(kernel_args_.dep_gen_data_base); set_dep_gen_enabled_func_(enable_dep_gen_); + set_scope_stats_enabled_func_(enable_scope_stats_); + set_platform_scope_stats_base_func_(kernel_args_.scope_stats_data_base); // No per-SO log-config push: HostLogger lives in libsimpler_log.so // (RTLD_GLOBAL singleton) and the AICPU sim SO reads it directly via the @@ -736,6 +764,10 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { } } + if (enable_scope_stats_ && scope_stats_buf_.is_initialized()) { + scope_stats_buf_.dump(output_prefix_); + } + // Print handshake results at end of run print_handshake_results(); @@ -785,6 +817,8 @@ void DeviceRunner::unload_executor_binaries() { set_pmu_enabled_func_ = nullptr; set_platform_dep_gen_base_func_ = nullptr; set_dep_gen_enabled_func_ = nullptr; + set_scope_stats_enabled_func_ = nullptr; + set_platform_scope_stats_base_func_ = nullptr; aicpu_so_loaded_ = false; } if (!aicpu_so_path_.empty()) { @@ -1230,6 +1264,28 @@ int DeviceRunner::init_dep_gen(int num_threads, int /*device_id*/) { return 0; } +int DeviceRunner::init_scope_stats() { + auto alloc_cb = [this](size_t size) -> void * { + return mem_alloc_.alloc(size); + }; + auto free_cb = [this](void *dev_ptr) -> int { + return mem_alloc_.free(dev_ptr); + }; + + // Sim shares an address space with the AICPU thread, so register_cb is + // not needed (mirrors PMU / dep_gen's nullptr in sim). 
+ int rc = scope_stats_buf_.init( + alloc_cb, /*register_cb=*/nullptr, free_cb, /*copy_from_device_cb=*/nullptr, + /*device_id=*/-1 + ); + if (rc != 0) { + return rc; + } + + kernel_args_.scope_stats_data_base = reinterpret_cast(scope_stats_buf_.device_ptr()); + return 0; +} + void DeviceRunner::finalize_collectors() { // Free through MemoryAllocator so finalize() can audit. Sim shares an // address space with the AICPU thread, so no host unregister is needed. @@ -1249,4 +1305,8 @@ void DeviceRunner::finalize_collectors() { if (dep_gen_collector_.is_initialized()) { dep_gen_collector_.finalize(nullptr, free_cb); } + if (scope_stats_buf_.is_initialized()) { + scope_stats_buf_.finalize(nullptr, free_cb); + kernel_args_.scope_stats_data_base = 0; + } } diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index 73b3dfea2..1b98c22d9 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -58,6 +58,7 @@ #include "host/tensor_dump_collector.h" #include "host/pmu_collector.h" #include "host/dep_gen_collector.h" +#include "host/scope_stats_dump.h" #include "runtime.h" /** @@ -186,6 +187,7 @@ class DeviceRunner { pmu_event_type_ = resolve_pmu_event_type(enable_pmu); } void set_dep_gen_enabled(bool enable) { enable_dep_gen_ = enable; } + void set_scope_stats_enabled(bool enable) { enable_scope_stats_ = enable; } // Directory under which all diagnostic artifacts (l2_perf_records.json / // tensor_dump/ / pmu.csv) land. Required (non-empty) when any diagnostic // is enabled; CallConfig::validate() enforces this contract upstream. 
@@ -370,6 +372,8 @@ class DeviceRunner { void (*set_pmu_enabled_func_)(bool){nullptr}; void (*set_platform_dep_gen_base_func_)(uint64_t){nullptr}; void (*set_dep_gen_enabled_func_)(bool){nullptr}; + void (*set_scope_stats_enabled_func_)(bool){nullptr}; + void (*set_platform_scope_stats_base_func_)(uint64_t){nullptr}; std::string aicpu_so_path_; std::string aicore_so_path_; @@ -382,6 +386,7 @@ class DeviceRunner { PmuCollector pmu_collector_; // dep_gen collector — captures orchestrator submit_task inputs for offline replay DepGenCollector dep_gen_collector_; + ScopeStatsHostBuffer scope_stats_buf_; // Private helper methods — read aicpu_so_binary_ / aicore_kernel_binary_ // off the runner (populated by set_executors during simpler_init). @@ -414,6 +419,7 @@ class DeviceRunner { int init_pmu(int num_cores, int num_threads, const std::string &csv_path, PmuEventType event_type, int device_id); int init_dep_gen(int num_threads, int device_id); + int init_scope_stats(); /** * Finalize whichever diagnostics collectors are currently initialized, @@ -432,6 +438,7 @@ class DeviceRunner { bool enable_dump_tensor_{false}; bool enable_pmu_{false}; bool enable_dep_gen_{false}; + bool enable_scope_stats_{false}; L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED}; // resolved from set_l2_swimlane_enabled() PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION}; // resolved from set_pmu_enabled() std::string output_prefix_{}; // diagnostic artifact root directory diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 4ad438a9c..8ef59d95a 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -305,7 +305,7 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c int run_prepared( DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, int aicpu_thread_num, int 
enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen, - const char *output_prefix, PtoRunTiming *out_timing + int enable_scope_stats, const char *output_prefix, PtoRunTiming *out_timing ) { if (out_timing != NULL) { out_timing->host_wall_ns = 0; @@ -359,6 +359,7 @@ int run_prepared( runner->set_dump_tensor_enabled(enable_dump_tensor != 0); runner->set_pmu_enabled(enable_pmu); runner->set_dep_gen_enabled(enable_dep_gen != 0); + runner->set_scope_stats_enabled(enable_scope_stats != 0); runner->set_output_prefix(output_prefix); rc = runner->run(*r, block_dim, aicpu_thread_num); diff --git a/src/a2a3/platform/src/aicpu/scope_stats_collector_aicpu.cpp b/src/a2a3/platform/src/aicpu/scope_stats_collector_aicpu.cpp new file mode 100644 index 000000000..a00ac3988 --- /dev/null +++ b/src/a2a3/platform/src/aicpu/scope_stats_collector_aicpu.cpp @@ -0,0 +1,208 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +// Platform-layer scope_stats collector. +// +// Owns all collector state (depth, peak arrays, shared buffer) and exposes +// pure-value APIs for runtime to report resource usage. No runtime-specific +// types cross the boundary. 
+ +#include "aicpu/scope_stats_collector_aicpu.h" + +#include + +#include "common/scope_stats_buffer.h" + +// --------------------------------------------------------------------------- +// Collector state +// --------------------------------------------------------------------------- + +int32_t scope_stats_depth = -1; +bool scope_stats_enabled = false; + +static uint64_t scope_stats_peak_heap_bytes[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH][PTO2_SCOPE_STATS_MAX_RING_DEPTH] = {}; +static int32_t scope_stats_peak_task_in_flight[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH][PTO2_SCOPE_STATS_MAX_RING_DEPTH] = {}; +static int32_t scope_stats_peak_fanin_used[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH][PTO2_SCOPE_STATS_MAX_RING_DEPTH] = {}; +static int32_t scope_stats_peak_dep_used[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH][PTO2_SCOPE_STATS_MAX_RING_DEPTH] = {}; +static int32_t scope_stats_peak_tensormap_used[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH] = {}; +static ScopeStatsBuffer *scope_stats_shared_buf = nullptr; + +namespace { + +ScopeStatsInitialSampleFn g_initial_sample_fn = nullptr; + +const char *s_pending_site_file = nullptr; +int32_t s_pending_site_line = 0; + +const char *s_scope_site_file[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH] = {}; +int32_t s_scope_site_line[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH] = {}; + +inline const char *basename_of(const char *path) { + if (!path) return "(unknown)"; + const char *base = path; + for (const char *p = path; *p; ++p) { + if (*p == '/' || *p == '\\') base = p + 1; + } + return base; +} + +inline void copy_basename(char (&dst)[32], const char *src) { + const char *base = basename_of(src); + size_t i = 0; + for (; i + 1 < sizeof(dst) && base[i]; i++) + dst[i] = base[i]; + dst[i] = '\0'; +} + +} // namespace + +// --------------------------------------------------------------------------- +// Setter symbols — always exported, unconditionally compiled +// --------------------------------------------------------------------------- + +extern "C" void set_scope_stats_enabled(bool enable) { 
scope_stats_enabled = enable; } + +extern "C" void set_platform_scope_stats_base(uint64_t scope_stats_data_base) { + scope_stats_shared_buf = reinterpret_cast(scope_stats_data_base); + // Reset collector-local statics so a prior run that crashed mid-scope (or + // reused the same AICPU .so process) can't leak stale depth/peak data into + // the new run's records. + scope_stats_depth = -1; + s_pending_site_file = nullptr; + s_pending_site_line = 0; + memset(scope_stats_peak_heap_bytes, 0, sizeof(scope_stats_peak_heap_bytes)); + memset(scope_stats_peak_task_in_flight, 0, sizeof(scope_stats_peak_task_in_flight)); + memset(scope_stats_peak_fanin_used, 0, sizeof(scope_stats_peak_fanin_used)); + memset(scope_stats_peak_dep_used, 0, sizeof(scope_stats_peak_dep_used)); + memset(scope_stats_peak_tensormap_used, 0, sizeof(scope_stats_peak_tensormap_used)); + memset(s_scope_site_file, 0, sizeof(s_scope_site_file)); + memset(s_scope_site_line, 0, sizeof(s_scope_site_line)); + if (scope_stats_shared_buf) { + memset(&scope_stats_shared_buf->header, 0, sizeof(scope_stats_shared_buf->header)); + scope_stats_shared_buf->header.cap = PTO2_SCOPE_STATS_LOG_CAP; + } +} + +extern "C" void scope_stats_register_initial_sampler(ScopeStatsInitialSampleFn fn) { g_initial_sample_fn = fn; } + +// --------------------------------------------------------------------------- +// Scope lifecycle probes +// --------------------------------------------------------------------------- + +extern "C" void scope_stats_set_pending_site(const char *file, int line) { + s_pending_site_file = file; + s_pending_site_line = line; +} + +extern "C" void scope_stats_on_begin() { + if (!scope_stats_enabled) return; + if (scope_stats_depth + 1 >= PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH) return; + ++scope_stats_depth; + s_scope_site_file[scope_stats_depth] = s_pending_site_file; + s_scope_site_line[scope_stats_depth] = s_pending_site_line; + s_pending_site_file = nullptr; + s_pending_site_line = 0; + if (g_initial_sample_fn) 
{ + g_initial_sample_fn(scope_stats_depth); + } else { + for (int r = 0; r < PTO2_SCOPE_STATS_MAX_RING_DEPTH; r++) { + scope_stats_peak_heap_bytes[scope_stats_depth][r] = 0; + scope_stats_peak_task_in_flight[scope_stats_depth][r] = 0; + scope_stats_peak_fanin_used[scope_stats_depth][r] = 0; + scope_stats_peak_dep_used[scope_stats_depth][r] = 0; + } + scope_stats_peak_tensormap_used[scope_stats_depth] = 0; + } +} + +extern "C" void scope_stats_on_end() { + if (!scope_stats_enabled) return; + if (scope_stats_depth < 0) return; + if (scope_stats_depth >= 0 && scope_stats_shared_buf) { + uint64_t idx = scope_stats_shared_buf->header.write_count % PTO2_SCOPE_STATS_LOG_CAP; + ScopeStatsRecord &rec = scope_stats_shared_buf->records[idx]; + rec.site_file_addr = reinterpret_cast(s_scope_site_file[scope_stats_depth]); + copy_basename(rec.site_file_basename, s_scope_site_file[scope_stats_depth]); + rec.site_line = s_scope_site_line[scope_stats_depth]; + rec.depth = static_cast(scope_stats_depth); + for (int r = 0; r < PTO2_SCOPE_STATS_MAX_RING_DEPTH; r++) { + rec.heap_bytes[r] = scope_stats_peak_heap_bytes[scope_stats_depth][r]; + rec.task_in_flight[r] = scope_stats_peak_task_in_flight[scope_stats_depth][r]; + rec.dep_used[r] = scope_stats_peak_dep_used[scope_stats_depth][r]; + rec.fanin_used[r] = scope_stats_peak_fanin_used[scope_stats_depth][r]; + } + rec.tensormap_used = scope_stats_peak_tensormap_used[scope_stats_depth]; + ++scope_stats_shared_buf->header.write_count; + } + for (int r = 0; r < PTO2_SCOPE_STATS_MAX_RING_DEPTH; r++) { + scope_stats_peak_heap_bytes[scope_stats_depth][r] = 0; + scope_stats_peak_task_in_flight[scope_stats_depth][r] = 0; + scope_stats_peak_fanin_used[scope_stats_depth][r] = 0; + scope_stats_peak_dep_used[scope_stats_depth][r] = 0; + } + scope_stats_peak_tensormap_used[scope_stats_depth] = 0; + s_scope_site_file[scope_stats_depth] = nullptr; + s_scope_site_line[scope_stats_depth] = 0; + --scope_stats_depth; +} + +extern "C" void 
scope_stats_on_fatal() { + if (!scope_stats_enabled) return; + if (!scope_stats_shared_buf) return; + scope_stats_shared_buf->header.fatal_latched = 1; +} + +// --------------------------------------------------------------------------- +// Pure-value peak update APIs — called by runtime at instrumentation points +// --------------------------------------------------------------------------- + +extern "C" void scope_stats_update_allocator_peaks(int ring_id, uint64_t heap_bytes, int32_t tasks_in_flight) { + if (!scope_stats_enabled || scope_stats_depth < 0) return; + if (ring_id < 0 || ring_id >= PTO2_SCOPE_STATS_MAX_RING_DEPTH) return; + for (int d = 0; d <= scope_stats_depth; d++) { + if (heap_bytes > scope_stats_peak_heap_bytes[d][ring_id]) scope_stats_peak_heap_bytes[d][ring_id] = heap_bytes; + if (tasks_in_flight > scope_stats_peak_task_in_flight[d][ring_id]) + scope_stats_peak_task_in_flight[d][ring_id] = tasks_in_flight; + } +} + +extern "C" void scope_stats_update_tensormap_peak(int32_t tensormap_used) { + if (!scope_stats_enabled || scope_stats_depth < 0) return; + for (int d = 0; d <= scope_stats_depth; d++) { + if (tensormap_used > scope_stats_peak_tensormap_used[d]) scope_stats_peak_tensormap_used[d] = tensormap_used; + } +} + +extern "C" void scope_stats_update_pool_peaks(int ring_id, int32_t fanin_used, int32_t dep_used) { + if (!scope_stats_enabled || scope_stats_depth < 0) return; + if (ring_id < 0 || ring_id >= PTO2_SCOPE_STATS_MAX_RING_DEPTH) return; + for (int d = 0; d <= scope_stats_depth; d++) { + if (fanin_used > scope_stats_peak_fanin_used[d][ring_id]) scope_stats_peak_fanin_used[d][ring_id] = fanin_used; + if (dep_used > scope_stats_peak_dep_used[d][ring_id]) scope_stats_peak_dep_used[d][ring_id] = dep_used; + } +} + +// --------------------------------------------------------------------------- +// Capacity registration — called by runtime at init +// --------------------------------------------------------------------------- + +extern "C" 
void scope_stats_set_ring_capacity(int ring_id, int32_t window_cap, uint64_t heap_cap, int32_t dep_cap) {
+    // Stores the three capacities for `ring_id` in the shared header,
+    // overwriting whatever was stored before. Checks only that a shared
+    // buffer is bound and that `ring_id` is in range; it does not test
+    // scope_stats_enabled.
+    if (!scope_stats_shared_buf) return;
+    if (ring_id < 0 || ring_id >= PTO2_SCOPE_STATS_MAX_RING_DEPTH) return;
+    scope_stats_shared_buf->header.task_window_cap[ring_id] = window_cap;
+    scope_stats_shared_buf->header.heap_cap[ring_id] = heap_cap;
+    scope_stats_shared_buf->header.dep_cap[ring_id] = dep_cap;
+}
+
+// Store the tensormap pool capacity in the shared header. No-op when no
+// shared buffer is bound.
+extern "C" void scope_stats_set_tensormap_capacity(int32_t cap) {
+    if (!scope_stats_shared_buf) return;
+    scope_stats_shared_buf->header.tensormap_cap = cap;
+}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
index eabe3ec3f..0bd3431e6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
@@ -123,6 +123,11 @@ typedef struct PTO2RuntimeOps {
     );
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
     TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args);
+
+    // Stash the call-site of the next PTO2ScopeGuard so the [ScopeStats]
+    // collector can log it. Always present to keep ops-table layout stable
+    // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0.
+ void (*scope_set_site)(const char *file, int line); } PTO2RuntimeOps; /** @@ -361,10 +366,13 @@ static inline void set_tensor_data(const Tensor &tensor, uint32_t ndims, const u */ class PTO2ScopeGuard { public: - explicit PTO2ScopeGuard(PTO2ScopeMode mode = PTO2ScopeMode::AUTO) : + explicit PTO2ScopeGuard( + PTO2ScopeMode mode = PTO2ScopeMode::AUTO, const char *file = __builtin_FILE(), int line = __builtin_LINE() + ) : rt_(current_runtime()) { if (!rt_->ops->is_fatal(rt_)) { rt_->pending_scope_mode = mode; + if (rt_->ops->scope_set_site) rt_->ops->scope_set_site(file, line); rt_->ops->scope_begin(rt_); } } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index d1c039785..d04c7a9cb 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -39,6 +39,10 @@ extern "C" void set_dump_tensor_selective_mode(bool enable); extern "C" void set_dump_tensor_task_mask(uint64_t task_id, uint64_t mask); +#if PTO2_PROFILING +#include "aicpu/scope_stats_collector_aicpu.h" +#endif + // Verify the captured Tensor blob size in DepGenRecord matches the runtime // Tensor layout. The platform header defines DEP_GEN_TENSOR_SIZE without // including runtime/tensor.h, so this check lives at the orch callsite. @@ -168,6 +172,14 @@ static void orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *func, const char *fmt, va_list args) { int32_t latched_code = orch_mark_fatal(orch, error_code); +#if PTO2_PROFILING + // Flush the current scope's peaks BEFORE the FATAL log line, so the + // diagnostic context (which pool/window filled up) appears right next to + // the failure reason. on_fatal is latched, so duplicate fatals from + // different layers don't print multiple stats lines. 
+ scope_stats_on_fatal(); +#endif + if (fmt == nullptr || fmt[0] == '\0') { if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) { unified_log_error(func, "FATAL(code=%d, latched=%d)", error_code, latched_code); @@ -253,6 +265,11 @@ static bool append_fanin_or_fail( } entry->slot_state = prod_state; fanin_builder->count++; +#if PTO2_PROFILING + scope_stats_update_pool_peaks( + ring_id, fanin_pool.used(), orch->scheduler ? orch->scheduler->ring_sched_states[ring_id].dep_pool.used() : 0 + ); +#endif return true; } @@ -342,6 +359,10 @@ static bool prepare_task( orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK); return false; } +#if PTO2_PROFILING + scope_stats_update_allocator_peaks(ring_id, allocator.heap_used_bytes(), allocator.active_count()); + scope_stats_update_tensormap_peak(orch->tensor_map.current_used()); +#endif out->task_id = PTO2TaskId::make(ring_id, static_cast(out->alloc_result.task_id)); out->slot_state = &orch->sm_header->rings[ring_id].get_slot_state_by_slot(out->alloc_result.slot); @@ -437,6 +458,14 @@ bool PTO2OrchestratorState::init_from_layout( orch->scope_stack_capacity = layout.scope_stack_capacity; orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH; +#if PTO2_PROFILING + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &alloc = orch->rings[r].task_allocator; + scope_stats_set_ring_capacity(r, alloc.window_size(), alloc.heap_capacity(), 0); + } + scope_stats_set_tensormap_capacity(orch->tensor_map.pool_capacity()); +#endif + return true; } @@ -450,7 +479,17 @@ void PTO2OrchestratorState::destroy() { orch->scope_begins = nullptr; } -void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; } +void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { + this->scheduler = scheduler; +#if PTO2_PROFILING + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + int32_t dep_cap = scheduler ? 
scheduler->ring_sched_states[r].dep_pool.capacity : 0; + scope_stats_set_ring_capacity( + r, rings[r].task_allocator.window_size(), rings[r].task_allocator.heap_capacity(), dep_cap + ); + } +#endif +} // ============================================================================= // Scope Management @@ -489,6 +528,18 @@ void PTO2OrchestratorState::begin_scope(PTO2ScopeMode mode) { if (mode == PTO2ScopeMode::MANUAL && !already_in_manual_scope) { orch->manual_begin_depth = orch->scope_stack_top; } +#if PTO2_PROFILING + scope_stats_on_begin(); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &alloc = orch->rings[r].task_allocator; + scope_stats_update_allocator_peaks(r, alloc.heap_used_bytes(), alloc.active_count()); + scope_stats_update_pool_peaks( + r, orch->rings[r].fanin_pool.used(), + orch->scheduler ? orch->scheduler->ring_sched_states[r].dep_pool.used() : 0 + ); + } + scope_stats_update_tensormap_peak(orch->tensor_map.current_used()); +#endif } void PTO2OrchestratorState::end_scope() { @@ -498,6 +549,14 @@ void PTO2OrchestratorState::end_scope() { } assert(orch->scope_stack_top >= 0 && "Scope stack underflow"); + // Snapshot peak intra-scope queue fill BEFORE the orchestrator drains + // pending tasks via scheduler->on_scope_end. The user is measuring how + // much ring/heap the work submitted inside this scope holds at its peak, + // not the residual after teardown. 
+#if PTO2_PROFILING + scope_stats_on_end(); +#endif + #if PTO2_ORCH_PROFILING uint64_t _se0 = get_sys_cnt_aicpu(); #endif diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index 5a3e3d3d3..e98b8aafa 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -180,6 +180,10 @@ class PTO2TaskAllocator { uint64_t heap_top() const { return heap_top_; } uint64_t heap_capacity() const { return heap_size_; } + uint64_t heap_used_bytes() const { + if (heap_size_ == 0) return 0; + return (heap_top_ + heap_size_ - heap_tail_) % heap_size_; + } private: // --- Task Ring --- diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index c801d5c15..122611e3f 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -28,6 +28,9 @@ #include "aicpu/device_time.h" #include "common/unified_log.h" +#if PTO2_PROFILING +#include "aicpu/scope_stats_collector_aicpu.h" +#endif // Weak fallback for HOST .so builds (never called, but satisfies linker). // The AICPU build links the strong symbol from platform/.../device_time.cpp. @@ -231,6 +234,14 @@ void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, cons memcpy(ptr, &value, elem_size); } +// Ops-table entry that hands the call-site captured by PTO2ScopeGuard to the +// [ScopeStats] collector. The slot is always present in the struct to keep +// the layout stable; at PTO2_PROFILING=0 we fill nullptr so the orchestration +// .so's null-check skips it. 
+#if PTO2_PROFILING +static void scope_set_site_impl(const char *file, int line) { scope_stats_set_pending_site(file, line); } +#endif + static const PTO2RuntimeOps s_runtime_ops = { .submit_task = submit_task_impl, .scope_begin = rt_scope_begin, @@ -246,6 +257,11 @@ static const PTO2RuntimeOps s_runtime_ops = { .set_tensor_data = set_tensor_data, .alloc_tensors = alloc_tensors_impl, .submit_dummy_task = submit_dummy_task_impl, +#if PTO2_PROFILING + .scope_set_site = scope_set_site_impl, +#else + .scope_set_site = nullptr, +#endif }; // ============================================================================= diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index 5709a85b7..b0848a9f6 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -89,6 +89,10 @@ struct PTO2RuntimeOps { ); TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args); TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args); + // Stash the call-site captured by PTO2ScopeGuard into the [ScopeStats] + // collector. Always present in the struct to keep ops-table layout stable + // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0. + void (*scope_set_site)(const char *file, int line); }; /** diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h index cf1f2d28d..cce6725b9 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h @@ -375,6 +375,13 @@ struct PTO2TensorMap { return task_local_id & (task_window_sizes[ring_id] - 1); } + // Accessors read by scope_stats_collector. 
Declared unconditionally so the + // collector .cpp compiles at PTO2_PROFILING=0 (collector is unconditional — + // setter symbols must export for host dlsym; the probe call sites that use + // these accessors stay gated by PTO2_PROFILING). + int32_t current_used() const { return next_entry_idx - free_num; } + int32_t pool_capacity() const { return pool_size; } + // new_entry only allocates memory, does not assign attributes PTO2TensorMapEntry *new_entry() { if (free_num > 0) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h index 8d50681ba..2db12e9e6 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h @@ -49,6 +49,10 @@ } while (0) #endif +#if PTO2_PROFILING +#include "aicpu/scope_stats_collector_aicpu.h" +#endif + // ============================================================================= // Ready Queue (Lock-free bounded MPMC — Vyukov design) // ============================================================================= @@ -722,6 +726,9 @@ struct PTO2SchedulerState { early_finished++; } else { producer->fanout_head = rss.dep_pool.prepend(producer->fanout_head, ws); +#if PTO2_PROFILING + scope_stats_update_pool_peaks(ws->ring_id, 0, rss.dep_pool.used()); +#endif } producer->unlock_fanout(); }); diff --git a/src/a5/platform/include/aicpu/scope_stats_collector_aicpu.h b/src/a5/platform/include/aicpu/scope_stats_collector_aicpu.h new file mode 100644 index 000000000..f3d944165 --- /dev/null +++ b/src/a5/platform/include/aicpu/scope_stats_collector_aicpu.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#pragma once + +#include + +#include "common/scope_stats_buffer.h" + +// Scope-stats collector — platform-owned, runtime-agnostic. +// +// Platform owns all collector state and peak-tracking logic. Runtime calls +// pure-value APIs to report resource usage; no runtime types cross the +// boundary. +// +// Setter symbols (set_scope_stats_enabled, set_platform_scope_stats_base) +// are exported unconditionally so the host-side sim DeviceRunner's dlsym +// always resolves. + +extern "C" { + +// --- Scope lifecycle probes (called by orchestrator begin_scope/end_scope) --- + +void scope_stats_on_begin(); +void scope_stats_on_end(); +void scope_stats_on_fatal(); + +// --- Site tracking --- + +void scope_stats_set_pending_site(const char *file, int line); + +// --- Setter symbols (always exported) --- + +void set_scope_stats_enabled(bool enable); +void set_platform_scope_stats_base(uint64_t scope_stats_data_base); + +// --- Initial sampling callback --- + +typedef void (*ScopeStatsInitialSampleFn)(int32_t depth); +void scope_stats_register_initial_sampler(ScopeStatsInitialSampleFn fn); + +// --- Pure-value peak update APIs (called by runtime at instrumentation points) --- +// Single-producer assumption: peak updates use non-atomic read-max-write. +// Safe when the orchestrator is single-threaded; concurrent callers may +// lose peaks silently (acceptable for diagnostic data). 
+ +void scope_stats_update_allocator_peaks(int ring_id, uint64_t heap_bytes, int32_t tasks_in_flight); +void scope_stats_update_tensormap_peak(int32_t tensormap_used); +void scope_stats_update_pool_peaks(int ring_id, int32_t fanin_used, int32_t dep_used); + +// --- Capacity registration (called by runtime at init) --- + +void scope_stats_set_ring_capacity(int ring_id, int32_t window_cap, uint64_t heap_cap, int32_t dep_cap); +void scope_stats_set_tensormap_capacity(int32_t cap); + +} // extern "C" diff --git a/src/a5/platform/include/common/kernel_args.h b/src/a5/platform/include/common/kernel_args.h index 9a5416806..e66ddc58f 100644 --- a/src/a5/platform/include/common/kernel_args.h +++ b/src/a5/platform/include/common/kernel_args.h @@ -77,10 +77,13 @@ struct KernelArgs { // indexes by block_idx and forwards into per-core platform state. uint64_t aicore_l2_perf_ring_addrs{0}; // L2PerfAicoreRing* per core; 0 when L2 swimlane is off uint64_t aicore_pmu_ring_addrs{0}; // PmuAicoreRing* per core; 0 when PMU is off + uint64_t scope_stats_data_base{0}; // ScopeStatsBuffer device pointer; 0 when scope_stats is off. + // a5 has no halHostRegister — host keeps a separate shadow and + // refreshes it via rtMemcpy DEVICE_TO_HOST at dump time. uint32_t log_level{1}; // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL uint32_t log_info_v{5}; // INFO verbosity threshold (0..9); default V5 - uint32_t enable_profiling_flag{0}; // Profiling umbrella bitmask; bit0=dump_tensor, bit1=l2_swimlane, bit2=pmu - uint32_t _pad{0}; // Alignment padding + uint32_t enable_profiling_flag{0}; // Profiling umbrella bitmask; dump_tensor|l2_swimlane|pmu|dep_gen|scope_stats + uint32_t _pad{0}; // Alignment padding // Device pointer to an 8-byte buffer that the platform AICPU entry writes // the run-wall (ns) into. Allocated once at simpler_init, kept resident. 
diff --git a/src/a5/platform/include/common/platform_config.h b/src/a5/platform/include/common/platform_config.h index 5e740bae8..8fdb40c2b 100644 --- a/src/a5/platform/include/common/platform_config.h +++ b/src/a5/platform/include/common/platform_config.h @@ -177,6 +177,7 @@ inline double cycles_to_us(uint64_t cycles) { #define PROFILING_FLAG_DUMP_TENSOR (1u << 0) #define PROFILING_FLAG_L2_SWIMLANE (1u << 1) #define PROFILING_FLAG_PMU (1u << 2) +#define PROFILING_FLAG_SCOPE_STATS (1u << 4) #define GET_PROFILING_FLAG(flags, bit) ((((uint32_t)(flags)) & ((uint32_t)(bit))) != 0u) #define SET_PROFILING_FLAG(flags, bit) ((flags) |= (uint32_t)(bit)) #define CLEAR_PROFILING_FLAG(flags, bit) ((flags) &= ~((uint32_t)(bit))) diff --git a/src/a5/platform/include/common/scope_stats_buffer.h b/src/a5/platform/include/common/scope_stats_buffer.h new file mode 100644 index 000000000..7780fc333 --- /dev/null +++ b/src/a5/platform/include/common/scope_stats_buffer.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef PLATFORM_COMMON_SCOPE_STATS_BUFFER_H_ +#define PLATFORM_COMMON_SCOPE_STATS_BUFFER_H_ + +#include + +// Layout shared between AICPU writer (scope_stats_collector_aicpu.cpp on +// device) and the host reader (ScopeStatsHostBuffer on host). 
The whole block lives in a
+// host-allocated device-visible memory region; AICPU mutates `header.write_count`
+// and `records[i]` during the run, host snapshots both after the run to write
+// `<output_dir>/scope_stats.json`.
+//
+// Hot-path semantics: AICPU appends one record per scope_end into the ring
+// using `idx = header.write_count % header.cap`, then increments
+// `write_count`. No locking — single-producer (orchestrator thread) /
+// single-consumer (host post-run). Host never reads while AICPU writes.
+//
+// Capacity (PTO2_SCOPE_STATS_LOG_CAP) is fixed at build time so the layout is
+// stable across host/device builds. 16 384 records × 136 B ≈ 2.1 MiB; the
+// host opts in via `--enable-scope-stats` and the allocation is skipped when
+// the flag is off.
+
+#define PTO2_SCOPE_STATS_LOG_CAP 16384u
+#define PTO2_SCOPE_STATS_MAX_RING_DEPTH 4
+#define PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH 64
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// One record per scope_end. Layout MUST stay in sync with the device-side
+// writer in platform/src/aicpu/scope_stats_collector_aicpu.cpp.
+struct ScopeStatsRecord {
+    uint64_t site_file_addr;      // device-side const char *; for diagnostics the host
+                                  // only logs the raw pointer (string table lives in
+                                  // the orchestration .so, not in shared memory).
+                                  // AICPU also writes a basename copy into site_file_basename.
+    char site_file_basename[32];  // NUL-terminated basename of site_file, captured
+                                  // at append time so the host JSON contains a
+                                  // human-readable path without dereferencing a
+                                  // device pointer.
+    int32_t site_line;
+    int16_t depth;
+    int16_t _pad0;
+    uint64_t heap_bytes[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    int32_t task_in_flight[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    int32_t dep_used[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    int32_t fanin_used[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    int32_t tensormap_used;
+    int32_t _pad1;
+};
+
+struct ScopeStatsHeader {
+    uint64_t write_count;  // Total append count.
write_count > cap means the
+                           // ring wrapped; host reports `dropped = write_count - cap`
+                           // and emits `min(cap, write_count)` records starting
+                           // from `(write_count - kept) % cap`.
+    uint32_t cap;          // Fixed at PTO2_SCOPE_STATS_LOG_CAP; copied in by host
+                           // at init so device and host see the same value
+                           // without needing a separate sync.
+    uint32_t fatal_latched;  // AICPU sets to 1 on first fatal. Host uses this
+                             // to stamp the JSON `fatal` field — no separate
+                             // device→host channel needed.
+    // Per-ring capacities — registered by AICPU at runtime init through
+    // scope_stats_set_ring_capacity() / scope_stats_set_tensormap_capacity()
+    // (constant for the run). Host needs them to render the "used/cap" ratio
+    // in JSON without re-introducing a separate device→host query.
+    int32_t task_window_cap[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    uint64_t heap_cap[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    int32_t dep_cap[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    int32_t tensormap_cap;
+    int32_t _pad;
+};
+
+struct ScopeStatsBuffer {
+    ScopeStatsHeader header;
+    ScopeStatsRecord records[PTO2_SCOPE_STATS_LOG_CAP];
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // PLATFORM_COMMON_SCOPE_STATS_BUFFER_H_
diff --git a/src/a5/platform/include/host/scope_stats_dump.h b/src/a5/platform/include/host/scope_stats_dump.h
new file mode 100644
index 000000000..e0b9b17e7
--- /dev/null
+++ b/src/a5/platform/include/host/scope_stats_dump.h
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_A5_PLATFORM_INCLUDE_HOST_SCOPE_STATS_DUMP_H_ +#define SRC_A5_PLATFORM_INCLUDE_HOST_SCOPE_STATS_DUMP_H_ + +#include +#include +#include +#include +#include +#include + +#include "common/scope_stats_buffer.h" +#include "common/unified_log.h" +#include "host/profiling_common/profiler_base.h" +#include "host/profiling_copy.h" + +// Header-only host-side helper for scope_stats. Intentionally NOT modeled on +// L2PerfCollector / DepGenCollector — scope_stats is a single end-of-run +// snapshot (no streaming, no mgmt thread, no reconcile), so this whole +// feature collapses into one class with three short methods. Keeping every +// host-side scope_stats line in one file isolates the feature from the +// general device_runner flow: hooking the feature on adds three call sites +// (init / dump / finalize) and zero protocol details to device_runner. +// +// AICPU side is symmetric: layout in `scope_stats_buffer.h`, device probes / +// writer in `platform/src/aicpu/scope_stats_collector_aicpu.cpp`. The shared +// layout header is the only file both sides include. + +// Memory callbacks — thin aliases for the canonical profiling_common shapes +// (same pattern as dep_gen_collector / l2_perf_collector / pmu_collector / +// tensor_dump_collector). On a5 there is no halHostRegister; the host shadow +// + rtMemcpy DEVICE_TO_HOST pair is wired through ``BufferPoolManager``'s +// default_host_shadow_register + ops_.copy_from_device, so callers do not +// need to supply a copy callback themselves. 
+using ScopeStatsAllocCallback = profiling_common::ProfAllocCallback; +using ScopeStatsRegisterCallback = profiling_common::ProfRegisterCallback; +using ScopeStatsUnregisterCallback = profiling_common::ProfUnregisterCallback; +using ScopeStatsFreeCallback = profiling_common::ProfFreeCallback; + +// BufferPoolManager template stub. ScopeStatsHostBuffer drives the manager +// purely for its single-buffer alloc_and_register / copy_buffer_from_device / +// free_buffer methods (the canonical dev↔host shadow + rtMemcpy pair on a5); +// the streaming ready/done queue side is never touched, so the layout-trait +// aliases below are unused placeholders required only by the manager's static +// checks. +struct ScopeStatsModule { + using DataHeader = int; + using ReadyEntry = int; + using ReadyBufferInfo = int; + using FreeQueue = int; + static constexpr int kBufferKinds = 1; +}; + +class ScopeStatsHostBuffer { +public: + // Allocate the device-side buffer and set up the host-side view. Returns + // 0 on success; on failure the object stays uninitialized and every + // other method is a no-op, so callers can chain without guarding. + // + // Everything goes through profiling_common::BufferPoolManager so this + // collector's dev↔host shadow + rtMemcpy setup matches every pool-based + // collector on a5. ``register_cb`` defaults to ``default_host_shadow_register`` + // (the same fallback ProfilerBase::start installs) when callers leave it + // null; the manager allocs the device buffer, mallocs a host shadow, zeros + // it, and pushes the zeros to device. + int init( + const ScopeStatsAllocCallback &alloc_cb, ScopeStatsRegisterCallback register_cb, + const ScopeStatsFreeCallback &free_cb, int device_id + ) { + device_id_ = device_id; + const std::size_t bytes = sizeof(ScopeStatsBuffer); + + profiling_common::MemoryOps ops; + ops.alloc = alloc_cb; + ops.free_ = free_cb; + ops.reg = register_cb != nullptr ? 
register_cb : &profiling_common::default_host_shadow_register; + ops.copy_to_device = [](void *dev_dst, const void *host_src, std::size_t size) { + return profiling_copy_to_device(dev_dst, host_src, size); + }; + ops.copy_from_device = [](void *host_dst, const void *dev_src, std::size_t size) { + return profiling_copy_from_device(host_dst, dev_src, size); + }; + manager_.set_memory_context( + std::move(ops), /*shared_mem_dev=*/nullptr, /*shared_mem_host=*/nullptr, + /*shm_size=*/0, device_id + ); + dev_ptr_ = manager_.alloc_and_register(bytes, &host_ptr_); + if (dev_ptr_ == nullptr) { + return -1; + } + initialized_ = true; + return 0; + } + + bool is_initialized() const { return initialized_; } + + void *device_ptr() const { return dev_ptr_; } + + // Snapshot the shared region as JSON at /scope_stats.json. + // Assumes the device stream has already been synced (matches dep_gen / + // l2_perf export ordering), so AICPU writes are fully visible. + int dump(const std::string &output_dir) { + if (!initialized_ || host_ptr_ == nullptr) return 0; + // a5 — refresh host shadow before reading (no halHostRegister, so the + // shadow is stale until we rtMemcpy DEVICE_TO_HOST). + int rc = manager_.copy_buffer_from_device(host_ptr_, dev_ptr_, sizeof(ScopeStatsBuffer)); + if (rc != 0) { + LOG_ERROR("scope_stats: copy_from_device failed: %d", rc); + return rc; + } + const std::string path = make_path(output_dir); + const auto *buf = static_cast(host_ptr_); + return write_json(buf, path); + } + + // unregister_cb / free_cb are accepted for signature symmetry with a2a3 and + // with the other collectors' finalize hooks; the manager already owns the + // free path via the MemoryOps stashed at init() time, so they go unused on + // a5 (a5 has no halHostRegister, hence no unregister either). 
+    void finalize(ScopeStatsUnregisterCallback /*unregister_cb*/, const ScopeStatsFreeCallback & /*free_cb*/) {
+        if (!initialized_) return;
+        manager_.free_buffer(dev_ptr_);  // frees dev + paired host shadow
+        dev_ptr_ = nullptr;
+        host_ptr_ = nullptr;
+        initialized_ = false;
+    }
+
+private:
+    static std::string make_path(const std::string &output_dir) {
+        std::filesystem::path dir(output_dir);
+        std::error_code ec;
+        std::filesystem::create_directories(dir, ec);
+        if (ec) {
+            LOG_WARN("scope_stats: failed to create output dir %s: %s", output_dir.c_str(), ec.message().c_str());
+        }
+        return (dir / "scope_stats.json").string();
+    }
+
+    // Write `len` bytes of `s` as the inside of a JSON string, escaping
+    // quotes, backslashes and control characters so the file stays valid
+    // JSON whatever bytes the basename field holds.
+    static void write_json_escaped(std::FILE *fp, const char *s, std::size_t len) {
+        for (std::size_t i = 0; i < len; i++) {
+            const unsigned char c = static_cast<unsigned char>(s[i]);
+            if (c == '"' || c == '\\') {
+                std::fputc('\\', fp);
+                std::fputc(c, fp);
+            } else if (c < 0x20) {
+                std::fprintf(fp, "\\u%04x", static_cast<unsigned>(c));
+            } else {
+                std::fputc(c, fp);
+            }
+        }
+    }
+
+    // Schema (version 2) — flat, not Chrome-trace, because scope_stats is a
+    // list of per-scope_end snapshots, not a timeline. Each metric is rendered
+    // as a `"used/cap"` string so the JSON reads the same as the original
+    // `[ScopeStats]` log line:
+    //   { "version": 2, "fatal": bool,
+    //     "write_count": uint, "cap": uint, "dropped": uint,
+    //     "records": [
+    //       { "site": "file:line", "depth": int,
+    //         "task_window": ["used/cap", "used/cap", "used/cap", "used/cap"],
+    //         "heap": ["used/cap", "used/cap", "used/cap", "used/cap"],
+    //         "dep": ["used/cap", "used/cap", "used/cap", "used/cap"],
+    //         "fanin_used": [int, int, int, int],
+    //         "tensormap": "used/cap" },
+    //       ...
+    //     ] }
+    //
+    // Returns 0 on success and -1 when `buf` is null, the file cannot be
+    // opened, or a write/close error is reported by the stream.
+    static int write_json(const ScopeStatsBuffer *buf, const std::string &path) {
+        if (buf == nullptr) {
+            LOG_ERROR("scope_stats: null buffer");
+            return -1;
+        }
+        std::FILE *fp = std::fopen(path.c_str(), "w");
+        if (fp == nullptr) {
+            LOG_ERROR("scope_stats: failed to open %s", path.c_str());
+            return -1;
+        }
+
+        const std::uint64_t write_count = buf->header.write_count;
+        // records[] has exactly PTO2_SCOPE_STATS_LOG_CAP slots. header.cap is
+        // read back from device memory, so fall back to the compile-time
+        // capacity when it is unset (0) or larger than the array; otherwise
+        // the ring index below could drop every record or read out of bounds.
+        const std::uint32_t header_cap = buf->header.cap;
+        const bool header_cap_ok = header_cap != 0 && header_cap <= PTO2_SCOPE_STATS_LOG_CAP;
+        const std::uint32_t cap = header_cap_ok ? header_cap : PTO2_SCOPE_STATS_LOG_CAP;
+        if (!header_cap_ok && write_count != 0) {
+            LOG_WARN("scope_stats: header cap %u is invalid, using %u", header_cap, cap);
+        }
+        const std::uint64_t kept = write_count > cap ? cap : write_count;
+        const std::uint64_t dropped = write_count > cap ? (write_count - cap) : 0;
+        const std::uint64_t start = write_count - kept;
+
+        std::fprintf(fp, "{\n");
+        std::fprintf(fp, " \"version\": 2,\n");
+        std::fprintf(fp, " \"fatal\": %s,\n", buf->header.fatal_latched ? "true" : "false");
+        std::fprintf(fp, " \"write_count\": %" PRIu64 ",\n", write_count);
+        std::fprintf(fp, " \"cap\": %u,\n", cap);
+        std::fprintf(fp, " \"dropped\": %" PRIu64 ",\n", dropped);
+
+        std::fprintf(fp, " \"records\": [");
+        for (std::uint64_t i = 0; i < kept; i++) {
+            const ScopeStatsRecord &rec = buf->records[(start + i) % cap];
+            if (i) std::fputc(',', fp);
+            std::fprintf(fp, "\n {");
+            // Bound the print to the on-wire field size in case a future
+            // writer change drops the NUL terminator.
+            const std::size_t site_len = strnlen(rec.site_file_basename, sizeof(rec.site_file_basename));
+            std::fprintf(fp, "\"site\": \"");
+            write_json_escaped(fp, rec.site_file_basename, site_len);
+            std::fprintf(fp, ":%d\", ", rec.site_line);
+            std::fprintf(fp, "\"depth\": %d, ", rec.depth);
+            std::fprintf(fp, "\"task_window\": ");
+            write_i32_over_i32_array(
+                fp, rec.task_in_flight, buf->header.task_window_cap, PTO2_SCOPE_STATS_MAX_RING_DEPTH
+            );
+            std::fprintf(fp, ", \"heap\": ");
+            write_u64_over_u64_array(fp, rec.heap_bytes, buf->header.heap_cap, PTO2_SCOPE_STATS_MAX_RING_DEPTH);
+            std::fprintf(fp, ", \"dep\": ");
+            write_i32_over_i32_array(fp, rec.dep_used, buf->header.dep_cap, PTO2_SCOPE_STATS_MAX_RING_DEPTH);
+            std::fprintf(fp, ", \"fanin_used\": ");
+            write_i32_array(fp, rec.fanin_used, PTO2_SCOPE_STATS_MAX_RING_DEPTH);
+            std::fprintf(fp, ", \"tensormap\": \"%d/%d\"}", rec.tensormap_used, buf->header.tensormap_cap);
+        }
+        std::fprintf(fp, "\n ]\n}\n");
+        // Check the stream's error flag before closing, and always close so
+        // the FILE is released even when a write failed.
+        const bool write_failed = std::ferror(fp) != 0;
+        if (std::fclose(fp) != 0 || write_failed) {
+            LOG_ERROR("scope_stats: failed to write %s", path.c_str());
+            return -1;
+        }
+
+        LOG_INFO_V1("scope_stats: wrote %" PRIu64 " records (dropped=%" PRIu64 ") to %s", kept, dropped, path.c_str());
+        return 0;
+    }
+
+    static void write_i32_array(std::FILE *fp, const std::int32_t *arr, std::size_t n) {
+        std::fputc('[', fp);
+        for (std::size_t i = 0; i < n; i++) {
+            if (i) std::fputc(',',
fp); + std::fprintf(fp, "%d", arr[i]); + } + std::fputc(']', fp); + } + + static void + write_i32_over_i32_array(std::FILE *fp, const std::int32_t *used, const std::int32_t *cap, std::size_t n) { + std::fputc('[', fp); + for (std::size_t i = 0; i < n; i++) { + if (i) std::fputc(',', fp); + std::fprintf(fp, "\"%d/%d\"", used[i], cap[i]); + } + std::fputc(']', fp); + } + + static void + write_u64_over_u64_array(std::FILE *fp, const std::uint64_t *used, const std::uint64_t *cap, std::size_t n) { + std::fputc('[', fp); + for (std::size_t i = 0; i < n; i++) { + if (i) std::fputc(',', fp); + std::fprintf(fp, "\"%" PRIu64 "/%" PRIu64 "\"", used[i], cap[i]); + } + std::fputc(']', fp); + } + + bool initialized_ = false; + int device_id_ = -1; + void *dev_ptr_ = nullptr; + void *host_ptr_ = nullptr; + // Drives the dev↔host shadow + rtMemcpy DEVICE_TO_HOST pair on a5 so + // scope_stats's data-copy path matches every pool-based collector's. + profiling_common::BufferPoolManager manager_; +}; + +#endif // SRC_A5_PLATFORM_INCLUDE_HOST_SCOPE_STATS_DUMP_H_ diff --git a/src/a5/platform/onboard/aicpu/kernel.cpp b/src/a5/platform/onboard/aicpu/kernel.cpp index 64bf019b3..c9e16628d 100644 --- a/src/a5/platform/onboard/aicpu/kernel.cpp +++ b/src/a5/platform/onboard/aicpu/kernel.cpp @@ -19,6 +19,7 @@ #include "aicpu/platform_regs.h" #include "aicpu/platform_aicpu_affinity.h" #include "aicpu/pmu_collector_aicpu.h" +#include "aicpu/scope_stats_collector_aicpu.h" #include "aicpu/tensor_dump_aicpu.h" #include "runtime.h" @@ -106,6 +107,8 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *a set_l2_swimlane_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE)); set_platform_pmu_base(k_args->pmu_data_base); set_pmu_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_PMU)); + set_scope_stats_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_SCOPE_STATS)); + 
set_platform_scope_stats_base(k_args->scope_stats_data_base); // Affinity gate: drop excess threads before entering runtime if (!platform_aicpu_affinity_gate(runtime->aicpu_thread_num, PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH)) { diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 38242555d..1a72ac8f7 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -544,6 +544,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { if (enable_pmu_) { SET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_PMU); } + if (enable_scope_stats_) { + SET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_SCOPE_STATS); + } for (int i = 0; i < num_aicore; i++) { runtime.workers[i].aicpu_ready = 0; @@ -608,6 +611,14 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { } } + if (enable_scope_stats_) { + rc = init_scope_stats(device_id_); + if (rc != 0) { + LOG_ERROR("init_scope_stats failed: %d", rc); + return rc; + } + } + // Cleanup guard for early returns: stops all started collectors so // their mgmt + poll threads exit cleanly. stop() is idempotent and a // no-op on collectors that never started. @@ -743,6 +754,12 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { pmu_collector_.reconcile_counters(); } + if (enable_scope_stats_ && scope_stats_buf_.is_initialized()) { + // Stream sync has already completed; refresh the host shadow from + // device memory and write /scope_stats.json. 
+ scope_stats_buf_.dump(output_prefix_); + } + // Print handshake results (reads from device memory, must be before free) print_handshake_results(); @@ -1038,6 +1055,10 @@ int DeviceRunner::finalize() { if (pmu_collector_.is_initialized()) { pmu_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb); } + if (scope_stats_buf_.is_initialized()) { + scope_stats_buf_.finalize(/*unregister_cb=*/nullptr, prof_free_cb); + kernel_args_.args.scope_stats_data_base = 0; + } // Release per-Worker static arena (GM heap + PTO2 SM in a single backing // device allocation). Must precede mem_alloc_.finalize() so the arena @@ -1244,3 +1265,17 @@ int DeviceRunner::init_pmu( } return rc; } + +int DeviceRunner::init_scope_stats(int device_id) { + // a5: ScopeStatsHostBuffer drives BufferPoolManager which, via the default + // host_shadow_register, allocs the dev buffer + host shadow and pushes the + // zeroed shadow to device — so no explicit copy_from_device wire-up or + // pre-zero is needed here, and the JSON's `write_count` will read 0 even + // if AICPU never runs (kernel launch failure). 
+ int rc = scope_stats_buf_.init(prof_alloc_cb, /*register_cb=*/nullptr, prof_free_cb, device_id); + if (rc != 0) { + return rc; + } + kernel_args_.args.scope_stats_data_base = reinterpret_cast(scope_stats_buf_.device_ptr()); + return 0; +} diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index a07ab28bb..8869df7df 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -49,6 +49,7 @@ #include "host/memory_allocator.h" #include "host/l2_perf_collector.h" #include "host/pmu_collector.h" +#include "host/scope_stats_dump.h" #include "host/tensor_dump_collector.h" #include "load_aicpu_op.h" #include "runtime.h" @@ -300,6 +301,7 @@ class DeviceRunner { enable_pmu_ = (enable_pmu > 0); pmu_event_type_ = resolve_pmu_event_type(enable_pmu); } + void set_scope_stats_enabled(bool enable) { enable_scope_stats_ = enable; } // Directory under which all diagnostic artifacts (l2_perf_records.json / // tensor_dump/ / pmu.csv) land. Required (non-empty) when any diagnostic // is enabled; CallConfig::validate() enforces this contract upstream. @@ -699,11 +701,16 @@ class DeviceRunner { bool enable_l2_swimlane_{false}; bool enable_dump_tensor_{false}; bool enable_pmu_{false}; + bool enable_scope_stats_{false}; + // scope_stats: single end-of-run snapshot, no streaming. All host-side + // logic in ScopeStatsHostBuffer; this is the only hook device_runner needs. 
+ ScopeStatsHostBuffer scope_stats_buf_; L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED}; // resolved from set_l2_swimlane_enabled() PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION}; // resolved from set_pmu_enabled() std::string output_prefix_{}; // diagnostic artifact root directory int init_pmu(int num_cores, int num_threads, const std::string &csv_path, PmuEventType event_type, int device_id); + int init_scope_stats(int device_id); // Per-run collector teardown: stops mgmt + poll threads on every collector // whose init succeeded, in the only safe order (stop() joins mgmt before diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index 0cc17c81f..1114f647d 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -389,7 +389,7 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c int run_prepared( DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int /*enable_dep_gen*/, - const char *output_prefix, PtoRunTiming *out_timing + int enable_scope_stats, const char *output_prefix, PtoRunTiming *out_timing ) { if (out_timing != NULL) { out_timing->host_wall_ns = 0; @@ -455,6 +455,7 @@ int run_prepared( runner->set_l2_swimlane_enabled(enable_l2_swimlane); runner->set_dump_tensor_enabled(enable_dump_tensor != 0); runner->set_pmu_enabled(enable_pmu); + runner->set_scope_stats_enabled(enable_scope_stats != 0); runner->set_output_prefix(output_prefix); rc = runner->run(*r, block_dim, aicpu_thread_num); diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index c0d26fbe1..969667530 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -254,6 +254,20 @@ int 
DeviceRunner::ensure_binaries_loaded() { return -1; } + set_scope_stats_enabled_func_ = + reinterpret_cast(dlsym(aicpu_so_handle_, "set_scope_stats_enabled")); + if (set_scope_stats_enabled_func_ == nullptr) { + LOG_ERROR("dlsym failed for set_scope_stats_enabled: %s", dlerror()); + return -1; + } + + set_platform_scope_stats_base_func_ = + reinterpret_cast(dlsym(aicpu_so_handle_, "set_platform_scope_stats_base")); + if (set_platform_scope_stats_base_func_ == nullptr) { + LOG_ERROR("dlsym failed for set_platform_scope_stats_base: %s", dlerror()); + return -1; + } + // Log config travels via the RTLD_GLOBAL HostLogger singleton in // libsimpler_log.so — already seeded by simpler_log_init() before the // AICPU sim SO was dlopen'd, so no per-SO setter forwarding is needed. @@ -399,6 +413,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { if (enable_pmu_) { SET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_PMU); } + if (enable_scope_stats_) { + SET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_SCOPE_STATS); + } for (int i = 0; i < num_aicore; i++) { runtime.workers[i].aicpu_ready = 0; @@ -460,6 +477,14 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { } } + if (enable_scope_stats_) { + rc = init_scope_stats(); + if (rc != 0) { + LOG_ERROR("init_scope_stats failed: %d", rc); + return rc; + } + } + // Cleanup guard for early returns: stops all started collectors so // their mgmt + poll threads exit cleanly. stop() is idempotent and a // no-op on collectors that never started. 
@@ -507,7 +532,8 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { // Check if executors are loaded if (aicpu_execute_func_ == nullptr || aicore_execute_func_ == nullptr || set_platform_regs_func_ == nullptr || set_platform_dump_base_func_ == nullptr || set_dump_tensor_enabled_func_ == nullptr || - set_platform_pmu_base_func_ == nullptr || set_pmu_enabled_func_ == nullptr) { + set_platform_pmu_base_func_ == nullptr || set_pmu_enabled_func_ == nullptr || + set_scope_stats_enabled_func_ == nullptr || set_platform_scope_stats_base_func_ == nullptr) { LOG_ERROR("Executor functions not loaded. Call ensure_binaries_loaded first."); return -1; } @@ -519,6 +545,8 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { set_l2_swimlane_enabled_func_(enable_l2_swimlane_); set_platform_pmu_base_func_(kernel_args_.pmu_data_base); set_pmu_enabled_func_(enable_pmu_); + set_scope_stats_enabled_func_(enable_scope_stats_); + set_platform_scope_stats_base_func_(kernel_args_.scope_stats_data_base); // No per-SO log-config push: HostLogger lives in libsimpler_log.so // (RTLD_GLOBAL singleton) and the AICPU sim SO reads it directly via the @@ -634,6 +662,10 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { pmu_collector_.reconcile_counters(); } + if (enable_scope_stats_ && scope_stats_buf_.is_initialized()) { + scope_stats_buf_.dump(output_prefix_); + } + // Print handshake results at end of run print_handshake_results(); @@ -680,6 +712,8 @@ void DeviceRunner::unload_executor_binaries() { set_l2_swimlane_enabled_func_ = nullptr; set_platform_pmu_base_func_ = nullptr; set_pmu_enabled_func_ = nullptr; + set_scope_stats_enabled_func_ = nullptr; + set_platform_scope_stats_base_func_ = nullptr; aicpu_so_loaded_ = false; } if (!aicpu_so_path_.empty()) { @@ -890,6 +924,10 @@ int DeviceRunner::finalize() { if (pmu_collector_.is_initialized()) { pmu_collector_.finalize(/*unregister_cb=*/nullptr, 
prof_free_cb); } + if (scope_stats_buf_.is_initialized()) { + scope_stats_buf_.finalize(/*unregister_cb=*/nullptr, prof_free_cb); + kernel_args_.scope_stats_data_base = 0; + } // Release any chip callable buffers uploaded via upload_chip_callable_buffer. // Pool semantics mirror per-fid binaries: never freed until finalize. @@ -1106,3 +1144,16 @@ int DeviceRunner::init_pmu( } return rc; } + +int DeviceRunner::init_scope_stats() { + // a5 sim: ScopeStatsHostBuffer drives BufferPoolManager with the default + // host_shadow_register; sim's profiling_copy_* are plain memcpys, so the + // dev/host shadow path collapses to one allocation pair without any + // address-space tricks. + int rc = scope_stats_buf_.init(prof_alloc_cb, /*register_cb=*/nullptr, prof_free_cb, /*device_id=*/-1); + if (rc != 0) { + return rc; + } + kernel_args_.scope_stats_data_base = reinterpret_cast(scope_stats_buf_.device_ptr()); + return 0; +} diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h index 0aa6e6fa1..772c1663a 100644 --- a/src/a5/platform/sim/host/device_runner.h +++ b/src/a5/platform/sim/host/device_runner.h @@ -54,6 +54,7 @@ #include "host/memory_allocator.h" #include "host/l2_perf_collector.h" #include "host/pmu_collector.h" +#include "host/scope_stats_dump.h" #include "host/tensor_dump_collector.h" #include "runtime.h" @@ -182,6 +183,7 @@ class DeviceRunner { enable_pmu_ = (enable_pmu > 0); pmu_event_type_ = resolve_pmu_event_type(enable_pmu); } + void set_scope_stats_enabled(bool enable) { enable_scope_stats_ = enable; } // Directory under which all diagnostic artifacts (l2_perf_records.json / // tensor_dump/ / pmu.csv) land. Required (non-empty) when any diagnostic // is enabled; CallConfig::validate() enforces this contract upstream. 
@@ -361,6 +363,8 @@ class DeviceRunner { void (*set_platform_l2_perf_base_func_)(uint64_t){nullptr}; void (*set_l2_swimlane_enabled_func_)(bool){nullptr}; void (*set_pmu_enabled_func_)(bool){nullptr}; + void (*set_scope_stats_enabled_func_)(bool){nullptr}; + void (*set_platform_scope_stats_base_func_)(uint64_t){nullptr}; std::string aicpu_so_path_; std::string aicore_so_path_; @@ -372,6 +376,7 @@ class DeviceRunner { // PMU profiling (per-task AICore hardware counters) PmuCollector pmu_collector_; + ScopeStatsHostBuffer scope_stats_buf_; // Private helper methods — read aicpu_so_binary_ / aicore_kernel_binary_ // off the runner (populated by set_executors during simpler_init). @@ -416,11 +421,13 @@ class DeviceRunner { bool enable_l2_swimlane_{false}; bool enable_dump_tensor_{false}; bool enable_pmu_{false}; + bool enable_scope_stats_{false}; L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED}; // resolved from set_l2_swimlane_enabled() PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION}; // resolved from set_pmu_enabled() std::string output_prefix_{}; // diagnostic artifact root directory int init_pmu(int num_cores, int num_threads, const std::string &csv_path, PmuEventType event_type, int device_id); + int init_scope_stats(); // Per-run collector teardown: stops mgmt + poll threads on every collector // whose init succeeded. Idempotent. Mirrors the onboard helper. 
diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index 81e9b138f..9c08b30f7 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -300,7 +300,7 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c int run_prepared( DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int /*enable_dep_gen*/, - const char *output_prefix, PtoRunTiming *out_timing + int enable_scope_stats, const char *output_prefix, PtoRunTiming *out_timing ) { if (out_timing != NULL) { out_timing->host_wall_ns = 0; @@ -353,6 +353,7 @@ int run_prepared( runner->set_l2_swimlane_enabled(enable_l2_swimlane); runner->set_dump_tensor_enabled(enable_dump_tensor != 0); runner->set_pmu_enabled(enable_pmu); + runner->set_scope_stats_enabled(enable_scope_stats != 0); runner->set_output_prefix(output_prefix); rc = runner->run(*r, block_dim, aicpu_thread_num); diff --git a/src/a5/platform/src/aicpu/scope_stats_collector_aicpu.cpp b/src/a5/platform/src/aicpu/scope_stats_collector_aicpu.cpp new file mode 100644 index 000000000..a00ac3988 --- /dev/null +++ b/src/a5/platform/src/aicpu/scope_stats_collector_aicpu.cpp @@ -0,0 +1,208 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +// Platform-layer scope_stats collector. +// +// Owns all collector state (depth, peak arrays, shared buffer) and exposes +// pure-value APIs for runtime to report resource usage. No runtime-specific +// types cross the boundary. + +#include "aicpu/scope_stats_collector_aicpu.h" + +#include + +#include "common/scope_stats_buffer.h" + +// --------------------------------------------------------------------------- +// Collector state +// --------------------------------------------------------------------------- + +int32_t scope_stats_depth = -1; +bool scope_stats_enabled = false; + +static uint64_t scope_stats_peak_heap_bytes[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH][PTO2_SCOPE_STATS_MAX_RING_DEPTH] = {}; +static int32_t scope_stats_peak_task_in_flight[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH][PTO2_SCOPE_STATS_MAX_RING_DEPTH] = {}; +static int32_t scope_stats_peak_fanin_used[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH][PTO2_SCOPE_STATS_MAX_RING_DEPTH] = {}; +static int32_t scope_stats_peak_dep_used[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH][PTO2_SCOPE_STATS_MAX_RING_DEPTH] = {}; +static int32_t scope_stats_peak_tensormap_used[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH] = {}; +static ScopeStatsBuffer *scope_stats_shared_buf = nullptr; + +namespace { + +ScopeStatsInitialSampleFn g_initial_sample_fn = nullptr; + +const char *s_pending_site_file = nullptr; +int32_t s_pending_site_line = 0; + +const char *s_scope_site_file[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH] = {}; +int32_t s_scope_site_line[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH] = {}; + +inline const char *basename_of(const char *path) { + if (!path) return "(unknown)"; + const char *base = path; + for (const char *p = path; *p; ++p) { + if (*p == '/' || *p == '\\') base = p + 1; + } + return base; +} + +inline void copy_basename(char (&dst)[32], const char *src) { + 
const char *base = basename_of(src); + size_t i = 0; + for (; i + 1 < sizeof(dst) && base[i]; i++) + dst[i] = base[i]; + dst[i] = '\0'; +} + +} // namespace + +// --------------------------------------------------------------------------- +// Setter symbols — always exported, unconditionally compiled +// --------------------------------------------------------------------------- + +extern "C" void set_scope_stats_enabled(bool enable) { scope_stats_enabled = enable; } + +extern "C" void set_platform_scope_stats_base(uint64_t scope_stats_data_base) { + scope_stats_shared_buf = reinterpret_cast(scope_stats_data_base); + // Reset collector-local statics so a prior run that crashed mid-scope (or + // reused the same AICPU .so process) can't leak stale depth/peak data into + // the new run's records. + scope_stats_depth = -1; + s_pending_site_file = nullptr; + s_pending_site_line = 0; + memset(scope_stats_peak_heap_bytes, 0, sizeof(scope_stats_peak_heap_bytes)); + memset(scope_stats_peak_task_in_flight, 0, sizeof(scope_stats_peak_task_in_flight)); + memset(scope_stats_peak_fanin_used, 0, sizeof(scope_stats_peak_fanin_used)); + memset(scope_stats_peak_dep_used, 0, sizeof(scope_stats_peak_dep_used)); + memset(scope_stats_peak_tensormap_used, 0, sizeof(scope_stats_peak_tensormap_used)); + memset(s_scope_site_file, 0, sizeof(s_scope_site_file)); + memset(s_scope_site_line, 0, sizeof(s_scope_site_line)); + if (scope_stats_shared_buf) { + memset(&scope_stats_shared_buf->header, 0, sizeof(scope_stats_shared_buf->header)); + scope_stats_shared_buf->header.cap = PTO2_SCOPE_STATS_LOG_CAP; + } +} + +extern "C" void scope_stats_register_initial_sampler(ScopeStatsInitialSampleFn fn) { g_initial_sample_fn = fn; } + +// --------------------------------------------------------------------------- +// Scope lifecycle probes +// --------------------------------------------------------------------------- + +extern "C" void scope_stats_set_pending_site(const char *file, int line) { + 
s_pending_site_file = file; + s_pending_site_line = line; +} + +// Count of currently open scopes that were never pushed because the depth +// stack was already full. scope_stats_on_end() consumes one unit per such +// scope so begin/end stay paired. Non-zero only while scope_stats_depth sits +// at PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH - 1. +static int32_t scope_stats_overflow_depth = 0; + +// Open a scope: push one depth level, bind the pending call-site to it, then +// hand the new level to the registered initial sampler if any, otherwise zero +// its peaks. Scopes nested beyond PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH are not +// tracked; they are only counted so the matching scope_stats_on_end() does not +// pop (and mis-attribute) an enclosing scope's record. +extern "C" void scope_stats_on_begin() { + if (!scope_stats_enabled) return; + if (scope_stats_depth + 1 >= PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH) { + ++scope_stats_overflow_depth; + return; + } + // Below the depth limit no untracked scope can be open, so a non-zero count + // here is stale state from an unbalanced earlier run; drop it. + scope_stats_overflow_depth = 0; + ++scope_stats_depth; + s_scope_site_file[scope_stats_depth] = s_pending_site_file; + s_scope_site_line[scope_stats_depth] = s_pending_site_line; + s_pending_site_file = nullptr; + s_pending_site_line = 0; + if (g_initial_sample_fn) { + g_initial_sample_fn(scope_stats_depth); + } else { + for (int r = 0; r < PTO2_SCOPE_STATS_MAX_RING_DEPTH; r++) { + scope_stats_peak_heap_bytes[scope_stats_depth][r] = 0; + scope_stats_peak_task_in_flight[scope_stats_depth][r] = 0; + scope_stats_peak_fanin_used[scope_stats_depth][r] = 0; + scope_stats_peak_dep_used[scope_stats_depth][r] = 0; + } + scope_stats_peak_tensormap_used[scope_stats_depth] = 0; + } +} + +// Close the innermost tracked scope: write its peaks as one record into the +// shared buffer (when one is attached), clear that depth level, and pop it. +extern "C" void scope_stats_on_end() { + if (!scope_stats_enabled) return; + if (scope_stats_overflow_depth > 0) { + // This end pairs with a begin that was skipped at the depth limit. + --scope_stats_overflow_depth; + return; + } + if (scope_stats_depth < 0) return; + if (scope_stats_shared_buf) { + uint64_t idx = scope_stats_shared_buf->header.write_count % PTO2_SCOPE_STATS_LOG_CAP; + ScopeStatsRecord &rec = scope_stats_shared_buf->records[idx]; + rec.site_file_addr = reinterpret_cast(s_scope_site_file[scope_stats_depth]); + copy_basename(rec.site_file_basename, s_scope_site_file[scope_stats_depth]); + rec.site_line = s_scope_site_line[scope_stats_depth]; + rec.depth = static_cast(scope_stats_depth); + for (int r = 0; r < PTO2_SCOPE_STATS_MAX_RING_DEPTH; r++) { + rec.heap_bytes[r] = scope_stats_peak_heap_bytes[scope_stats_depth][r]; + rec.task_in_flight[r] = scope_stats_peak_task_in_flight[scope_stats_depth][r]; + rec.dep_used[r] = scope_stats_peak_dep_used[scope_stats_depth][r]; + rec.fanin_used[r] = scope_stats_peak_fanin_used[scope_stats_depth][r]; + } + rec.tensormap_used = scope_stats_peak_tensormap_used[scope_stats_depth]; + ++scope_stats_shared_buf->header.write_count; + } + for (int r = 0; r < PTO2_SCOPE_STATS_MAX_RING_DEPTH; r++) { 
+ scope_stats_peak_heap_bytes[scope_stats_depth][r] = 0; + scope_stats_peak_task_in_flight[scope_stats_depth][r] = 0; + scope_stats_peak_fanin_used[scope_stats_depth][r] = 0; + scope_stats_peak_dep_used[scope_stats_depth][r] = 0; + } + scope_stats_peak_tensormap_used[scope_stats_depth] = 0; + s_scope_site_file[scope_stats_depth] = nullptr; + s_scope_site_line[scope_stats_depth] = 0; + --scope_stats_depth; +} + +extern "C" void scope_stats_on_fatal() { + if (!scope_stats_enabled) return; + if (!scope_stats_shared_buf) return; + scope_stats_shared_buf->header.fatal_latched = 1; +} + +// --------------------------------------------------------------------------- +// Pure-value peak update APIs — called by runtime at instrumentation points +// --------------------------------------------------------------------------- + +extern "C" void scope_stats_update_allocator_peaks(int ring_id, uint64_t heap_bytes, int32_t tasks_in_flight) { + if (!scope_stats_enabled || scope_stats_depth < 0) return; + if (ring_id < 0 || ring_id >= PTO2_SCOPE_STATS_MAX_RING_DEPTH) return; + for (int d = 0; d <= scope_stats_depth; d++) { + if (heap_bytes > scope_stats_peak_heap_bytes[d][ring_id]) scope_stats_peak_heap_bytes[d][ring_id] = heap_bytes; + if (tasks_in_flight > scope_stats_peak_task_in_flight[d][ring_id]) + scope_stats_peak_task_in_flight[d][ring_id] = tasks_in_flight; + } +} + +extern "C" void scope_stats_update_tensormap_peak(int32_t tensormap_used) { + if (!scope_stats_enabled || scope_stats_depth < 0) return; + for (int d = 0; d <= scope_stats_depth; d++) { + if (tensormap_used > scope_stats_peak_tensormap_used[d]) scope_stats_peak_tensormap_used[d] = tensormap_used; + } +} + +extern "C" void scope_stats_update_pool_peaks(int ring_id, int32_t fanin_used, int32_t dep_used) { + if (!scope_stats_enabled || scope_stats_depth < 0) return; + if (ring_id < 0 || ring_id >= PTO2_SCOPE_STATS_MAX_RING_DEPTH) return; + for (int d = 0; d <= scope_stats_depth; d++) { + if (fanin_used > 
scope_stats_peak_fanin_used[d][ring_id]) scope_stats_peak_fanin_used[d][ring_id] = fanin_used; + if (dep_used > scope_stats_peak_dep_used[d][ring_id]) scope_stats_peak_dep_used[d][ring_id] = dep_used; + } +} + +// --------------------------------------------------------------------------- +// Capacity registration — called by runtime at init +// --------------------------------------------------------------------------- + +extern "C" void scope_stats_set_ring_capacity(int ring_id, int32_t window_cap, uint64_t heap_cap, int32_t dep_cap) { + if (!scope_stats_shared_buf) return; + if (ring_id < 0 || ring_id >= PTO2_SCOPE_STATS_MAX_RING_DEPTH) return; + scope_stats_shared_buf->header.task_window_cap[ring_id] = window_cap; + scope_stats_shared_buf->header.heap_cap[ring_id] = heap_cap; + scope_stats_shared_buf->header.dep_cap[ring_id] = dep_cap; +} + +extern "C" void scope_stats_set_tensormap_capacity(int32_t cap) { + if (!scope_stats_shared_buf) return; + scope_stats_shared_buf->header.tensormap_cap = cap; +} diff --git a/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h index f71766618..cf448bba1 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h @@ -123,6 +123,11 @@ typedef struct PTO2RuntimeOps { ); TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args); TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args); + + // Stash the call-site of the next PTO2ScopeGuard so the [ScopeStats] + // collector can log it. Always present to keep ops-table layout stable + // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0. 
void (*scope_set_site)(const char *file, int line); } PTO2RuntimeOps; /** @@ -361,10 +366,13 @@ static inline void set_tensor_data(const Tensor &tensor, uint32_t ndims, const u */ class PTO2ScopeGuard { public: - explicit PTO2ScopeGuard(PTO2ScopeMode mode = PTO2ScopeMode::AUTO) : + explicit PTO2ScopeGuard( + PTO2ScopeMode mode = PTO2ScopeMode::AUTO, const char *file = __builtin_FILE(), int line = __builtin_LINE() + ) : rt_(current_runtime()) { if (!rt_->ops->is_fatal(rt_)) { rt_->pending_scope_mode = mode; + if (rt_->ops->scope_set_site) rt_->ops->scope_set_site(file, line); rt_->ops->scope_begin(rt_); } } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 05ac105a8..36555a40b 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -37,6 +37,10 @@ extern "C" void set_dump_tensor_selective_mode(bool enable); extern "C" void set_dump_tensor_task_mask(uint64_t task_id, uint64_t mask); +#if PTO2_PROFILING +#include "aicpu/scope_stats_collector_aicpu.h" +#endif + // ============================================================================= // Orchestrator Profiling (compile-time toggle) // ============================================================================= @@ -146,6 +150,12 @@ static int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code) static void orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *func, const char *fmt, va_list args) { int32_t latched_code = orch_mark_fatal(orch, error_code); +#if PTO2_PROFILING + // Latch the fatal flag in the scope_stats header so the end-of-run JSON + // reports "fatal": true. Only sets a flag (nothing is flushed or logged + // here), so it is safe to call from every cascaded report_fatal. 
+ scope_stats_on_fatal(); +#endif if (fmt == nullptr || fmt[0] == '\0') { if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) { @@ -232,6 +242,11 @@ static bool append_fanin_or_fail( } entry->slot_state = prod_state; fanin_builder->count++; +#if PTO2_PROFILING + scope_stats_update_pool_peaks( + ring_id, fanin_pool.used(), orch->scheduler ? orch->scheduler->ring_sched_states[ring_id].dep_pool.used() : 0 + ); +#endif return true; } @@ -321,6 +336,10 @@ static bool prepare_task( orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK); return false; } +#if PTO2_PROFILING + scope_stats_update_allocator_peaks(ring_id, allocator.heap_used_bytes(), allocator.active_count()); + scope_stats_update_tensormap_peak(orch->tensor_map.current_used()); +#endif out->task_id = PTO2TaskId::make(ring_id, static_cast(out->alloc_result.task_id)); out->slot_state = &orch->sm_header->rings[ring_id].get_slot_state_by_slot(out->alloc_result.slot); @@ -415,6 +434,14 @@ bool PTO2OrchestratorState::init_from_layout( orch->scope_stack_capacity = layout.scope_stack_capacity; orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH; +#if PTO2_PROFILING + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &alloc = orch->rings[r].task_allocator; + scope_stats_set_ring_capacity(r, alloc.window_size(), alloc.heap_capacity(), 0); + } + scope_stats_set_tensormap_capacity(orch->tensor_map.pool_capacity()); +#endif + return true; } @@ -428,7 +455,17 @@ void PTO2OrchestratorState::destroy() { orch->scope_begins = nullptr; } -void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; } +void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { + this->scheduler = scheduler; +#if PTO2_PROFILING + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + int32_t dep_cap = scheduler ? 
scheduler->ring_sched_states[r].dep_pool.capacity : 0; + scope_stats_set_ring_capacity( + r, rings[r].task_allocator.window_size(), rings[r].task_allocator.heap_capacity(), dep_cap + ); + } +#endif +} // ============================================================================= // Scope Management @@ -467,6 +504,18 @@ void PTO2OrchestratorState::begin_scope(PTO2ScopeMode mode) { if (mode == PTO2ScopeMode::MANUAL && !already_in_manual_scope) { orch->manual_begin_depth = orch->scope_stack_top; } +#if PTO2_PROFILING + scope_stats_on_begin(); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &alloc = orch->rings[r].task_allocator; + scope_stats_update_allocator_peaks(r, alloc.heap_used_bytes(), alloc.active_count()); + scope_stats_update_pool_peaks( + r, orch->rings[r].fanin_pool.used(), + orch->scheduler ? orch->scheduler->ring_sched_states[r].dep_pool.used() : 0 + ); + } + scope_stats_update_tensormap_peak(orch->tensor_map.current_used()); +#endif } void PTO2OrchestratorState::end_scope() { @@ -476,6 +525,14 @@ void PTO2OrchestratorState::end_scope() { } assert(orch->scope_stack_top >= 0 && "Scope stack underflow"); + // Snapshot peak intra-scope queue fill BEFORE the orchestrator drains + // pending tasks via scheduler->on_scope_end. The user is measuring how + // much ring/heap the work submitted inside this scope holds at its peak, + // not the residual after teardown. 
+#if PTO2_PROFILING + scope_stats_on_end(); +#endif + #if PTO2_ORCH_PROFILING uint64_t _se0 = get_sys_cnt_aicpu(); #endif diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index 5a3e3d3d3..2d6493595 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -181,6 +181,11 @@ class PTO2TaskAllocator { uint64_t heap_top() const { return heap_top_; } uint64_t heap_capacity() const { return heap_size_; } + uint64_t heap_used_bytes() const { + if (heap_size_ == 0) return 0; + return (heap_top_ + heap_size_ - heap_tail_) % heap_size_; + } + private: // --- Task Ring --- PTO2TaskDescriptor *descriptors_ = nullptr; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index c801d5c15..122611e3f 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -28,6 +28,9 @@ #include "aicpu/device_time.h" #include "common/unified_log.h" +#if PTO2_PROFILING +#include "aicpu/scope_stats_collector_aicpu.h" +#endif // Weak fallback for HOST .so builds (never called, but satisfies linker). // The AICPU build links the strong symbol from platform/.../device_time.cpp. @@ -231,6 +234,14 @@ void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, cons memcpy(ptr, &value, elem_size); } +// Ops-table entry that hands the call-site captured by PTO2ScopeGuard to the +// [ScopeStats] collector. The slot is always present in the struct to keep +// the layout stable; at PTO2_PROFILING=0 we fill nullptr so the orchestration +// .so's null-check skips it. 
+#if PTO2_PROFILING +static void scope_set_site_impl(const char *file, int line) { scope_stats_set_pending_site(file, line); } +#endif + static const PTO2RuntimeOps s_runtime_ops = { .submit_task = submit_task_impl, .scope_begin = rt_scope_begin, @@ -246,6 +257,11 @@ static const PTO2RuntimeOps s_runtime_ops = { .set_tensor_data = set_tensor_data, .alloc_tensors = alloc_tensors_impl, .submit_dummy_task = submit_dummy_task_impl, +#if PTO2_PROFILING + .scope_set_site = scope_set_site_impl, +#else + .scope_set_site = nullptr, +#endif }; // ============================================================================= diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index 1da622407..51bb95248 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -89,6 +89,11 @@ struct PTO2RuntimeOps { ); TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args); TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args); + + // Stash the call-site captured by PTO2ScopeGuard into the [ScopeStats] + // collector. Always present to keep ops-table layout stable across + // PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0. + void (*scope_set_site)(const char *file, int line); }; /** diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h index 39d6e4ad2..2e13fdbb3 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h @@ -373,6 +373,13 @@ struct PTO2TensorMap { return task_local_id & (task_window_sizes[ring_id] - 1); } + // Accessors read by scope_stats_collector. 
Declared unconditionally so the + // collector .cpp compiles at PTO2_PROFILING=0 (collector is unconditional — + // setter symbols must export for host dlsym; the probe call sites that use + // these accessors stay gated by PTO2_PROFILING). + int32_t current_used() const { return next_entry_idx - free_num; } + int32_t pool_capacity() const { return pool_size; } + // new_entry only allocates memory, does not assign attributes PTO2TensorMapEntry *new_entry() { if (free_num > 0) { diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h index 32887d0be..2aff666c1 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h @@ -49,6 +49,10 @@ } while (0) #endif +#if PTO2_PROFILING +#include "aicpu/scope_stats_collector_aicpu.h" +#endif + // ============================================================================= // Ready Queue (Lock-free bounded MPMC — Vyukov design) // ============================================================================= @@ -720,6 +724,9 @@ struct PTO2SchedulerState { early_finished++; } else { producer->fanout_head = rss.dep_pool.prepend(producer->fanout_head, ws); +#if PTO2_PROFILING + scope_stats_update_pool_peaks(ws->ring_id, 0, rss.dep_pool.used()); +#endif } producer->unlock_fanout(); }); diff --git a/src/common/task_interface/call_config.h b/src/common/task_interface/call_config.h index b8afbd32b..58ca0076b 100644 --- a/src/common/task_interface/call_config.h +++ b/src/common/task_interface/call_config.h @@ -11,9 +11,13 @@ /** * CallConfig — per-NEXT_LEVEL-task config. 
Carries execution knobs - * (block_dim, aicpu_thread_num) plus the four parallel diagnostics + * (block_dim, aicpu_thread_num) plus the five parallel diagnostics * sub-features under the profiling umbrella: `enable_l2_swimlane` (swimlane), - * `enable_dump_tensor`, `enable_pmu`, and `enable_dep_gen`. + * `enable_dump_tensor`, `enable_pmu`, `enable_dep_gen`, and + * `enable_scope_stats`. All five require `output_prefix` because they each + * write a sibling artifact into that directory + * (`l2_perf_records.json` / `tensor_dump/` / `pmu.csv` / `deps.json` / + * `scope_stats.json`). * * `block_dim == 0` is a sentinel for "auto" — DeviceRunner resolves it at * run() time to the max block_dim the AICore stream allows @@ -32,9 +36,9 @@ * * `output_prefix` is a NUL-terminated directory path under which all * diagnostic artifacts (l2_perf_records.json / tensor_dump/ / pmu.csv / - * submit_trace.bin) are written. The caller is responsible for filling it - * whenever any diagnostic flag is enabled — `validate()` enforces this - * contract at every submit/run entry point so the runtime never has to + * deps.json / scope_stats.json) are written. The caller is responsible for + * filling it whenever any diagnostic flag is enabled — `validate()` enforces + * this contract at every submit/run entry point so the runtime never has to * invent a path. 
*/ @@ -52,10 +56,12 @@ struct CallConfig { int32_t enable_dump_tensor = 0; int32_t enable_pmu = 0; // 0 = disabled; >0 = enabled, value selects event type int32_t enable_dep_gen = 0; + int32_t enable_scope_stats = 0; // writes scope_stats.json under output_prefix char output_prefix[1024] = {}; bool diagnostics_any() const noexcept { - return enable_l2_swimlane != 0 || enable_dump_tensor != 0 || enable_pmu != 0 || enable_dep_gen != 0; + return enable_l2_swimlane != 0 || enable_dump_tensor != 0 || enable_pmu != 0 || enable_dep_gen != 0 || + enable_scope_stats != 0; } bool output_prefix_set() const noexcept { return output_prefix[0] != '\0'; } @@ -67,10 +73,11 @@ struct CallConfig { if (diagnostics_any() && !output_prefix_set()) { throw std::invalid_argument( "CallConfig: output_prefix must be set whenever any of " - "enable_l2_swimlane / enable_dump_tensor / enable_pmu / enable_dep_gen is enabled" + "enable_l2_swimlane / enable_dump_tensor / enable_pmu / enable_dep_gen / " + "enable_scope_stats is enabled" ); } } }; #pragma pack(pop) -static_assert(sizeof(CallConfig) == 6 * sizeof(int32_t) + 1024, "CallConfig wire layout drift"); +static_assert(sizeof(CallConfig) == 7 * sizeof(int32_t) + 1024, "CallConfig wire layout drift"); diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index 2ee392ab1..a3dd90eb7 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -323,7 +323,8 @@ RunTiming ChipWorker::run(int32_t callable_id, const ChipStorageTaskArgs *args, PtoRunTiming timing{0, 0}; int rc = run_prepared_fn_( device_ctx_, rt, callable_id, args, config.block_dim, config.aicpu_thread_num, config.enable_l2_swimlane, - config.enable_dump_tensor, config.enable_pmu, config.enable_dep_gen, config.output_prefix, &timing + config.enable_dump_tensor, config.enable_pmu, config.enable_dep_gen, config.enable_scope_stats, + config.output_prefix, &timing ); if (rc != 0) { throw std::runtime_error("run_prepared failed with code " + 
std::to_string(rc)); diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h index e1632eb2a..e4fb958c5 100644 --- a/src/common/worker/chip_worker.h +++ b/src/common/worker/chip_worker.h @@ -143,7 +143,7 @@ class ChipWorker { int (*)(void *, int, const uint8_t *, size_t, const uint8_t *, size_t, const uint8_t *, size_t); using PrepareCallableFn = int (*)(void *, int32_t, const void *); using RunPreparedFn = - int (*)(void *, void *, int32_t, const void *, int, int, int, int, int, int, const char *, PtoRunTiming *); + int (*)(void *, void *, int32_t, const void *, int, int, int, int, int, int, int, const char *, PtoRunTiming *); using UnregisterCallableFn = int (*)(void *, int32_t); using GetAicpuDlopenCountFn = size_t (*)(void *); using FinalizeDeviceFn = int (*)(void *); diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h index c4f6b7adf..e3fa4eb68 100644 --- a/src/common/worker/pto_runtime_c_api.h +++ b/src/common/worker/pto_runtime_c_api.h @@ -194,7 +194,7 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c int run_prepared( DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim, int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen, - const char *output_prefix, PtoRunTiming *out_timing + int enable_scope_stats, const char *output_prefix, PtoRunTiming *out_timing ); /** diff --git a/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py index 9a43d54ec..39cfd778c 100644 --- a/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py +++ b/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py @@ -96,7 +96,7 @@ def compute_golden(self, args, params): a, b = args.a, args.b args.f[:] = (a + b + 1) * (a + b + 2) - def _run_and_validate_l2( + def 
_run_and_validate_l2( # noqa: PLR0913 self, worker, callable_obj, @@ -107,6 +107,7 @@ def _run_and_validate_l2( enable_dump_tensor=False, enable_pmu=0, enable_dep_gen=False, + enable_scope_stats=False, output_prefix="", ): params = case.get("params", {}) diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/scope_stats/test_scope_stats.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/scope_stats/test_scope_stats.py new file mode 100644 index 000000000..9756f90fd --- /dev/null +++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/scope_stats/test_scope_stats.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""scope_stats smoke — capture pipeline produces a usable ``scope_stats.json``. + +Re-uses ``vector_example`` (outer executor scope + one inner ``PTO2_SCOPE()``). +With ``--enable-scope-stats`` the platform collector +(``scope_stats_collector_aicpu.h``) appends one record per scope_end into +the host-allocated buffer, and the host dumps it as JSON. Enabling the +flag is the entire user surface for the new API — the runtime takes care +of the ``set_pending_site`` / ``on_begin`` / ``update_*_peaks`` / ``on_end`` +calls. Schema lives in ``docs/dfx/scope-stats.md`` §5. 
+""" + +import json + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.scene_test import _outputs_dir, _sanitize_for_filename + +KERNELS_BASE = "../../../../../../examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels" +_REQUIRED_RECORD_FIELDS = {"site", "depth", "task_window", "heap", "dep", "fanin_used", "tensormap"} + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestScopeStats(SceneTestCase): + """Vector example with --enable-scope-stats, then assert scope_stats.json.""" + + CALLABLE = { + "orchestration": { + "source": f"{KERNELS_BASE}/orchestration/example_orchestration.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": f"{KERNELS_BASE}/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": f"{KERNELS_BASE}/aiv/kernel_add_scalar.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT], + }, + { + "func_id": 2, + "source": f"{KERNELS_BASE}/aiv/kernel_mul.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + ], + } + + CASES = [ + { + "name": "default", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 3}, + "params": {}, + }, + ] + + def generate_args(self, params): + SIZE = 128 * 128 + return TaskArgsBuilder( + Tensor("a", torch.full((SIZE,), 2.0, dtype=torch.float32)), + Tensor("b", torch.full((SIZE,), 3.0, dtype=torch.float32)), + Tensor("f", torch.zeros(SIZE, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b) + + def test_run(self, st_platform, st_worker, request): + super().test_run(st_platform, st_worker, request) + if not request.config.getoption("--enable-scope-stats", default=False): + return + for case in 
self.CASES: + if st_platform in case["platforms"]: + self._validate_scope_stats_artifact(case) + + def _validate_scope_stats_artifact(self, case): + safe_label = _sanitize_for_filename(f"TestScopeStats_{case['name']}") + matches = sorted(_outputs_dir().glob(f"{safe_label}_*"), key=lambda p: p.stat().st_mtime) + assert matches, ( + f"no output directory under {_outputs_dir()} matching {safe_label}_* — " + f"--enable-scope-stats was on but the run produced no per-case output dir" + ) + path = matches[-1] / "scope_stats.json" + assert path.exists(), f"scope_stats.json missing under {matches[-1]} — collector finalize failed?" + data = json.loads(path.read_text()) + assert data.get("version") == 2, f"unexpected schema version: {data!r}" + assert data.get("fatal") is False, f"run latched fatal: {data!r}" + records = data.get("records", []) + # vector_example has outer (executor) + inner PTO2_SCOPE → ≥2 records. + assert len(records) >= 2, f"expected ≥2 records (inner + outer), got {records!r}" + for rec in records: + assert _REQUIRED_RECORD_FIELDS <= rec.keys(), f"record missing fields: {rec!r}" + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py index b40ffcde4..fbd0e6d09 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py +++ b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py @@ -97,7 +97,7 @@ def generate_args(self, params): def compute_golden(self, args, params): args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b) - def _run_and_validate_l2( + def _run_and_validate_l2( # noqa: PLR0913 self, worker, callable_obj, @@ -108,6 +108,7 @@ def _run_and_validate_l2( enable_dump_tensor=False, enable_pmu=0, enable_dep_gen=False, + enable_scope_stats=False, output_prefix="", ): 
params = case.get("params", {}) diff --git a/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py b/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py index 2a0158dc7..1b0810714 100644 --- a/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py +++ b/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py @@ -85,7 +85,7 @@ def compute_golden(self, args, params): # dump_tensor orchestration computes f = (a + b) + 1 args.f[:] = (args.a + args.b) + 1 - def _run_and_validate_l2( + def _run_and_validate_l2( # noqa: PLR0913 self, worker, callable_obj, @@ -96,6 +96,7 @@ def _run_and_validate_l2( enable_dump_tensor=False, enable_pmu=0, enable_dep_gen=False, + enable_scope_stats=False, output_prefix="", ): params = case.get("params", {}) diff --git a/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py index dbede6ac7..a2bf0b2a1 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py +++ b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py @@ -98,7 +98,7 @@ def generate_args(self, params): def compute_golden(self, args, params): args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b) - def _run_and_validate_l2( + def _run_and_validate_l2( # noqa: PLR0913 self, worker, callable_obj, @@ -109,6 +109,7 @@ def _run_and_validate_l2( enable_dump_tensor=False, enable_pmu=0, enable_dep_gen=False, + enable_scope_stats=False, output_prefix="", ): params = case.get("params", {}) diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index 89314d800..374179754 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -90,6 +90,7 @@ add_library(a2a3_rt_objs OBJECT ${A2A3_RUNTIME_DIR}/shared/pto_shared_memory.cpp ${A2A3_RUNTIME_DIR}/scheduler/pto_scheduler.cpp 
${A2A3_RUNTIME_DIR}/shared/pto_tensormap.cpp + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/src/aicpu/scope_stats_collector_aicpu.cpp ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp ) target_include_directories(a2a3_rt_objs PUBLIC