hw-native-sys · doraemonmj · May 27, 2026 · ChaoZheng109 · May 27, 2026 · doraemonmj
diff --git a/conftest.py b/conftest.py
@@ -163,6 +163,12 @@ def pytest_addoption(parser):
         help="Enable PMU collection. Bare flag = PIPE_UTILIZATION(2). "
         "Pass event type to override (e.g. --enable-pmu 4)",
     )
+    parser.addoption(
+        "--enable-scope-stats",
+        action="store_true",
+        default=False,
+        help="Enable per-scope peak collection and emit <output_prefix>/scope_stats.json (per-scope ring-fill peaks).",
+    )
     parser.addoption("--build", action="store_true", default=False, help="Compile runtime from source")
     parser.addoption(
         "--pto-isa-commit",

diff --git a/docs/dfx/scope-stats.md b/docs/dfx/scope-stats.md
@@ -0,0 +1,264 @@
+# Scope Stats — Per-scope Resource Usage Peaks
+
+## 1. Background & Motivation
+
+When a model runs out of task windows, heap, or dep/fanin pool entries,
+the failure message tells you *which* resource is exhausted but not
+*which scope* caused the peak. Without per-scope attribution, debugging
+requires binary-searching the orchestration code to find the offending
+scope — slow and error-prone.
+
+Scope stats captures the peak resource usage (heap bytes, task
+in-flight, dep/fanin pool entries, tensormap entries) for every
+`PTO2_SCOPE` region, so the output directly tells you which scope drove
+each resource to its high-water mark.
+
+## 2. Overview
+
+- **One row per scope exit.** Peaks are sampled continuously inside the
+  scope and flushed to a shared buffer on `scope_end`.
+- **Per-ring breakdown.** Each ring's task allocator heap/task-window
+  and dep/fanin pool are tracked independently.
+- **JSON output.** A `scope_stats.json` lands under the per-task output
+  prefix with capacities in the header and per-scope records.
+- **Runtime-gated.** Controlled by `--enable-scope-stats` (bit 4 of
+  `enable_profiling_flag`). When off, every probe is a single bool
+  load — no measurement overhead.
+- **T&R runtime only.** See §3.3 and §6 for why.
+
+Enable in one line:
+
+```bash
+python tests/st/<case>/test_<name>.py -p a2a3 -d 0 --enable-scope-stats
+```
+
+## 3. Architecture
+
+### 3.1 Layering
+
+Scope stats uses a clean platform-provides / runtime-calls pattern:
+
+```text
+platform/include/aicpu/scope_stats_collector_aicpu.h
+    Pure-value API declarations. No runtime types cross this boundary.
+
+platform/src/aicpu/scope_stats_collector.cpp
+    Owns all collector state (depth stack, peak arrays, shared buffer).
+    Implements scope lifecycle (on_begin/on_end), peak comparison logic,
+    capacity registration, and shared buffer record writes.
+
+runtime (pto_orchestrator.cpp, pto_scheduler.h)
+    Calls platform APIs at instrumentation points, passing extracted
+    values (ring_id, heap_bytes, tasks_in_flight, etc.) as plain
+    integers. No scope_stats source files in the runtime directory.
+```
+
+### 3.2 Platform API
+
+Header:
+[`src/a2a3/platform/include/aicpu/scope_stats_collector_aicpu.h`](../../src/a2a3/platform/include/aicpu/scope_stats_collector_aicpu.h)
+
+All entry points are `extern "C"` and take primitive types only — no
+runtime structs cross the boundary, so the same collector links into
+any runtime that wants to wire it up. Symbol resolution is unconditional
+(see §3.4), so callers do not need to guard the call sites.
+
+Single-producer contract: all `*_peaks` updates use non-atomic
+read-max-write and assume the orchestrator thread is the only writer.
+Concurrent callers may lose peaks silently — that is acceptable for
+diagnostic data and saves an atomic on the hot path.
+
+#### Setter symbols (host → AICPU init)
+
+```cpp
+void set_scope_stats_enabled(bool enable);
+void set_platform_scope_stats_base(uint64_t scope_stats_data_base);
+```
+
+`kernel.cpp` calls both at kernel entry with values from `KernelArgs`:
+`enable` mirrors the host's `--enable-scope-stats` flag, and
+`scope_stats_data_base` is the device-visible address of the
+`ScopeStatsBuffer` that the host allocated during `init_scope_stats()`.
+When `enable=false`, every probe returns after one bool load (the off-cost).
+
+#### Capacity registration (runtime → AICPU init)
+
+```cpp
+void scope_stats_set_ring_capacity(int ring_id, int32_t window_cap,
+                                   uint64_t heap_cap, int32_t dep_cap);
+void scope_stats_set_tensormap_capacity(int32_t cap);
+```
+
+Called once per ring at orchestrator init / scheduler attach. Caps are
+copied verbatim into the buffer header so the host JSON can render
+`used/cap` ratios without a second device→host query. `ring_id` outside
+`[0, PTO2_SCOPE_STATS_MAX_RING_DEPTH)` is silently dropped.
+
+#### Scope lifecycle (runtime → AICPU per-scope)
+
+```cpp
+void scope_stats_set_pending_site(const char *file, int line);
+void scope_stats_on_begin();
+void scope_stats_on_end();
+void scope_stats_on_fatal();
+```
+
+`PTO2_SCOPE()` expansion calls `set_pending_site(__FILE__, __LINE__)`
+immediately before `on_begin()` so the next `on_end()` can stamp the
+record with the originating source location — the basename copy
+(`copy_basename`) keeps the JSON readable without forcing the host to chase
+a device pointer into the orchestration `.so`'s string table. `on_fatal`
+sets `header.fatal_latched`, which surfaces as `"fatal": true` in the
+JSON; the host treats that as "the run was diagnostic-only past this
+point" but still emits whatever records made it.
+
+#### Peak updates (runtime → AICPU on resource touch)
+
+```cpp
+void scope_stats_update_allocator_peaks(int ring_id, uint64_t heap_bytes,
+                                        int32_t tasks_in_flight);
+void scope_stats_update_tensormap_peak(int32_t tensormap_used);
+void scope_stats_update_pool_peaks(int ring_id, int32_t fanin_used,
+                                   int32_t dep_used);
+```
+
+Called after each allocation or pool change. Each update walks
+`d ∈ [0, scope_stats_depth]` so peaks bubble up: an alloc spike inside
+an inner scope shows in both the inner and outer record. `ring_id` out
+of range is dropped silently (same clamp as capacity registration).
+
+#### Initial-sample callback (optional)
+
+```cpp
+typedef void (*ScopeStatsInitialSampleFn)(int32_t depth);
+void scope_stats_register_initial_sampler(ScopeStatsInitialSampleFn fn);
+```
+
+If the runtime registers a sampler, `on_begin` calls it instead of
+zero-clearing the new depth's peak arrays. Used to inherit the current
+in-flight resource snapshot at scope entry so the inner-scope peak is
+relative to "what was already in flight when we entered", not zero.
+Without it, the per-depth peak arrays are zeroed at each `on_begin`.
+
+### 3.3 Comparison with other profiling subsystems
+
+| Feature | Layer | Runtime scope | Why |
+| ------- | ----- | ------------- | --- |
+| PMU | platform only | all runtimes | reads hardware registers (platform) |
+| L2 swimlane | platform only | all runtimes | reads AICore ring buffers (platform) |
+| dep_gen | platform only | all runtimes | traces `submit_task` (runtime-agnostic) |
+| tensor dump | platform only | all runtimes | dumps tensor data (platform) |
+| **scope stats** | **platform API + runtime call sites** | **T&R only** | runtime extracts values, platform tracks peaks |
+
+### 3.4 Symbol resolution flow
+
+```text
+kernel.cpp (platform, shared by all runtimes)
+    ├── set_scope_stats_enabled(flag)
+    └── set_platform_scope_stats_base(addr)
+
+For host_build_graph AICPU .so:
+    kernel.cpp ──links──> platform collector
+    → symbols resolve, .so loads, scope_stats is enabled but
+      no runtime call sites invoke update APIs → no records
+
+For tensormap_and_ringbuffer AICPU .so:
+    kernel.cpp ──links──> platform collector
+    runtime call sites invoke update/capacity APIs
+    → full peak tracking active
+```
+
+## 4. Data Flow
+
+```text
+Host                              AICPU (T&R runtime)
+─────                             ─────────────────────
+ScopeStatsCollector                platform scope_stats_collector.cpp
+  allocate ScopeStatsBuffer          set_platform_scope_stats_base(addr)
+  set kernel_args fields             set_scope_stats_enabled(true)
+  launch kernel                      runtime: scope_stats_set_ring_capacity()
+      │                              runtime: scope_stats_set_tensormap_capacity()
+      │                                  │
+      │                              on PTO2_SCOPE begin:
+      │                                scope_stats_on_begin()
+      │                                runtime: scope_stats_update_*_peaks()
+      │                              on alloc/pool change:
+      │                                runtime: scope_stats_update_*_peaks()
+      │                              on PTO2_SCOPE end:
+      │                                scope_stats_on_end()
+      │                                  └─ write record to ScopeStatsBuffer
+      │                                  │
+  stream sync                        kernel exit
+  read ScopeStatsBuffer
+  emit scope_stats.json
+```
+
+## 5. Output: `scope_stats.json`
+
+The host emits `<output_prefix>/scope_stats.json` at finalize, after
+the device stream is synced. Schema (version 2):
+
+```json
+{
+  "version": 2,
+  "fatal": false,
+  "write_count": 2,
+  "cap": 16384,
+  "dropped": 0,
+  "records": [
+    {"site": "example_orchestration.cpp:77", "depth": 1,
+     "task_window": ["0/16384", "4/16384", "0/16384", "0/16384"],
+     "heap":        ["0/268435456", "8192/268435456", "0/268435456", "0/268435456"],
+     "dep":         ["0/16384", "5/16384", "0/16384", "0/16384"],
+     "fanin_used":  [0, 3, 0, 0],
+     "tensormap":   "5/65536"},
+    {"site": "kernel.cpp:80", "depth": 0,
+     "task_window": ["1/16384", "4/16384", "0/16384", "0/16384"],
+     "heap":        ["4096/268435456", "8192/268435456", "0/268435456", "0/268435456"],
+     "dep":         ["1/16384", "5/16384", "0/16384", "0/16384"],
+     "fanin_used":  [0, 3, 0, 0],
+     "tensormap":   "5/65536"}
+  ]
+}
+```
+
+Top-level fields:
+
+| Field | Type | Meaning |
+| ----- | ---- | ------- |
+| `version` | int | Always `2` |
+| `fatal` | bool | `true` iff `scope_stats_on_fatal()` fired during the run |
+| `write_count` | uint64 | Total `scope_end` events observed (incl. dropped) |
+| `cap` | uint32 | Capacity of the scope-record log (not a per-ring resource cap), `PTO2_SCOPE_STATS_LOG_CAP` (16384) |
+| `dropped` | uint64 | `max(write_count - cap, 0)` — overflow count |
+| `records` | array | Up to `min(cap, write_count)` records, oldest-first |
+
+Per-record fields:
+
+| Field | Type | Description |
+| ----- | ---- | ----------- |
+| `site` | `"basename:line"` | Source location of the `PTO2_SCOPE()` call |
+| `depth` | int | Nesting depth (0 = root scope inside the executor) |
+| `task_window[ring]` | `"used/cap"` | Peak task-window slots in use |
+| `heap[ring]` | `"used/cap"` | Peak per-ring heap bytes in use |
+| `dep[ring]` | `"used/cap"` | Peak dep-pool entries in use |
+| `fanin_used[ring]` | int32 | Peak fanin-pool entries in use (capacity isn't currently carried — fanin reservation is implicit in dep accounting) |
+| `tensormap` | `"used/cap"` | Peak tensormap entries in use |
+
+The `cap` denominators come from `scope_stats_set_ring_capacity` /
+`scope_stats_set_tensormap_capacity` snapshots, so they always reflect
+the values the runtime actually configured for that run.
+
+A worked example is in
+[`tests/st/a2a3/tensormap_and_ringbuffer/dfx/scope_stats/test_scope_stats.py`](../../tests/st/a2a3/tensormap_and_ringbuffer/dfx/scope_stats/test_scope_stats.py)
+— it runs the `vector_example` orchestration with `--enable-scope-stats`
+and asserts that the resulting JSON contains the depth=0 and depth=1
+records produced by the outer executor scope and the inner `PTO2_SCOPE`.
+
+## 6. Future: Cross-runtime Support
+
+If host_build_graph adds scope-like concepts in the future, extending
+scope_stats only requires adding the same platform API call sites in
+the HBG runtime — no platform changes needed. The platform collector
+is already runtime-agnostic; it accepts plain values and has no
+knowledge of T&R types.
diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp
@@ -618,6 +618,15 @@ NB_MODULE(_task_interface, m) {
                 c.enable_dep_gen = v ? 1 : 0;
             }
         )
+        .def_prop_rw(
+            "enable_scope_stats",
+            [](const CallConfig &c) {
+                return static_cast<bool>(c.enable_scope_stats);
+            },
+            [](CallConfig &c, bool v) {
+                c.enable_scope_stats = v ? 1 : 0;
+            }
+        )
         .def_prop_rw(
             "output_prefix",
             [](const CallConfig &c) -> std::string {
@@ -639,7 +648,8 @@ NB_MODULE(_task_interface, m) {
             os << "CallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num
                << ", enable_l2_swimlane=" << self.enable_l2_swimlane
                << ", enable_dump_tensor=" << (self.enable_dump_tensor ? "True" : "False")
-               << ", enable_pmu=" << self.enable_pmu << ", enable_dep_gen=" << (self.enable_dep_gen ? "True" : "False");
+               << ", enable_pmu=" << self.enable_pmu << ", enable_dep_gen=" << (self.enable_dep_gen ? "True" : "False")
+               << ", enable_scope_stats=" << (self.enable_scope_stats ? "True" : "False");
             if (self.output_prefix_set()) {
                 os << ", output_prefix='" << self.output_prefix << "'";
             }

diff --git a/python/simpler/worker.py b/python/simpler/worker.py
@@ -112,11 +112,11 @@ def my_l4_orch(orch, args, config):
 _OFF_CALLABLE = 8
 _OFF_CONFIG = 16
 # Packed CallConfig wire layout — must match call_config.h byte for byte:
-# 6 int32 (block_dim, aicpu_thread_num, enable_l2_swimlane, enable_dump_tensor,
-# enable_pmu, enable_dep_gen) + 1024-byte NUL-terminated output_prefix. Log
-# config travels separately via ChipWorker.init(log_level, log_info_v) — not
-# on per-task wire.
-_CFG_FMT = struct.Struct("=iiiiii1024s")
+# 7 int32 (block_dim, aicpu_thread_num, enable_l2_swimlane, enable_dump_tensor,
+# enable_pmu, enable_dep_gen, enable_scope_stats) + 1024-byte NUL-terminated
+# output_prefix. Log config travels separately via ChipWorker.init(log_level,
+# log_info_v) — not on per-task wire.
+_CFG_FMT = struct.Struct("=iiiiiii1024s")
 # Args region starts after CONFIG, rounded up to 8 bytes so the first
 # ContinuousTensor.data (uint64_t at OFF_ARGS+8) is 8-byte aligned, avoiding
 # SIGBUS on strict-alignment platforms (aarch64 atomics, some ARM cores).
@@ -718,14 +718,15 @@ def _chip_process_loop(
 
 def _read_config_from_mailbox(buf: memoryview) -> "CallConfig":
     """Reconstruct a CallConfig from the unified mailbox layout."""
-    block_dim, aicpu_tn, swl, dt, pmu, dep_gen, prefix_bytes = _CFG_FMT.unpack_from(buf, _OFF_CONFIG)
+    block_dim, aicpu_tn, swl, dt, pmu, dep_gen, scope_stats, prefix_bytes = _CFG_FMT.unpack_from(buf, _OFF_CONFIG)
     cfg = CallConfig()
     cfg.block_dim = block_dim
     cfg.aicpu_thread_num = aicpu_tn
     cfg.enable_l2_swimlane = swl
     cfg.enable_dump_tensor = bool(dt)
     cfg.enable_pmu = pmu
     cfg.enable_dep_gen = bool(dep_gen)
+    cfg.enable_scope_stats = bool(scope_stats)
     # NUL-terminated C string in a 1024-byte field.
     cfg.output_prefix = prefix_bytes.split(b"\x00", 1)[0].decode("utf-8")
     return cfg