diff --git a/conftest.py b/conftest.py
index bdefdc418..e3c25e285 100644
--- a/conftest.py
+++ b/conftest.py
@@ -163,6 +163,12 @@ def pytest_addoption(parser):
         help="Enable PMU collection. Bare flag = PIPE_UTILIZATION(2). "
         "Pass event type to override (e.g. --enable-pmu 4)",
     )
+    parser.addoption(
+        "--enable-scope-stats",
+        action="store_true",
+        default=False,
+        help="Enable per-scope peak collection and emit <output_prefix>/scope_stats.json (per-scope ring-fill peaks).",
+    )
     parser.addoption("--build", action="store_true", default=False, help="Compile runtime from source")
     parser.addoption(
         "--pto-isa-commit",
diff --git a/docs/dfx/scope-stats.md b/docs/dfx/scope-stats.md
new file mode 100644
index 000000000..c418db54e
--- /dev/null
+++ b/docs/dfx/scope-stats.md
@@ -0,0 +1,264 @@
+# Scope Stats — Per-scope Resource Usage Peaks
+
+## 1. Background & Motivation
+
+When a model runs out of task windows, heap, or dep/fanin pool entries,
+the failure message tells you *which* resource is exhausted but not
+*which scope* caused the peak. Without per-scope attribution, debugging
+requires binary-searching the orchestration code to find the offending
+scope — slow and error-prone.
+
+Scope stats captures the peak resource usage (heap bytes, task
+in-flight, dep/fanin pool entries, tensormap entries) for every
+`PTO2_SCOPE` region, so the output directly tells you which scope drove
+each resource to its high-water mark.
+
+## 2. Overview
+
+- **One row per scope exit.** Peaks are sampled continuously inside the
+  scope and flushed to a shared buffer on `scope_end`.
+- **Per-ring breakdown.** Each ring's task allocator heap/task-window
+  and dep/fanin pool are tracked independently.
+- **JSON output.** A `scope_stats.json` lands under the per-task output
+  prefix with capacities in the header and per-scope records.
+- **Runtime-gated.** Controlled by `--enable-scope-stats` (bit 4 of
+  `enable_profiling_flag`). When off, every probe is a single bool
+  load — no measurement overhead.
+- **T&R runtime only.** See §3.3, §3.4 and §6 for why.
+
+Enable in one line:
+
+```bash
+python tests/st/<case>/test_<name>.py -p a2a3 -d 0 --enable-scope-stats
+```
+
+## 3. Architecture
+
+### 3.1 Layering
+
+Scope stats uses a clean platform-provides / runtime-calls pattern:
+
+```text
+platform/include/aicpu/scope_stats_collector_aicpu.h
+    Pure-value API declarations. No runtime types cross this boundary.
+
+platform/src/aicpu/scope_stats_collector_aicpu.cpp
+    Owns all collector state (depth stack, peak arrays, shared buffer).
+    Implements scope lifecycle (on_begin/on_end), peak comparison logic,
+    capacity registration, and shared buffer record writes.
+
+runtime (pto_orchestrator.cpp, pto_scheduler.h)
+    Calls platform APIs at instrumentation points, passing extracted
+    values (ring_id, heap_bytes, tasks_in_flight, etc.) as plain
+    integers. No scope_stats source files in the runtime directory.
+```
+
+### 3.2 Platform API
+
+Header:
+[`src/a2a3/platform/include/aicpu/scope_stats_collector_aicpu.h`](../../src/a2a3/platform/include/aicpu/scope_stats_collector_aicpu.h)
+
+All entry points are `extern "C"` and take primitive types only — no
+runtime structs cross the boundary, so the same collector links into
+any runtime that wants to wire it up. Symbol resolution is unconditional
+(see §3.4), so callers do not need to guard the call sites.
+
+Single-producer contract: all `*_peaks` updates use non-atomic
+read-max-write and assume the orchestrator thread is the only writer.
+Concurrent callers may lose peaks silently — that is acceptable for
+diagnostic data and saves an atomic on the hot path.
+
+#### Setter symbols (host → AICPU init)
+
+```cpp
+void set_scope_stats_enabled(bool enable);
+void set_platform_scope_stats_base(uint64_t scope_stats_data_base);
+```
+
+`kernel.cpp` calls both at kernel entry from `KernelArgs`. `enable`
+mirrors the host's `--enable-scope-stats` flag; `scope_stats_data_base`
+is the device-visible address of a `ScopeStatsBuffer` host allocated
+during `init_scope_stats()`. When `enable=false` every probe early-returns
+after one bool load — that is the off-cost.
+
+#### Capacity registration (runtime → AICPU init)
+
+```cpp
+void scope_stats_set_ring_capacity(int ring_id, int32_t window_cap,
+                                   uint64_t heap_cap, int32_t dep_cap);
+void scope_stats_set_tensormap_capacity(int32_t cap);
+```
+
+Called once per ring at orchestrator init / scheduler attach. Caps are
+copied verbatim into the buffer header so the host JSON can render
+`used/cap` ratios without a second device→host query. `ring_id` outside
+`[0, PTO2_SCOPE_STATS_MAX_RING_DEPTH)` is silently dropped.
+
+#### Scope lifecycle (runtime → AICPU per-scope)
+
+```cpp
+void scope_stats_set_pending_site(const char *file, int line);
+void scope_stats_on_begin();
+void scope_stats_on_end();
+void scope_stats_on_fatal();
+```
+
+`PTO2_SCOPE()` expansion calls `set_pending_site(__FILE__, __LINE__)`
+immediately before `on_begin()` so the next `on_end()` can stamp the
+record with the originating source location — the basename copy
+(`copy_basename`) keeps the JSON readable without forcing host to chase
+a device pointer into the orchestration `.so`'s string table. `on_fatal`
+sets `header.fatal_latched`, which surfaces as `"fatal": true` in the
+JSON; the host treats that as "the run was diagnostic-only past this
+point" but still emits whatever records made it.
+
+#### Peak updates (runtime → AICPU on resource touch)
+
+```cpp
+void scope_stats_update_allocator_peaks(int ring_id, uint64_t heap_bytes,
+                                        int32_t tasks_in_flight);
+void scope_stats_update_tensormap_peak(int32_t tensormap_used);
+void scope_stats_update_pool_peaks(int ring_id, int32_t fanin_used,
+                                   int32_t dep_used);
+```
+
+Called after each allocation or pool change. Each update walks
+`d ∈ [0, scope_stats_depth]` so peaks bubble up: an alloc spike inside
+an inner scope shows in both the inner and outer record. `ring_id` out
+of range is dropped silently (same clamp as capacity registration).
+
+#### Initial-sample callback (optional)
+
+```cpp
+typedef void (*ScopeStatsInitialSampleFn)(int32_t depth);
+void scope_stats_register_initial_sampler(ScopeStatsInitialSampleFn fn);
+```
+
+If the runtime registers a sampler, `on_begin` calls it instead of
+zero-clearing the new depth's peak arrays. Use it to seed the new scope
+with the resources already in flight at entry, so its recorded peak
+starts from that snapshot rather than from zero. Without a sampler,
+the per-depth peak arrays are zeroed at each `on_begin`.
+
+### 3.3 Comparison with other profiling subsystems
+
+| Feature | Layer | Runtime scope | Why |
+| ------- | ----- | ------------- | --- |
+| PMU | platform only | all runtimes | reads hardware registers (platform) |
+| L2 swimlane | platform only | all runtimes | reads AICore ring buffers (platform) |
+| dep_gen | platform only | all runtimes | traces `submit_task` (runtime-agnostic) |
+| tensor dump | platform only | all runtimes | dumps tensor data (platform) |
+| **scope stats** | **platform API + runtime call sites** | **T&R only** | runtime extracts values, platform tracks peaks |
+
+### 3.4 Symbol resolution flow
+
+```text
+kernel.cpp (platform, shared by all runtimes)
+    ├── set_scope_stats_enabled(flag)
+    └── set_platform_scope_stats_base(addr)
+
+For host_build_graph AICPU .so:
+    kernel.cpp ──links──> platform collector
+    → symbols resolve, .so loads, scope_stats is enabled but
+      no runtime call sites invoke update APIs → no records
+
+For tensormap_and_ringbuffer AICPU .so:
+    kernel.cpp ──links──> platform collector
+    runtime call sites invoke update/capacity APIs
+    → full peak tracking active
+```
+
+## 4. Data Flow
+
+```text
+Host                              AICPU (T&R runtime)
+─────                             ─────────────────────
+ScopeStatsCollector                platform scope_stats_collector_aicpu.cpp
+  allocate ScopeStatsBuffer          set_platform_scope_stats_base(addr)
+  set kernel_args fields             set_scope_stats_enabled(true)
+  launch kernel                      runtime: scope_stats_set_ring_capacity()
+      │                              runtime: scope_stats_set_tensormap_capacity()
+      │                                  │
+      │                              on PTO2_SCOPE begin:
+      │                                scope_stats_on_begin()
+      │                                runtime: scope_stats_update_*_peaks()
+      │                              on alloc/pool change:
+      │                                runtime: scope_stats_update_*_peaks()
+      │                              on PTO2_SCOPE end:
+      │                                scope_stats_on_end()
+      │                                  └─ write record to ScopeStatsBuffer
+      │                                  │
+  stream sync                        kernel exit
+  read ScopeStatsBuffer
+  emit scope_stats.json
+```
+
+## 5. Output: `scope_stats.json`
+
+The host emits `<output_prefix>/scope_stats.json` at finalize, after
+the device stream is synced. Schema (version 2):
+
+```json
+{
+  "version": 2,
+  "fatal": false,
+  "write_count": 2,
+  "cap": 16384,
+  "dropped": 0,
+  "records": [
+    {"site": "example_orchestration.cpp:77", "depth": 1,
+     "task_window": ["0/16384", "4/16384", "0/16384", "0/16384"],
+     "heap":        ["0/268435456", "8192/268435456", "0/268435456", "0/268435456"],
+     "dep":         ["0/16384", "5/16384", "0/16384", "0/16384"],
+     "fanin_used":  [0, 3, 0, 0],
+     "tensormap":   "5/65536"},
+    {"site": "kernel.cpp:80", "depth": 0,
+     "task_window": ["1/16384", "4/16384", "0/16384", "0/16384"],
+     "heap":        ["4096/268435456", "8192/268435456", "0/268435456", "0/268435456"],
+     "dep":         ["1/16384", "5/16384", "0/16384", "0/16384"],
+     "fanin_used":  [0, 3, 0, 0],
+     "tensormap":   "5/65536"}
+  ]
+}
+```
+
+Top-level fields:
+
+| Field | Type | Meaning |
+| ----- | ---- | ------- |
+| `version` | int | Always `2` |
+| `fatal` | bool | `true` iff `scope_stats_on_fatal()` fired during the run |
+| `write_count` | uint64 | Total `scope_end` events observed (incl. dropped) |
+| `cap` | uint32 | Ring capacity, `PTO2_SCOPE_STATS_LOG_CAP` (16384) |
+| `dropped` | uint64 | `max(write_count - cap, 0)` — overflow count |
+| `records` | array | Up to `min(cap, write_count)` records, oldest-first |
+
+Per-record fields:
+
+| Field | Type | Description |
+| ----- | ---- | ----------- |
+| `site` | `"basename:line"` | Source location of the `PTO2_SCOPE()` call |
+| `depth` | int | Nesting depth (0 = root scope inside the executor) |
+| `task_window[ring]` | `"used/cap"` | Peak task-window slots in use |
+| `heap[ring]` | `"used/cap"` | Peak per-ring heap bytes in use |
+| `dep[ring]` | `"used/cap"` | Peak dep-pool entries in use |
+| `fanin_used[ring]` | int32 | Peak fanin-pool entries in use (capacity isn't currently carried — fanin reservation is implicit in dep accounting) |
+| `tensormap` | `"used/cap"` | Peak tensormap entries in use |
+
+The `cap` denominators come from `scope_stats_set_ring_capacity` /
+`scope_stats_set_tensormap_capacity` snapshots, so they always reflect
+the values the runtime actually configured for that run.
+
+A worked example is in
+[`tests/st/a2a3/tensormap_and_ringbuffer/dfx/scope_stats/test_scope_stats.py`](../../tests/st/a2a3/tensormap_and_ringbuffer/dfx/scope_stats/test_scope_stats.py)
+— it runs the `vector_example` orchestration with `--enable-scope-stats`
+and asserts on the depth=0 and depth=1 records that the outer executor
+scope and the inner `PTO2_SCOPE` produce in the resulting JSON.
+
+## 6. Future: Cross-runtime Support
+
+If host_build_graph adds scope-like concepts in the future, extending
+scope_stats only requires adding the same platform API call sites in
+the HBG runtime — no platform changes needed. The platform collector
+is already runtime-agnostic; it accepts plain values and has no
+knowledge of T&R types.
diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp
index 7aa251db2..360c51989 100644
--- a/python/bindings/task_interface.cpp
+++ b/python/bindings/task_interface.cpp
@@ -618,6 +618,15 @@ NB_MODULE(_task_interface, m) {
                 c.enable_dep_gen = v ? 1 : 0;
             }
         )
+        .def_prop_rw(
+            "enable_scope_stats",
+            [](const CallConfig &c) {
+                return static_cast<bool>(c.enable_scope_stats);
+            },
+            [](CallConfig &c, bool v) {
+                c.enable_scope_stats = v ? 1 : 0;
+            }
+        )
         .def_prop_rw(
             "output_prefix",
             [](const CallConfig &c) -> std::string {
@@ -639,7 +648,8 @@ NB_MODULE(_task_interface, m) {
             os << "CallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num
                << ", enable_l2_swimlane=" << self.enable_l2_swimlane
                << ", enable_dump_tensor=" << (self.enable_dump_tensor ? "True" : "False")
-               << ", enable_pmu=" << self.enable_pmu << ", enable_dep_gen=" << (self.enable_dep_gen ? "True" : "False");
+               << ", enable_pmu=" << self.enable_pmu << ", enable_dep_gen=" << (self.enable_dep_gen ? "True" : "False")
+               << ", enable_scope_stats=" << (self.enable_scope_stats ? "True" : "False");
             if (self.output_prefix_set()) {
                 os << ", output_prefix='" << self.output_prefix << "'";
             }
diff --git a/python/simpler/worker.py b/python/simpler/worker.py
index e4956e708..64274d35d 100644
--- a/python/simpler/worker.py
+++ b/python/simpler/worker.py
@@ -112,11 +112,11 @@ def my_l4_orch(orch, args, config):
 _OFF_CALLABLE = 8
 _OFF_CONFIG = 16
 # Packed CallConfig wire layout — must match call_config.h byte for byte:
-# 6 int32 (block_dim, aicpu_thread_num, enable_l2_swimlane, enable_dump_tensor,
-# enable_pmu, enable_dep_gen) + 1024-byte NUL-terminated output_prefix. Log
-# config travels separately via ChipWorker.init(log_level, log_info_v) — not
-# on per-task wire.
-_CFG_FMT = struct.Struct("=iiiiii1024s")
+# 7 int32 (block_dim, aicpu_thread_num, enable_l2_swimlane, enable_dump_tensor,
+# enable_pmu, enable_dep_gen, enable_scope_stats) + 1024-byte NUL-terminated
+# output_prefix. Log config travels separately via ChipWorker.init(log_level,
+# log_info_v) — not on per-task wire.
+_CFG_FMT = struct.Struct("=iiiiiii1024s")
 # Args region starts after CONFIG, rounded up to 8 bytes so the first
 # ContinuousTensor.data (uint64_t at OFF_ARGS+8) is 8-byte aligned, avoiding
 # SIGBUS on strict-alignment platforms (aarch64 atomics, some ARM cores).
@@ -718,7 +718,7 @@ def _chip_process_loop(
 
 def _read_config_from_mailbox(buf: memoryview) -> "CallConfig":
     """Reconstruct a CallConfig from the unified mailbox layout."""
-    block_dim, aicpu_tn, swl, dt, pmu, dep_gen, prefix_bytes = _CFG_FMT.unpack_from(buf, _OFF_CONFIG)
+    block_dim, aicpu_tn, swl, dt, pmu, dep_gen, scope_stats, prefix_bytes = _CFG_FMT.unpack_from(buf, _OFF_CONFIG)
     cfg = CallConfig()
     cfg.block_dim = block_dim
     cfg.aicpu_thread_num = aicpu_tn
@@ -726,6 +726,7 @@ def _read_config_from_mailbox(buf: memoryview) -> "CallConfig":
     cfg.enable_dump_tensor = bool(dt)
     cfg.enable_pmu = pmu
     cfg.enable_dep_gen = bool(dep_gen)
+    cfg.enable_scope_stats = bool(scope_stats)
     # NUL-terminated C string in a 1024-byte field.
     cfg.output_prefix = prefix_bytes.split(b"\x00", 1)[0].decode("utf-8")
     return cfg
diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py
index f179b5cba..04ec7ed1e 100644
--- a/simpler_setup/scene_test.py
+++ b/simpler_setup/scene_test.py
@@ -652,6 +652,7 @@ def run_class_cases(  # noqa: PLR0913 -- shared layer-5 entry; kwargs mirror CLI
     enable_dump_tensor,
     enable_pmu,
     enable_dep_gen,
+    enable_scope_stats,
 ):
     """Execute a pre-filtered list of cases for one class (layers 5-6).
 
@@ -661,11 +662,14 @@ def run_class_cases(  # noqa: PLR0913 -- shared layer-5 entry; kwargs mirror CLI
     """
     cls_name = type(cls_inst).__name__
     callable_spec = getattr(type(cls_inst), "CALLABLE", None)
-    diagnostics_on = enable_l2_swimlane or enable_dump_tensor or enable_pmu or enable_dep_gen
+    diagnostics_on = enable_l2_swimlane or enable_dump_tensor or enable_pmu or enable_dep_gen or enable_scope_stats
     for case in cases:
         case_label = f"{cls_name}_{case['name']}"
         # Per-case directory the runtime writes into. Required (non-empty) when
         # any diagnostic flag is on; CallConfig::validate() throws otherwise.
+        # scope_stats now writes <prefix>/scope_stats.json (sibling of
+        # l2_perf_records.json / deps.json), so it pulls output_prefix the
+        # same way the other DFX flags do.
         prefix = _build_output_prefix(case_label) if diagnostics_on else Path("")
         try:
             cls_inst._run_and_validate(
@@ -679,6 +683,7 @@ def run_class_cases(  # noqa: PLR0913 -- shared layer-5 entry; kwargs mirror CLI
                 enable_dump_tensor=enable_dump_tensor,
                 enable_pmu=enable_pmu,
                 enable_dep_gen=enable_dep_gen,
+                enable_scope_stats=enable_scope_stats,
                 output_prefix=str(prefix) if diagnostics_on else "",
             )
         finally:
@@ -851,6 +856,7 @@ def _build_config(
         enable_dump_tensor=False,
         enable_pmu=0,
         enable_dep_gen=False,
+        enable_scope_stats=False,
         *,
         output_prefix="",
     ):
@@ -867,6 +873,7 @@ def _build_config(
         config.enable_dump_tensor = enable_dump_tensor
         config.enable_pmu = enable_pmu  # 0=disabled, >0=enabled with event type
         config.enable_dep_gen = enable_dep_gen
+        config.enable_scope_stats = enable_scope_stats
         # `output_prefix` is required by CallConfig::validate() whenever any
         # diagnostic flag is enabled. Caller threads it down from the per-case
         # directory built by _build_output_prefix().
@@ -903,6 +910,7 @@ def _run_and_validate(  # noqa: PLR0913 -- threads CLI diagnostic flags + case c
         enable_dump_tensor=False,
         enable_pmu=0,
         enable_dep_gen=False,
+        enable_scope_stats=False,
         output_prefix="",
     ):
         if self._st_level == 2:
@@ -916,6 +924,7 @@ def _run_and_validate(  # noqa: PLR0913 -- threads CLI diagnostic flags + case c
                 enable_dump_tensor=enable_dump_tensor,
                 enable_pmu=enable_pmu,
                 enable_dep_gen=enable_dep_gen,
+                enable_scope_stats=enable_scope_stats,
                 output_prefix=output_prefix,
             )
         elif self._st_level == 3:
@@ -930,10 +939,11 @@ def _run_and_validate(  # noqa: PLR0913 -- threads CLI diagnostic flags + case c
                 enable_dump_tensor=enable_dump_tensor,
                 enable_pmu=enable_pmu,
                 enable_dep_gen=enable_dep_gen,
+                enable_scope_stats=enable_scope_stats,
                 output_prefix=output_prefix,
             )
 
-    def _run_and_validate_l2(
+    def _run_and_validate_l2(  # noqa: PLR0913 -- threads CLI diagnostic flags + case context
         self,
         worker,
         callable_obj,
@@ -944,6 +954,7 @@ def _run_and_validate_l2(
         enable_dump_tensor=False,
         enable_pmu=0,
         enable_dep_gen=False,
+        enable_scope_stats=False,
         output_prefix="",
     ):
         params = case.get("params", {})
@@ -993,6 +1004,7 @@ def _run_and_validate_l2(
                 enable_dump_tensor=enable_dump_tensor,
                 enable_pmu=enable_pmu,
                 enable_dep_gen=enable_dep_gen,
+                enable_scope_stats=enable_scope_stats,
                 output_prefix=output_prefix,
             )
 
@@ -1019,6 +1031,7 @@ def _run_and_validate_l3(  # noqa: PLR0913 -- threads CLI diagnostic flags + L3
         enable_dump_tensor=False,
         enable_pmu=0,
         enable_dep_gen=False,
+        enable_scope_stats=False,
         output_prefix="",
     ):
         # Defensive belt-and-braces: the pytest dispatcher and run_module both
@@ -1073,6 +1086,7 @@ def _run_and_validate_l3(  # noqa: PLR0913 -- threads CLI diagnostic flags + L3
                 enable_dump_tensor=enable_dump_tensor,
                 enable_pmu=enable_pmu,
                 enable_dep_gen=enable_dep_gen,
+                enable_scope_stats=enable_scope_stats,
                 output_prefix=output_prefix,
             )
 
@@ -1126,6 +1140,7 @@ def test_run(self, st_platform, st_worker, request):
         enable_dump_tensor = request.config.getoption("--dump-tensor", default=False)
         enable_pmu = request.config.getoption("--enable-pmu", default=0)
         enable_dep_gen = self._effective_enable_dep_gen(request, warn=True)
+        enable_scope_stats = request.config.getoption("--enable-scope-stats", default=False)
         if rounds > 1:
             if enable_l2_swimlane:
                 logger.warning("Profiling disabled: --rounds > 1")
@@ -1176,6 +1191,7 @@ def test_run(self, st_platform, st_worker, request):
             enable_dump_tensor=enable_dump_tensor,
             enable_pmu=enable_pmu,
             enable_dep_gen=enable_dep_gen,
+            enable_scope_stats=enable_scope_stats,
         )
 
     # ------------------------------------------------------------------
@@ -1243,6 +1259,13 @@ def run_module(module_name):  # noqa: PLR0912, PLR0915 -- CLI parsing + dispatch
             help="Enable PMU collection. Bare flag = PIPE_UTILIZATION(2). "
             "Pass event type to override (e.g. --enable-pmu 4)",
         )
+        parser.add_argument(
+            "--enable-scope-stats",
+            action="store_true",
+            default=False,
+            help="Enable per-scope peak collection and emit <output_prefix>/scope_stats.json "
+            "(per-scope ring-fill peaks).",
+        )
         parser.add_argument("--build", action="store_true", help="Compile runtime from source")
         parser.add_argument(
             "--runtime",
@@ -1433,6 +1456,7 @@ def run_module(module_name):  # noqa: PLR0912, PLR0915 -- CLI parsing + dispatch
                                 enable_dump_tensor=args.dump_tensor,
                                 enable_pmu=args.enable_pmu,
                                 enable_dep_gen=args.enable_dep_gen,
+                                enable_scope_stats=args.enable_scope_stats,
                             )
                             print("PASSED")
                         except Exception as e:  # noqa: BLE001
@@ -1472,6 +1496,8 @@ def _dispatch_test_phases_standalone(module_name, selected_by_cls, args):  # noq
         common.append("--dump-tensor")
     if args.enable_dep_gen:
         common.append("--enable-dep-gen")
+    if args.enable_scope_stats:
+        common.append("--enable-scope-stats")
     if args.build:
         common.append("--build")
 
diff --git a/src/a2a3/platform/include/aicpu/scope_stats_collector_aicpu.h b/src/a2a3/platform/include/aicpu/scope_stats_collector_aicpu.h
new file mode 100644
index 000000000..f3d944165
--- /dev/null
+++ b/src/a2a3/platform/include/aicpu/scope_stats_collector_aicpu.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#include "common/scope_stats_buffer.h"
+
+// Scope-stats collector — platform-owned, runtime-agnostic.
+//
+// Platform owns all collector state and peak-tracking logic. Runtime calls
+// pure-value APIs to report resource usage; no runtime types cross the
+// boundary.
+//
+// Setter symbols (set_scope_stats_enabled, set_platform_scope_stats_base)
+// are exported unconditionally so the host-side sim DeviceRunner's dlsym
+// always resolves.
+
+extern "C" {
+
+// --- Scope lifecycle probes (called by orchestrator begin_scope/end_scope) ---
+
+void scope_stats_on_begin();
+void scope_stats_on_end();
+void scope_stats_on_fatal();
+
+// --- Site tracking (PTO2_SCOPE() calls this immediately before scope_stats_on_begin) ---
+
+void scope_stats_set_pending_site(const char *file, int line);
+
+// --- Setter symbols (always exported; called at kernel entry from KernelArgs) ---
+
+void set_scope_stats_enabled(bool enable);
+void set_platform_scope_stats_base(uint64_t scope_stats_data_base);
+
+// --- Initial sampling callback (optional; on_begin calls it instead of zero-clearing peaks) ---
+
+typedef void (*ScopeStatsInitialSampleFn)(int32_t depth);
+void scope_stats_register_initial_sampler(ScopeStatsInitialSampleFn fn);
+
+// --- Pure-value peak update APIs (called by runtime at instrumentation points) ---
+// Single-producer assumption: peak updates use non-atomic read-max-write.
+// Safe when the orchestrator is single-threaded; concurrent callers may
+// lose peaks silently (acceptable for diagnostic data).
+
+void scope_stats_update_allocator_peaks(int ring_id, uint64_t heap_bytes, int32_t tasks_in_flight);
+void scope_stats_update_tensormap_peak(int32_t tensormap_used);
+void scope_stats_update_pool_peaks(int ring_id, int32_t fanin_used, int32_t dep_used);
+
+// --- Capacity registration (called by runtime at init; values land in ScopeStatsHeader) ---
+
+void scope_stats_set_ring_capacity(int ring_id, int32_t window_cap, uint64_t heap_cap, int32_t dep_cap);
+void scope_stats_set_tensormap_capacity(int32_t cap);
+
+}  // extern "C"
diff --git a/src/a2a3/platform/include/common/kernel_args.h b/src/a2a3/platform/include/common/kernel_args.h
index 99e72fdea..59c1d68be 100644
--- a/src/a2a3/platform/include/common/kernel_args.h
+++ b/src/a2a3/platform/include/common/kernel_args.h
@@ -88,12 +88,15 @@ struct KernelArgs {
     uint64_t pmu_data_base{0};          // PMU shared memory base address; use explicit flags to detect enablement
     uint64_t pmu_reg_addrs{0};          // Per-core PMU MMIO register base address array (onboard only; 0 on sim)
     uint64_t dep_gen_data_base{0};      // dep_gen shared memory base address; use explicit flags to detect enablement
+    uint64_t scope_stats_data_base{0};  // ScopeStatsBuffer shared memory base; 0 when scope_stats is off.
+                                        // Allocated by host's ScopeStatsCollector, read+written by AICPU's
+                                        // scope_stats_collector via set_platform_scope_stats_base.
     uint64_t aicore_ring_addr{0};       // Device ptr to a uint64_t[num_aicore] table holding each core's
                                         // L2PerfAicoreRing address. AICore kernel entry indexes by block_idx
                                         // and forwards into platform set/get state. 0 when L2 swimlane is off.
     uint32_t log_level{1};              // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL
     uint32_t log_info_v{5};             // INFO verbosity threshold (0..9); default V5
-    uint32_t enable_profiling_flag{0};  // Profiling umbrella bitmask; bit0=dump_tensor, bit1=l2_swimlane, bit2=pmu
+    uint32_t enable_profiling_flag{0};  // Profiling umbrella bitmask; dump_tensor|l2_swimlane|pmu|dep_gen|scope_stats
     uint32_t _pad{0};                   // Alignment padding
 
     // Device pointer to an 8-byte buffer that the platform AICPU entry writes
diff --git a/src/a2a3/platform/include/common/platform_config.h b/src/a2a3/platform/include/common/platform_config.h
index 2bb14cb9c..53858e89b 100644
--- a/src/a2a3/platform/include/common/platform_config.h
+++ b/src/a2a3/platform/include/common/platform_config.h
@@ -177,6 +177,7 @@ inline double cycles_to_us(uint64_t cycles) {
 #define PROFILING_FLAG_L2_SWIMLANE (1u << 1)
 #define PROFILING_FLAG_PMU (1u << 2)
 #define PROFILING_FLAG_DEP_GEN (1u << 3)
+#define PROFILING_FLAG_SCOPE_STATS (1u << 4)
 #define GET_PROFILING_FLAG(flags, bit) ((((uint32_t)(flags)) & ((uint32_t)(bit))) != 0u)
 #define SET_PROFILING_FLAG(flags, bit) ((flags) |= (uint32_t)(bit))
 #define CLEAR_PROFILING_FLAG(flags, bit) ((flags) &= ~((uint32_t)(bit)))
diff --git a/src/a2a3/platform/include/common/scope_stats_buffer.h b/src/a2a3/platform/include/common/scope_stats_buffer.h
new file mode 100644
index 000000000..7780fc333
--- /dev/null
+++ b/src/a2a3/platform/include/common/scope_stats_buffer.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef PLATFORM_COMMON_SCOPE_STATS_BUFFER_H_
+#define PLATFORM_COMMON_SCOPE_STATS_BUFFER_H_
+
+#include <cstdint>
+
+// Layout shared between AICPU writer (scope_stats_collector_aicpu.cpp on
+// device) and the host reader (ScopeStatsHostBuffer on host). The whole block lives in a
+// host-allocated device-visible memory region; AICPU mutates `header.write_count`
+// and `records[i]` during the run, host snapshots both after the run to write
+// `<output_prefix>/scope_stats.json`.
+//
+// Hot-path semantics: AICPU appends one record per scope_end into the ring
+// using `idx = header.write_count % header.cap`, then increments
+// `write_count`. No locking — single-producer (orchestrator thread) /
+// single-consumer (host post-run). Host never reads while AICPU writes.
+//
+// Capacity (PTO2_SCOPE_STATS_LOG_CAP) is fixed at build time so the layout is
+// stable across host/device builds. 16 384 records × 136 B ≈ 2.1 MiB; the
+// host opts in via `--enable-scope-stats` and the allocation is skipped when
+// the flag is off.
+
+#define PTO2_SCOPE_STATS_LOG_CAP 16384u
+#define PTO2_SCOPE_STATS_MAX_RING_DEPTH 4
+#define PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH 64
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// One record per scope_end. Layout MUST stay in sync with the device-side
+// writer in platform/src/aicpu/scope_stats_collector_aicpu.cpp.
+struct ScopeStatsRecord {
+    uint64_t site_file_addr;      // device-side const char *; for diagnostics the host
+                                  // only logs the raw pointer (string table lives in
+                                  // the orchestration .so, not in shared memory).
+                                  // AICPU also writes a basename copy into site_file_basename.
+    char site_file_basename[32];  // NUL-terminated basename of site_file, captured
+                                  // at append time so the host JSON contains a
+                                  // human-readable path without dereferencing a
+                                  // device pointer.
+    int32_t site_line;
+    int16_t depth;
+    int16_t _pad0;
+    uint64_t heap_bytes[PTO2_SCOPE_STATS_MAX_RING_DEPTH];     // Peak heap bytes in use, indexed by ring
+    int32_t task_in_flight[PTO2_SCOPE_STATS_MAX_RING_DEPTH];  // Peak task-window slots in use, per ring
+    int32_t dep_used[PTO2_SCOPE_STATS_MAX_RING_DEPTH];        // Peak dep-pool entries in use, per ring
+    int32_t fanin_used[PTO2_SCOPE_STATS_MAX_RING_DEPTH];      // Peak fanin-pool entries in use, per ring
+    int32_t tensormap_used;                                   // Peak tensormap entries in use
+    int32_t _pad1;                                            // Explicit tail padding to an 8-byte multiple
+};
+
+struct ScopeStatsHeader {
+    uint64_t write_count;    // Total append count. write_count > cap means the
+                             // ring wrapped; host reports `dropped = write_count - cap`
+                             // and emits `min(cap, write_count)` records starting
+                             // from `(write_count - kept) % cap`.
+    uint32_t cap;            // Fixed at PTO2_SCOPE_STATS_LOG_CAP; copied in by host
+                             // at init so device and host see the same value
+                             // without needing a separate sync.
+    uint32_t fatal_latched;  // AICPU sets to 1 on first fatal. Host uses this
+                             // to stamp the JSON `fatal` field — no separate
+                             // device→host channel needed.
+    // Per-ring capacities — written by AICPU at init through
+    // scope_stats_set_ring_capacity / scope_stats_set_tensormap_capacity
+    // (constant for the run). Host needs them to render the "used/cap"
+    // ratio in JSON without a separate device→host query.
+    int32_t task_window_cap[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    uint64_t heap_cap[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    int32_t dep_cap[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    int32_t tensormap_cap;
+    int32_t _pad;
+};
+
+struct ScopeStatsBuffer {
+    ScopeStatsHeader header;                             // Counters + capacities; see ScopeStatsHeader
+    ScopeStatsRecord records[PTO2_SCOPE_STATS_LOG_CAP];  // Ring of records; slot = write_count % cap
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // PLATFORM_COMMON_SCOPE_STATS_BUFFER_H_
diff --git a/src/a2a3/platform/include/host/scope_stats_dump.h b/src/a2a3/platform/include/host/scope_stats_dump.h
new file mode 100644
index 000000000..af586b83b
--- /dev/null
+++ b/src/a2a3/platform/include/host/scope_stats_dump.h
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A2A3_PLATFORM_INCLUDE_HOST_SCOPE_STATS_DUMP_H_
+#define SRC_A2A3_PLATFORM_INCLUDE_HOST_SCOPE_STATS_DUMP_H_
+
+#include <cinttypes>
+#include <cstdio>
+#include <cstring>
+#include <filesystem>
+#include <functional>
+#include <string>
+
+#include "common/scope_stats_buffer.h"
+#include "common/unified_log.h"
+#include "host/profiling_common/profiler_base.h"
+
+// Header-only host-side helper for scope_stats. Intentionally NOT modeled on
+// L2PerfCollector / DepGenCollector — scope_stats is a single end-of-run
+// snapshot (no streaming, no mgmt thread, no reconcile), so this whole
+// feature collapses into one class with three short methods. Keeping every
+// host-side scope_stats line in one file isolates the feature from the
+// general device_runner flow: hooking the feature on adds three call sites
+// (init / dump / finalize) and zero protocol details to device_runner.
+//
+// AICPU side is symmetric: layout in `scope_stats_buffer.h`, device probes /
+// writer in `platform/src/aicpu/scope_stats_collector_aicpu.cpp`. The shared
+// layout header is the only file both sides include.
+
+// Memory callbacks — thin aliases for the canonical profiling_common shapes
+// (same pattern as dep_gen_collector / l2_perf_collector / pmu_collector /
+// tensor_dump_collector). Three init modes:
+//   - a2a3 onboard: pass register_cb (halHostRegister maps device→host).
+//   - a5 onboard:   pass copy_from_device_cb (no halHostRegister — host
+//                   shadow is allocated separately and refreshed at dump
+//                   time via rtMemcpy DEVICE_TO_HOST).
+//   - sim:          pass neither; sim is single-address-space, host_ptr
+//                   aliases device_ptr directly.
+using ScopeStatsAllocCallback = profiling_common::ProfAllocCallback;
+using ScopeStatsRegisterCallback = profiling_common::ProfRegisterCallback;
+using ScopeStatsUnregisterCallback = profiling_common::ProfUnregisterCallback;
+using ScopeStatsFreeCallback = profiling_common::ProfFreeCallback;
+using ScopeStatsCopyFromDeviceCallback = std::function<int(void *host_dst, const void *dev_src, std::size_t bytes)>;
+
+// BufferPoolManager template stub. ScopeStatsHostBuffer drives the manager
+// purely for its single-buffer alloc_and_register / free_buffer methods (the
+// canonical dev↔host shared-memory setup path on a2a3); the streaming
+// ready/done queue side is never touched, so the layout-trait aliases below
+// are unused placeholders required only by the manager's static checks.
+struct ScopeStatsModule {
+    using DataHeader = int;                 // placeholder; the streaming queue side is never used
+    using ReadyEntry = int;                 // placeholder
+    using ReadyBufferInfo = int;            // placeholder
+    using FreeQueue = int;                  // placeholder
+    static constexpr int kBufferKinds = 1;  // NOTE(review): presumably the manager's buffer-kind count — confirm
+};
+
+class ScopeStatsHostBuffer {
+public:
+    // Allocate the device-side buffer and set up the host-side view. Returns
+    // 0 on success and -1 on failure (every failure path logs its cause); on
+    // failure the object stays uninitialized, so dump() and finalize() are
+    // no-ops and callers can chain without guarding.
+    // Exactly one of `register_cb` / `copy_from_device_cb` should be set on
+    // hardware (a2a3 vs a5 respectively); pass both null on sim.
+    int init(
+        const ScopeStatsAllocCallback &alloc_cb, ScopeStatsRegisterCallback register_cb,
+        const ScopeStatsFreeCallback &free_cb, const ScopeStatsCopyFromDeviceCallback &copy_from_device_cb,
+        int device_id
+    ) {
+        device_id_ = device_id;
+        copy_from_device_cb_ = copy_from_device_cb;
+        const std::size_t bytes = sizeof(ScopeStatsBuffer);
+
+        if (copy_from_device_cb_) {
+            // a5 onboard — no halHostRegister; allocate device + host shadow
+            // separately. ``dump()`` refreshes the shadow via rtMemcpy
+            // DEVICE_TO_HOST. The AICPU writer zeros the device header itself
+            // in set_platform_scope_stats_base.
+            dev_ptr_ = alloc_cb(bytes);
+            if (dev_ptr_ == nullptr) {
+                LOG_ERROR("scope_stats: failed to alloc %zu bytes", bytes);
+                return -1;
+            }
+            host_ptr_ = std::malloc(bytes);
+            if (host_ptr_ == nullptr) {
+                LOG_ERROR("scope_stats: failed to alloc host shadow %zu bytes", bytes);
+                if (free_cb) free_cb(dev_ptr_);
+                dev_ptr_ = nullptr;
+                return -1;
+            }
+            std::memset(host_ptr_, 0, bytes);
+            host_shadow_owned_ = true;
+        } else {
+            // a2a3 onboard (halHostRegister) and sim (identity) both go
+            // through profiling_common::BufferPoolManager so the dev↔host
+            // setup matches the pool-based collectors. Only its
+            // alloc_and_register / free_buffer side is used here.
+            profiling_common::MemoryOps ops;
+            ops.alloc = alloc_cb;
+            ops.free_ = free_cb;
+            if (register_cb != nullptr) {
+                ops.reg = register_cb;
+            } else {
+                // Sim — single address space; install an identity wrapper so
+                // alloc_and_register has a uniform code path (mirrors what
+                // ProfilerBase::start does for sim collectors).
+                ops.reg = [](void *dev, std::size_t /*size*/, int /*device_id*/, void **host_ptr_out) {
+                    *host_ptr_out = dev;
+                    return 0;
+                };
+            }
+            manager_.set_memory_context(std::move(ops), /*shared_mem_host=*/nullptr, device_id);
+            dev_ptr_ = manager_.alloc_and_register(bytes, &host_ptr_);
+            if (dev_ptr_ == nullptr) {
+                LOG_ERROR("scope_stats: failed to alloc/register %zu bytes on device %d", bytes, device_id);
+                return -1;
+            }
+            std::memset(host_ptr_, 0, bytes);
+            host_shadow_owned_ = false;
+        }
+        initialized_ = true;
+        return 0;
+    }
+
+    bool is_initialized() const { return initialized_; }  // true from a successful init() until finalize()
+
+    void *device_ptr() const { return dev_ptr_; }  // device address of the buffer; nullptr before init / after finalize
+
+    // Write the shared region to <output_dir>/scope_stats.json and return the
+    // writer's status; returns 0 without writing when nothing is initialized.
+    // Assumes the device stream was already synced, so AICPU writes are visible.
+    int dump(const std::string &output_dir) const {
+        const bool ready = initialized_ && host_ptr_ != nullptr;
+        if (!ready) return 0;
+        const bool needs_refresh = host_shadow_owned_ && copy_from_device_cb_;
+        if (needs_refresh) {
+            // a5 onboard keeps a private host shadow: pull the device bytes in first.
+            const int copy_rc = copy_from_device_cb_(host_ptr_, dev_ptr_, sizeof(ScopeStatsBuffer));
+            if (copy_rc != 0) {
+                LOG_ERROR("scope_stats: copy_from_device failed: %d", copy_rc);
+                return copy_rc;
+            }
+        }
+        return write_json(static_cast<const ScopeStatsBuffer *>(host_ptr_), make_path(output_dir));
+    }
+
+    void finalize(ScopeStatsUnregisterCallback unregister_cb, const ScopeStatsFreeCallback &free_cb) {
+        if (!initialized_) return;
+        if (!host_shadow_owned_) {
+            // a2a3 / sim: the buffer came from manager_.alloc_and_register.
+            // Drop the halHostRegister mapping first (the manager does not
+            // own it), then have the manager release the device buffer and
+            // forget its dev→host map entry.
+            const bool can_unregister = dev_ptr_ != nullptr && unregister_cb != nullptr;
+            const int unregister_rc = can_unregister ? unregister_cb(dev_ptr_, device_id_) : 0;
+            if (unregister_rc != 0) {
+                LOG_ERROR("scope_stats: halHostUnregister failed: %d", unregister_rc);
+            }
+            manager_.free_buffer(dev_ptr_);
+        } else {
+            // a5: device buffer and host shadow were allocated by hand in
+            // init() (no halHostRegister there), so release both by hand.
+            if (dev_ptr_ != nullptr && free_cb) free_cb(dev_ptr_);
+            if (host_ptr_ != nullptr) std::free(host_ptr_);
+        }
+        // Back to the default member values (device_id_ is left untouched).
+        initialized_ = false;
+        copy_from_device_cb_ = nullptr;
+        host_shadow_owned_ = false;
+        host_ptr_ = nullptr;
+        dev_ptr_ = nullptr;
+    }
+
+private:
+    static std::string make_path(const std::string &output_dir) {
+        // Best-effort directory creation: a failure is only warned about and the path is still returned.
+        const std::filesystem::path out_root(output_dir);
+        std::error_code mkdir_ec;
+        if (std::filesystem::create_directories(out_root, mkdir_ec); mkdir_ec) {
+            LOG_WARN("scope_stats: failed to create output dir %s: %s", output_dir.c_str(), mkdir_ec.message().c_str());
+        }
+        return (out_root / "scope_stats.json").string();
+    }
+
+    // Serialize `buf` to `path`. Returns 0 on success, -1 on a null buffer, an
+    // open failure, or a write/close error (stream error flag set or fclose failed).
+    // Schema (version 2) — flat, not Chrome-trace, because scope_stats is a
+    // list of per-scope_end snapshots, not a timeline. Each metric is rendered
+    // as a `"used/cap"` string so the JSON reads the same as the original
+    // `[ScopeStats]` log line:
+    //   { "version": 2, "fatal": bool, "write_count": uint, "cap": uint, "dropped": uint,
+    //     "records": [
+    //       { "site": "file:line", "depth": int,
+    //         "task_window":    ["used/cap", "used/cap", "used/cap", "used/cap"],
+    //         "heap":           ["used/cap", "used/cap", "used/cap", "used/cap"],
+    //         "dep":            ["used/cap", "used/cap", "used/cap", "used/cap"],
+    //         "fanin_used":     [int, int, int, int],
+    //         "tensormap":      "used/cap" },
+    //       ... ] }
+    static int write_json(const ScopeStatsBuffer *buf, const std::string &path) {
+        if (buf == nullptr) {
+            LOG_ERROR("scope_stats: null buffer");
+            return -1;
+        }
+        std::FILE *fp = std::fopen(path.c_str(), "w");
+        if (fp == nullptr) {
+            LOG_ERROR("scope_stats: failed to open %s", path.c_str());
+            return -1;
+        }
+        // header.cap is device-written: clamp it so a corrupt value cannot index past records[].
+        const ScopeStatsHeader &hdr = buf->header;
+        const auto max_cap = static_cast<std::uint32_t>(sizeof(buf->records) / sizeof(buf->records[0]));
+        const std::uint32_t cap = hdr.cap < max_cap ? hdr.cap : max_cap;
+        if (cap != hdr.cap) {
+            LOG_WARN("scope_stats: header cap %u exceeds buffer capacity %u, clamping", hdr.cap, max_cap);
+        }
+        const std::uint64_t write_count = hdr.write_count;
+        const std::uint64_t kept = write_count > cap ? cap : write_count;
+        const std::uint64_t dropped = write_count - kept;  // also the append index of the oldest kept record
+        std::fprintf(fp, "{\n  \"version\": 2,\n  \"fatal\": %s,\n", hdr.fatal_latched ? "true" : "false");
+        std::fprintf(
+            fp, "  \"write_count\": %" PRIu64 ",\n  \"cap\": %u,\n  \"dropped\": %" PRIu64 ",\n", write_count, cap, dropped
+        );
+        std::fprintf(fp, "  \"records\": [");
+        for (std::uint64_t i = 0; i < kept; i++) {
+            const ScopeStatsRecord &rec = buf->records[(dropped + i) % cap];
+            if (i) std::fputc(',', fp);
+            std::fprintf(fp, "\n    {");
+            // strnlen bounds the print in case a future writer change drops the NUL terminator.
+            const std::size_t site_len = strnlen(rec.site_file_basename, sizeof(rec.site_file_basename));
+            std::fprintf(
+                fp, "\"site\": \"%.*s:%d\", ", static_cast<int>(site_len), rec.site_file_basename, rec.site_line
+            );
+            std::fprintf(fp, "\"depth\": %d, \"task_window\": ", rec.depth);
+            write_i32_over_i32_array(fp, rec.task_in_flight, hdr.task_window_cap, PTO2_SCOPE_STATS_MAX_RING_DEPTH);
+            std::fprintf(fp, ", \"heap\": ");
+            write_u64_over_u64_array(fp, rec.heap_bytes, hdr.heap_cap, PTO2_SCOPE_STATS_MAX_RING_DEPTH);
+            std::fprintf(fp, ", \"dep\": ");
+            write_i32_over_i32_array(fp, rec.dep_used, hdr.dep_cap, PTO2_SCOPE_STATS_MAX_RING_DEPTH);
+            std::fprintf(fp, ", \"fanin_used\": ");
+            write_i32_array(fp, rec.fanin_used, PTO2_SCOPE_STATS_MAX_RING_DEPTH);
+            std::fprintf(fp, ", \"tensormap\": \"%d/%d\"}", rec.tensormap_used, hdr.tensormap_cap);
+        }
+        std::fprintf(fp, "\n  ]\n}\n");
+        // The stream error flag is sticky: test it once, and close the file either way.
+        const bool write_failed = std::ferror(fp) != 0;
+        if (std::fclose(fp) != 0 || write_failed) {
+            LOG_ERROR("scope_stats: I/O error while writing %s", path.c_str());
+            return -1;
+        }
+        LOG_INFO_V1("scope_stats: wrote %" PRIu64 " records (dropped=%" PRIu64 ") to %s", kept, dropped, path.c_str());
+        return 0;
+    }
+
+    static void write_i32_array(std::FILE *fp, const std::int32_t *arr, std::size_t n) {
+        // Emits "[a,b,c]": bare integers, comma-separated, no whitespace.
+        std::fputc('[', fp);
+        for (std::size_t idx = 0; idx != n; ++idx) {
+            std::fprintf(fp, "%s%d", idx == 0 ? "" : ",", arr[idx]);
+        }
+        std::fputc(']', fp);
+    }
+
+    static void
+    write_i32_over_i32_array(std::FILE *fp, const std::int32_t *used, const std::int32_t *cap, std::size_t n) {
+        // Emits a JSON array of "used/cap" strings, pairing used[k] with cap[k].
+        std::fputc('[', fp);
+        for (std::size_t idx = 0; idx != n; ++idx) {
+            std::fprintf(fp, "%s\"%d/%d\"", idx == 0 ? "" : ",", used[idx], cap[idx]);
+        }
+        std::fputc(']', fp);
+    }
+
+    static void
+    write_u64_over_u64_array(std::FILE *fp, const std::uint64_t *used, const std::uint64_t *cap, std::size_t n) {
+        // 64-bit counterpart of the "used/cap" string-array writer.
+        std::fputc('[', fp);
+        for (std::size_t idx = 0; idx != n; ++idx) {
+            std::fprintf(fp, "%s\"%" PRIu64 "/%" PRIu64 "\"", idx == 0 ? "" : ",", used[idx], cap[idx]);
+        }
+        std::fputc(']', fp);
+    }
+
+    bool initialized_ = false;
+    int device_id_ = -1;
+    void *dev_ptr_ = nullptr;
+    void *host_ptr_ = nullptr;
+    bool host_shadow_owned_ = false;
+    ScopeStatsCopyFromDeviceCallback copy_from_device_cb_;
+    // Owns the dev↔host shared-memory setup for the a2a3 / sim path. Stays
+    // unconfigured (no set_memory_context call) when init() takes the a5
+    // host-shadow path; finalize() routes accordingly via host_shadow_owned_.
+    profiling_common::BufferPoolManager<ScopeStatsModule> manager_;
+};
+
+#endif  // SRC_A2A3_PLATFORM_INCLUDE_HOST_SCOPE_STATS_DUMP_H_
diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp
index 2e28997e8..9776b423f 100644
--- a/src/a2a3/platform/onboard/aicpu/kernel.cpp
+++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp
@@ -20,6 +20,7 @@
 #include "aicpu/pmu_collector_aicpu.h"
 #include "aicpu/platform_regs.h"
 #include "aicpu/platform_aicpu_affinity.h"
+#include "aicpu/scope_stats_collector_aicpu.h"
 #include "aicpu/tensor_dump_aicpu.h"
 #include "runtime.h"
 
@@ -113,6 +114,8 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *a
     set_pmu_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_PMU));
     set_platform_dep_gen_base(k_args->dep_gen_data_base);
     set_dep_gen_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_DEP_GEN));
+    set_scope_stats_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_SCOPE_STATS));
+    set_platform_scope_stats_base(k_args->scope_stats_data_base);
 
     // Affinity gate: drop excess threads before entering runtime
     if (!platform_aicpu_affinity_gate(runtime->aicpu_thread_num, PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH)) {
diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index cf6ddea88..8172a1d78 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -24,6 +24,7 @@
 
 #include <cassert>
 #include <cstddef>
+#include <cstring>
 #include <iostream>
 #include <string>
 #include <vector>
@@ -701,6 +702,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     if (enable_dep_gen_) {
         SET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_DEP_GEN);
     }
+    if (enable_scope_stats_) {
+        SET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_SCOPE_STATS);
+    }
     kernel_args_.args.enable_profiling_flag = enable_profiling_flag;
 
     for (int i = 0; i < num_aicore; i++) {
@@ -765,6 +769,14 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         }
     }
 
+    if (enable_scope_stats_) {
+        rc = init_scope_stats(device_id_);
+        if (rc != 0) {
+            LOG_ERROR("init_scope_stats failed: %d", rc);
+            return rc;
+        }
+    }
+
     // On any exit from run() — success or early error — release the diagnostics
     // collectors' shared memory. They are only re-initialized per run(), so a
     // Worker reused across runs (e.g. a pytest session-scoped worker pool) would
@@ -931,6 +943,13 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         }
     }
 
+    if (enable_scope_stats_ && scope_stats_buf_.is_initialized()) {
+        // Device stream sync has already completed by this point (matches
+        // l2_perf / dep_gen export ordering), so AICPU writes to the shared
+        // region are fully visible.
+        scope_stats_buf_.dump(output_prefix_);
+    }
+
     // Print handshake results (reads from device memory, must be before free)
     print_handshake_results();
 
@@ -1508,6 +1527,37 @@ int DeviceRunner::init_dep_gen(int num_threads, int device_id) {
     return 0;
 }
 
+// Allocate and host-map the scope_stats buffer, then record its device
+// address in kernel_args_. Returns 0 on success or the error code from init().
+int DeviceRunner::init_scope_stats(int device_id) {
+    // Buffer memory is taken from, and later returned to, mem_alloc_.
+    auto device_alloc = [this](size_t bytes) -> void * { return mem_alloc_.alloc(bytes); };
+    auto device_free = [this](void *ptr) -> int { return mem_alloc_.free(ptr); };
+
+    // Maps the device buffer into the host address space via halHostRegister.
+    // Unary plus turns the capture-less lambda into a plain function pointer.
+    auto map_to_host = +[](void *dev_ptr, size_t size, int dev_id, void **host_ptr) -> int {
+        if (load_hal_if_needed() != 0) {
+            LOG_ERROR("Failed to load ascend_hal for scope_stats: %s", dlerror());
+            return -1;
+        }
+        HalHostRegisterFn host_register = get_halHostRegister();
+        if (host_register == nullptr) {
+            LOG_ERROR("halHostRegister symbol not found: %s", dlerror());
+            return -1;
+        }
+        return host_register(dev_ptr, size, DEV_SVM_MAP_HOST, dev_id, host_ptr);
+    };
+
+    const int init_rc =
+        scope_stats_buf_.init(device_alloc, map_to_host, device_free, /*copy_from_device_cb=*/nullptr, device_id);
+    if (init_rc == 0) {
+        // Publish the device address through kernel_args_ for the AICPU side.
+        kernel_args_.args.scope_stats_data_base = reinterpret_cast<uint64_t>(scope_stats_buf_.device_ptr());
+    }
+    return init_rc;
+}
+
 void DeviceRunner::finalize_collectors() {
     auto unregister_cb = [](void *dev_ptr, int device_id) -> int {
         HalHostUnregisterFn fn = get_halHostUnregister();
@@ -1532,4 +1582,8 @@ void DeviceRunner::finalize_collectors() {
     if (dep_gen_collector_.is_initialized()) {
         dep_gen_collector_.finalize(unregister_cb, free_cb);
     }
+    if (scope_stats_buf_.is_initialized()) {
+        scope_stats_buf_.finalize(unregister_cb, free_cb);
+        kernel_args_.args.scope_stats_data_base = 0;
+    }
 }
diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h
index 53fb6555f..d1b27e5f4 100644
--- a/src/a2a3/platform/onboard/host/device_runner.h
+++ b/src/a2a3/platform/onboard/host/device_runner.h
@@ -52,6 +52,7 @@
 #include "host/pmu_collector.h"
 #include "host/dep_gen_collector.h"
 #include "load_aicpu_op.h"
+#include "host/scope_stats_dump.h"
 #include "runtime.h"
 
 /**
@@ -313,6 +314,7 @@ class DeviceRunner {
         pmu_event_type_ = resolve_pmu_event_type(enable_pmu);
     }
     void set_dep_gen_enabled(bool enable) { enable_dep_gen_ = enable; }
+    void set_scope_stats_enabled(bool enable) { enable_scope_stats_ = enable; }
     // Directory under which all diagnostic artifacts (l2_perf_records.json /
     // tensor_dump/ / pmu.csv) land. Required (non-empty) when any diagnostic
     // is enabled; CallConfig::validate() enforces this contract upstream.
@@ -830,6 +832,7 @@ class DeviceRunner {
      * @return 0 on success, error code on failure
      */
     int init_dep_gen(int num_threads, int device_id);
+    int init_scope_stats(int device_id);
 
     /**
      * Finalize whichever diagnostics collectors are currently initialized,
@@ -850,6 +853,12 @@ class DeviceRunner {
     bool enable_dump_tensor_{false};
     bool enable_pmu_{false};
     bool enable_dep_gen_{false};
+    bool enable_scope_stats_{false};
+    // scope_stats has no dedicated collector class — its data is a single
+    // end-of-run snapshot (no streaming, no mgmt thread). All scope_stats
+    // host logic lives in ScopeStatsHostBuffer; this member is the only hook
+    // device_runner needs.
+    ScopeStatsHostBuffer scope_stats_buf_;
     L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED};             // resolved from set_l2_swimlane_enabled()
     PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION};  // resolved from set_pmu_enabled()
     std::string output_prefix_{};                                  // diagnostic artifact root directory
diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
index 744b7291c..ad44c6bfc 100644
--- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
@@ -333,7 +333,7 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen,
-    const char *output_prefix, PtoRunTiming *out_timing
+    int enable_scope_stats, const char *output_prefix, PtoRunTiming *out_timing
 ) {
     if (out_timing != NULL) {
         out_timing->host_wall_ns = 0;
@@ -400,6 +400,7 @@ int run_prepared(
         runner->set_dump_tensor_enabled(enable_dump_tensor != 0);
         runner->set_pmu_enabled(enable_pmu);
         runner->set_dep_gen_enabled(enable_dep_gen != 0);
+        runner->set_scope_stats_enabled(enable_scope_stats != 0);
         runner->set_output_prefix(output_prefix);
 
         rc = runner->run(*r, block_dim, aicpu_thread_num);
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index 1635f3a7a..6b3684cfa 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -285,6 +285,20 @@ int DeviceRunner::ensure_binaries_loaded() {
             return -1;
         }
 
+        set_scope_stats_enabled_func_ =
+            reinterpret_cast<void (*)(bool)>(dlsym(aicpu_so_handle_, "set_scope_stats_enabled"));
+        if (set_scope_stats_enabled_func_ == nullptr) {
+            LOG_ERROR("dlsym failed for set_scope_stats_enabled: %s", dlerror());
+            return -1;
+        }
+
+        set_platform_scope_stats_base_func_ =
+            reinterpret_cast<void (*)(uint64_t)>(dlsym(aicpu_so_handle_, "set_platform_scope_stats_base"));
+        if (set_platform_scope_stats_base_func_ == nullptr) {
+            LOG_ERROR("dlsym failed for set_platform_scope_stats_base: %s", dlerror());
+            return -1;
+        }
+
         // Log config travels via the RTLD_GLOBAL HostLogger singleton in
         // libsimpler_log.so — already seeded by simpler_log_init() before the
         // AICPU sim SO was dlopen'd, so no per-SO setter forwarding is needed.
@@ -436,6 +450,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     if (enable_dep_gen_) {
         SET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_DEP_GEN);
     }
+    if (enable_scope_stats_) {
+        SET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_SCOPE_STATS);
+    }
     kernel_args_.enable_profiling_flag = enable_profiling_flag;
 
     for (int i = 0; i < num_aicore; i++) {
@@ -503,6 +520,14 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         }
     }
 
+    if (enable_scope_stats_) {
+        rc = init_scope_stats();
+        if (rc != 0) {
+            LOG_ERROR("init_scope_stats failed: %d", rc);
+            return rc;
+        }
+    }
+
     // On any exit from run() — success or early error — release the diagnostics
     // collectors' shared memory. They are only re-initialized per run(), so a
     // Worker reused across runs (e.g. a pytest session-scoped worker pool) would
@@ -589,7 +614,8 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         set_platform_dump_base_func_ == nullptr || set_dump_tensor_enabled_func_ == nullptr ||
         set_platform_pmu_base_func_ == nullptr || set_platform_pmu_reg_addrs_func_ == nullptr ||
         set_pmu_enabled_func_ == nullptr || set_platform_dep_gen_base_func_ == nullptr ||
-        set_dep_gen_enabled_func_ == nullptr) {
+        set_dep_gen_enabled_func_ == nullptr || set_scope_stats_enabled_func_ == nullptr ||
+        set_platform_scope_stats_base_func_ == nullptr) {
         LOG_ERROR("Executor functions not loaded. Call ensure_binaries_loaded first.");
         return -1;
     }
@@ -604,6 +630,8 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     set_pmu_enabled_func_(enable_pmu_);
     set_platform_dep_gen_base_func_(kernel_args_.dep_gen_data_base);
     set_dep_gen_enabled_func_(enable_dep_gen_);
+    set_scope_stats_enabled_func_(enable_scope_stats_);
+    set_platform_scope_stats_base_func_(kernel_args_.scope_stats_data_base);
 
     // No per-SO log-config push: HostLogger lives in libsimpler_log.so
     // (RTLD_GLOBAL singleton) and the AICPU sim SO reads it directly via the
@@ -736,6 +764,10 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         }
     }
 
+    if (enable_scope_stats_ && scope_stats_buf_.is_initialized()) {
+        scope_stats_buf_.dump(output_prefix_);
+    }
+
     // Print handshake results at end of run
     print_handshake_results();
 
@@ -785,6 +817,8 @@ void DeviceRunner::unload_executor_binaries() {
         set_pmu_enabled_func_ = nullptr;
         set_platform_dep_gen_base_func_ = nullptr;
         set_dep_gen_enabled_func_ = nullptr;
+        set_scope_stats_enabled_func_ = nullptr;
+        set_platform_scope_stats_base_func_ = nullptr;
         aicpu_so_loaded_ = false;
     }
     if (!aicpu_so_path_.empty()) {
@@ -1230,6 +1264,28 @@ int DeviceRunner::init_dep_gen(int num_threads, int /*device_id*/) {
     return 0;
 }
 
+// Sim variant: allocate the scope_stats buffer and record its address in
+// kernel_args_. Returns 0 on success or the error code from init().
+int DeviceRunner::init_scope_stats() {
+    // Buffer memory is taken from, and later returned to, mem_alloc_.
+    auto sim_alloc = [this](size_t bytes) -> void * { return mem_alloc_.alloc(bytes); };
+    auto sim_free = [this](void *ptr) -> int { return mem_alloc_.free(ptr); };
+
+    // Neither register_cb nor copy_from_device_cb is supplied: sim shares
+    // an address space with the AICPU thread, so the buffer needs no host
+    // mapping (mirrors PMU / dep_gen's nullptr in sim).
+    const int init_rc = scope_stats_buf_.init(
+        sim_alloc, /*register_cb=*/nullptr, sim_free, /*copy_from_device_cb=*/nullptr, /*device_id=*/-1
+    );
+
+    if (init_rc == 0) {
+        // Success: store the buffer address in kernel_args_ so run() can
+        // forward it through set_platform_scope_stats_base_func_.
+        kernel_args_.scope_stats_data_base = reinterpret_cast<uint64_t>(scope_stats_buf_.device_ptr());
+    }
+    return init_rc;
+}
+
 void DeviceRunner::finalize_collectors() {
     // Free through MemoryAllocator so finalize() can audit. Sim shares an
     // address space with the AICPU thread, so no host unregister is needed.
@@ -1249,4 +1305,8 @@ void DeviceRunner::finalize_collectors() {
     if (dep_gen_collector_.is_initialized()) {
         dep_gen_collector_.finalize(nullptr, free_cb);
     }
+    if (scope_stats_buf_.is_initialized()) {
+        scope_stats_buf_.finalize(nullptr, free_cb);
+        kernel_args_.scope_stats_data_base = 0;
+    }
 }
diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h
index 73b3dfea2..1b98c22d9 100644
--- a/src/a2a3/platform/sim/host/device_runner.h
+++ b/src/a2a3/platform/sim/host/device_runner.h
@@ -58,6 +58,7 @@
 #include "host/tensor_dump_collector.h"
 #include "host/pmu_collector.h"
 #include "host/dep_gen_collector.h"
+#include "host/scope_stats_dump.h"
 #include "runtime.h"
 
 /**
@@ -186,6 +187,7 @@ class DeviceRunner {
         pmu_event_type_ = resolve_pmu_event_type(enable_pmu);
     }
     void set_dep_gen_enabled(bool enable) { enable_dep_gen_ = enable; }
+    void set_scope_stats_enabled(bool enable) { enable_scope_stats_ = enable; }
     // Directory under which all diagnostic artifacts (l2_perf_records.json /
     // tensor_dump/ / pmu.csv) land. Required (non-empty) when any diagnostic
     // is enabled; CallConfig::validate() enforces this contract upstream.
@@ -370,6 +372,8 @@ class DeviceRunner {
     void (*set_pmu_enabled_func_)(bool){nullptr};
     void (*set_platform_dep_gen_base_func_)(uint64_t){nullptr};
     void (*set_dep_gen_enabled_func_)(bool){nullptr};
+    void (*set_scope_stats_enabled_func_)(bool){nullptr};
+    void (*set_platform_scope_stats_base_func_)(uint64_t){nullptr};
     std::string aicpu_so_path_;
     std::string aicore_so_path_;
 
@@ -382,6 +386,7 @@ class DeviceRunner {
     PmuCollector pmu_collector_;
     // dep_gen collector — captures orchestrator submit_task inputs for offline replay
     DepGenCollector dep_gen_collector_;
+    ScopeStatsHostBuffer scope_stats_buf_;
 
     // Private helper methods — read aicpu_so_binary_ / aicore_kernel_binary_
     // off the runner (populated by set_executors during simpler_init).
@@ -414,6 +419,7 @@ class DeviceRunner {
     int init_pmu(int num_cores, int num_threads, const std::string &csv_path, PmuEventType event_type, int device_id);
 
     int init_dep_gen(int num_threads, int device_id);
+    int init_scope_stats();
 
     /**
      * Finalize whichever diagnostics collectors are currently initialized,
@@ -432,6 +438,7 @@ class DeviceRunner {
     bool enable_dump_tensor_{false};
     bool enable_pmu_{false};
     bool enable_dep_gen_{false};
+    bool enable_scope_stats_{false};
     L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED};             // resolved from set_l2_swimlane_enabled()
     PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION};  // resolved from set_pmu_enabled()
     std::string output_prefix_{};                                  // diagnostic artifact root directory
diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
index 4ad438a9c..8ef59d95a 100644
--- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
@@ -305,7 +305,7 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen,
-    const char *output_prefix, PtoRunTiming *out_timing
+    int enable_scope_stats, const char *output_prefix, PtoRunTiming *out_timing
 ) {
     if (out_timing != NULL) {
         out_timing->host_wall_ns = 0;
@@ -359,6 +359,7 @@ int run_prepared(
         runner->set_dump_tensor_enabled(enable_dump_tensor != 0);
         runner->set_pmu_enabled(enable_pmu);
         runner->set_dep_gen_enabled(enable_dep_gen != 0);
+        runner->set_scope_stats_enabled(enable_scope_stats != 0);
         runner->set_output_prefix(output_prefix);
 
         rc = runner->run(*r, block_dim, aicpu_thread_num);
diff --git a/src/a2a3/platform/src/aicpu/scope_stats_collector_aicpu.cpp b/src/a2a3/platform/src/aicpu/scope_stats_collector_aicpu.cpp
new file mode 100644
index 000000000..a00ac3988
--- /dev/null
+++ b/src/a2a3/platform/src/aicpu/scope_stats_collector_aicpu.cpp
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// Platform-layer scope_stats collector.
+//
+// Owns all collector state (depth, peak arrays, shared buffer) and exposes
+// pure-value APIs for runtime to report resource usage. No runtime-specific
+// types cross the boundary.
+
+#include "aicpu/scope_stats_collector_aicpu.h"
+
+#include <cstring>
+
+#include "common/scope_stats_buffer.h"
+
+// ---------------------------------------------------------------------------
+// Collector state
+// ---------------------------------------------------------------------------
+
+int32_t scope_stats_depth = -1;
+bool scope_stats_enabled = false;
+
+static uint64_t scope_stats_peak_heap_bytes[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH][PTO2_SCOPE_STATS_MAX_RING_DEPTH] = {};
+static int32_t scope_stats_peak_task_in_flight[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH][PTO2_SCOPE_STATS_MAX_RING_DEPTH] = {};
+static int32_t scope_stats_peak_fanin_used[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH][PTO2_SCOPE_STATS_MAX_RING_DEPTH] = {};
+static int32_t scope_stats_peak_dep_used[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH][PTO2_SCOPE_STATS_MAX_RING_DEPTH] = {};
+static int32_t scope_stats_peak_tensormap_used[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH] = {};
+static ScopeStatsBuffer *scope_stats_shared_buf = nullptr;
+
+namespace {
+
+ScopeStatsInitialSampleFn g_initial_sample_fn = nullptr;
+
+const char *s_pending_site_file = nullptr;
+int32_t s_pending_site_line = 0;
+
+const char *s_scope_site_file[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH] = {};
+int32_t s_scope_site_line[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH] = {};
+
+// Return the component after the last '/' or '\\' in `path`; "(unknown)" when `path` is null.
+inline const char *basename_of(const char *path) {
+    if (path == nullptr) return "(unknown)";
+    const char *tail = path;
+    for (const char *cur = path; *cur != '\0'; ++cur)
+        if (*cur == '/' || *cur == '\\') tail = cur + 1;
+    return tail;
+}
+
+// Copy basename_of(src) into dst, truncating to sizeof(dst) - 1 chars; dst is always NUL-terminated.
+inline void copy_basename(char (&dst)[32], const char *src) {
+    const char *name = basename_of(src);
+    size_t len = 0;
+    for (; len + 1 < sizeof(dst) && name[len] != '\0'; ++len) dst[len] = name[len];
+    dst[len] = '\0';
+}
+
+}  // namespace
+
+// ---------------------------------------------------------------------------
+// Setter symbols — always exported, unconditionally compiled
+// ---------------------------------------------------------------------------
+
+extern "C" void set_scope_stats_enabled(bool enable) { scope_stats_enabled = enable; }
+
+extern "C" void set_platform_scope_stats_base(uint64_t scope_stats_data_base) {
+    scope_stats_shared_buf = reinterpret_cast<ScopeStatsBuffer *>(scope_stats_data_base);
+    // Reset collector-local statics so a prior run that crashed mid-scope (or
+    // reused the same AICPU .so process) can't leak stale depth/peak data into
+    // the new run's records.
+    scope_stats_depth = -1;  // -1 == no scope open
+    s_pending_site_file = nullptr;
+    s_pending_site_line = 0;
+    memset(scope_stats_peak_heap_bytes, 0, sizeof(scope_stats_peak_heap_bytes));
+    memset(scope_stats_peak_task_in_flight, 0, sizeof(scope_stats_peak_task_in_flight));
+    memset(scope_stats_peak_fanin_used, 0, sizeof(scope_stats_peak_fanin_used));
+    memset(scope_stats_peak_dep_used, 0, sizeof(scope_stats_peak_dep_used));
+    memset(scope_stats_peak_tensormap_used, 0, sizeof(scope_stats_peak_tensormap_used));
+    memset(s_scope_site_file, 0, sizeof(s_scope_site_file));
+    memset(s_scope_site_line, 0, sizeof(s_scope_site_line));
+    if (scope_stats_shared_buf) {  // wipe the whole header (counters, fatal latch, capacities), then restore cap
+        memset(&scope_stats_shared_buf->header, 0, sizeof(scope_stats_shared_buf->header));
+        scope_stats_shared_buf->header.cap = PTO2_SCOPE_STATS_LOG_CAP;
+    }
+}
+
+extern "C" void scope_stats_register_initial_sampler(ScopeStatsInitialSampleFn fn) { g_initial_sample_fn = fn; }
+
+// ---------------------------------------------------------------------------
+// Scope lifecycle probes
+// ---------------------------------------------------------------------------
+
+extern "C" void scope_stats_set_pending_site(const char *file, int line) {
+    s_pending_site_file = file;  // pointer is stored, not copied; consumed by the next scope_stats_on_begin
+    s_pending_site_line = line;
+}
+
+static int32_t scope_stats_overflow_depth = 0;  // begins skipped at max depth whose ends are still pending
+static void scope_stats_clear_level(int32_t d) {  // zero every peak slot of level d
+    memset(scope_stats_peak_heap_bytes[d], 0, sizeof(scope_stats_peak_heap_bytes[d]));
+    memset(scope_stats_peak_task_in_flight[d], 0, sizeof(scope_stats_peak_task_in_flight[d]));
+    memset(scope_stats_peak_fanin_used[d], 0, sizeof(scope_stats_peak_fanin_used[d]));
+    memset(scope_stats_peak_dep_used[d], 0, sizeof(scope_stats_peak_dep_used[d]));
+    scope_stats_peak_tensormap_used[d] = 0;
+}
+
+// Push one scope level: bind the pending call-site, then seed the level via the initial sampler or zeros.
+extern "C" void scope_stats_on_begin() {
+    if (!scope_stats_enabled) return;
+    if (scope_stats_depth < 0) scope_stats_overflow_depth = 0;  // no scope open: any leftover count is stale
+    if (scope_stats_depth + 1 >= PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH) {
+        ++scope_stats_overflow_depth;  // untracked begin; its matching end must not pop a tracked level
+        return;
+    }
+    ++scope_stats_depth;
+    s_scope_site_file[scope_stats_depth] = s_pending_site_file;
+    s_scope_site_line[scope_stats_depth] = s_pending_site_line;
+    scope_stats_set_pending_site(nullptr, 0);  // consume the site so it cannot leak into a later scope
+    if (g_initial_sample_fn) g_initial_sample_fn(scope_stats_depth);
+    else scope_stats_clear_level(scope_stats_depth);
+}
+
+// Pop one scope level: append its peaks to the shared ring (when bound), then clear the level.
+extern "C" void scope_stats_on_end() {
+    if (!scope_stats_enabled || scope_stats_depth < 0) return;
+    if (scope_stats_overflow_depth > 0) {
+        --scope_stats_overflow_depth;  // closes a begin that was skipped at max depth; nothing to emit
+        return;
+    }
+    if (scope_stats_shared_buf) {
+        uint64_t idx = scope_stats_shared_buf->header.write_count % PTO2_SCOPE_STATS_LOG_CAP;
+        ScopeStatsRecord &rec = scope_stats_shared_buf->records[idx];
+        rec.site_file_addr = reinterpret_cast<uint64_t>(s_scope_site_file[scope_stats_depth]);
+        copy_basename(rec.site_file_basename, s_scope_site_file[scope_stats_depth]);
+        rec.site_line = s_scope_site_line[scope_stats_depth];
+        rec.depth = static_cast<int16_t>(scope_stats_depth);
+        memcpy(rec.heap_bytes, scope_stats_peak_heap_bytes[scope_stats_depth], sizeof(rec.heap_bytes));
+        memcpy(rec.task_in_flight, scope_stats_peak_task_in_flight[scope_stats_depth], sizeof(rec.task_in_flight));
+        memcpy(rec.dep_used, scope_stats_peak_dep_used[scope_stats_depth], sizeof(rec.dep_used));
+        memcpy(rec.fanin_used, scope_stats_peak_fanin_used[scope_stats_depth], sizeof(rec.fanin_used));
+        rec.tensormap_used = scope_stats_peak_tensormap_used[scope_stats_depth];
+        ++scope_stats_shared_buf->header.write_count;
+    }
+    scope_stats_clear_level(scope_stats_depth);
+    s_scope_site_file[scope_stats_depth] = nullptr;
+    s_scope_site_line[scope_stats_depth] = 0;
+    --scope_stats_depth;
+}
+
+// Latch the fatal flag in the shared header; no-op when disabled or when no buffer is bound.
+extern "C" void scope_stats_on_fatal() {
+    if (!scope_stats_enabled || scope_stats_shared_buf == nullptr) return;
+    scope_stats_shared_buf->header.fatal_latched = 1;
+}
+
+// ---------------------------------------------------------------------------
+// Pure-value peak update APIs — called by runtime at instrumentation points
+// ---------------------------------------------------------------------------
+
+// Raise ring `ring_id`'s heap-bytes / tasks-in-flight peaks on every open scope level (0..depth).
+extern "C" void scope_stats_update_allocator_peaks(int ring_id, uint64_t heap_bytes, int32_t tasks_in_flight) {
+    if (!scope_stats_enabled || ring_id < 0 || ring_id >= PTO2_SCOPE_STATS_MAX_RING_DEPTH) return;
+    for (int d = scope_stats_depth; d >= 0; --d) {  // runs zero times when no scope is open (depth < 0)
+        if (scope_stats_peak_heap_bytes[d][ring_id] < heap_bytes) scope_stats_peak_heap_bytes[d][ring_id] = heap_bytes;
+        int32_t &task_peak = scope_stats_peak_task_in_flight[d][ring_id];
+        if (task_peak < tasks_in_flight) task_peak = tasks_in_flight;
+    }
+}
+
+// Raise the tensormap-entries peak on every open scope level (0..depth).
+extern "C" void scope_stats_update_tensormap_peak(int32_t tensormap_used) {
+    if (!scope_stats_enabled) return;
+    for (int d = scope_stats_depth; d >= 0; --d)  // no iterations when no scope is open
+        if (scope_stats_peak_tensormap_used[d] < tensormap_used) scope_stats_peak_tensormap_used[d] = tensormap_used;
+}
+
+// Raise ring `ring_id`'s fanin-pool / dep-pool peaks on every open scope level (0..depth).
+extern "C" void scope_stats_update_pool_peaks(int ring_id, int32_t fanin_used, int32_t dep_used) {
+    if (!scope_stats_enabled || ring_id < 0 || ring_id >= PTO2_SCOPE_STATS_MAX_RING_DEPTH) return;
+    for (int d = scope_stats_depth; d >= 0; --d) {  // runs zero times when no scope is open (depth < 0)
+        if (scope_stats_peak_fanin_used[d][ring_id] < fanin_used) scope_stats_peak_fanin_used[d][ring_id] = fanin_used;
+        if (scope_stats_peak_dep_used[d][ring_id] < dep_used) scope_stats_peak_dep_used[d][ring_id] = dep_used;
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Capacity registration — called by runtime at init
+// ---------------------------------------------------------------------------
+
+// Publish ring `ring_id`'s window / heap / dep capacities in the shared header; ignored if unbound or out of range.
+extern "C" void scope_stats_set_ring_capacity(int ring_id, int32_t window_cap, uint64_t heap_cap, int32_t dep_cap) {
+    if (scope_stats_shared_buf == nullptr || ring_id < 0 || ring_id >= PTO2_SCOPE_STATS_MAX_RING_DEPTH) return;
+    scope_stats_shared_buf->header.dep_cap[ring_id] = dep_cap;
+    scope_stats_shared_buf->header.heap_cap[ring_id] = heap_cap;
+    scope_stats_shared_buf->header.task_window_cap[ring_id] = window_cap;
+}
+
+// Publish the tensormap pool capacity in the shared header; dropped silently when no buffer is bound.
+extern "C" void scope_stats_set_tensormap_capacity(int32_t cap) {
+    if (scope_stats_shared_buf != nullptr) scope_stats_shared_buf->header.tensormap_cap = cap;
+}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
index eabe3ec3f..0bd3431e6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
@@ -123,6 +123,11 @@ typedef struct PTO2RuntimeOps {
     );
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
     TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args);
+
+    // Stash the call-site of the next PTO2ScopeGuard so the [ScopeStats]
+    // collector can log it. Always present to keep ops-table layout stable
+    // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0.
+    void (*scope_set_site)(const char *file, int line);
 } PTO2RuntimeOps;
 
 /**
@@ -361,10 +366,13 @@ static inline void set_tensor_data(const Tensor &tensor, uint32_t ndims, const u
  */
 class PTO2ScopeGuard {
 public:
-    explicit PTO2ScopeGuard(PTO2ScopeMode mode = PTO2ScopeMode::AUTO) :
+    explicit PTO2ScopeGuard(
+        PTO2ScopeMode mode = PTO2ScopeMode::AUTO, const char *file = __builtin_FILE(), int line = __builtin_LINE()
+    ) :
         rt_(current_runtime()) {
         if (!rt_->ops->is_fatal(rt_)) {
             rt_->pending_scope_mode = mode;
+            if (rt_->ops->scope_set_site) rt_->ops->scope_set_site(file, line);
             rt_->ops->scope_begin(rt_);
         }
     }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index d1c039785..d04c7a9cb 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -39,6 +39,10 @@
 extern "C" void set_dump_tensor_selective_mode(bool enable);
 extern "C" void set_dump_tensor_task_mask(uint64_t task_id, uint64_t mask);
 
+#if PTO2_PROFILING
+#include "aicpu/scope_stats_collector_aicpu.h"
+#endif
+
 // Verify the captured Tensor blob size in DepGenRecord matches the runtime
 // Tensor layout. The platform header defines DEP_GEN_TENSOR_SIZE without
 // including runtime/tensor.h, so this check lives at the orch callsite.
@@ -168,6 +172,14 @@ static void
 orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *func, const char *fmt, va_list args) {
     int32_t latched_code = orch_mark_fatal(orch, error_code);
 
+#if PTO2_PROFILING
+    // Latch the fatal flag in the shared scope_stats header so the host can
+    // stamp `fatal` into scope_stats.json. scope_stats_on_fatal() only sets
+    // header.fatal_latched — it does not flush or print per-scope peaks — so
+    // duplicate fatals from different layers are harmless.
+    scope_stats_on_fatal();
+#endif
+
     if (fmt == nullptr || fmt[0] == '\0') {
         if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) {
             unified_log_error(func, "FATAL(code=%d, latched=%d)", error_code, latched_code);
@@ -253,6 +265,11 @@ static bool append_fanin_or_fail(
     }
     entry->slot_state = prod_state;
     fanin_builder->count++;
+#if PTO2_PROFILING
+    scope_stats_update_pool_peaks(
+        ring_id, fanin_pool.used(), orch->scheduler ? orch->scheduler->ring_sched_states[ring_id].dep_pool.used() : 0
+    );
+#endif
     return true;
 }
 
@@ -342,6 +359,10 @@ static bool prepare_task(
         orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK);
         return false;
     }
+#if PTO2_PROFILING
+    scope_stats_update_allocator_peaks(ring_id, allocator.heap_used_bytes(), allocator.active_count());
+    scope_stats_update_tensormap_peak(orch->tensor_map.current_used());
+#endif
 
     out->task_id = PTO2TaskId::make(ring_id, static_cast<uint32_t>(out->alloc_result.task_id));
     out->slot_state = &orch->sm_header->rings[ring_id].get_slot_state_by_slot(out->alloc_result.slot);
@@ -437,6 +458,14 @@ bool PTO2OrchestratorState::init_from_layout(
     orch->scope_stack_capacity = layout.scope_stack_capacity;
     orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
 
+#if PTO2_PROFILING
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto &alloc = orch->rings[r].task_allocator;
+        scope_stats_set_ring_capacity(r, alloc.window_size(), alloc.heap_capacity(), 0);
+    }
+    scope_stats_set_tensormap_capacity(orch->tensor_map.pool_capacity());
+#endif
+
     return true;
 }
 
@@ -450,7 +479,17 @@ void PTO2OrchestratorState::destroy() {
     orch->scope_begins = nullptr;
 }
 
-void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }
+void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) {
+    this->scheduler = scheduler;
+#if PTO2_PROFILING
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        int32_t dep_cap = scheduler ? scheduler->ring_sched_states[r].dep_pool.capacity : 0;
+        scope_stats_set_ring_capacity(
+            r, rings[r].task_allocator.window_size(), rings[r].task_allocator.heap_capacity(), dep_cap
+        );
+    }
+#endif
+}
 
 // =============================================================================
 // Scope Management
@@ -489,6 +528,18 @@ void PTO2OrchestratorState::begin_scope(PTO2ScopeMode mode) {
     if (mode == PTO2ScopeMode::MANUAL && !already_in_manual_scope) {
         orch->manual_begin_depth = orch->scope_stack_top;
     }
+#if PTO2_PROFILING
+    scope_stats_on_begin();
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto &alloc = orch->rings[r].task_allocator;
+        scope_stats_update_allocator_peaks(r, alloc.heap_used_bytes(), alloc.active_count());
+        scope_stats_update_pool_peaks(
+            r, orch->rings[r].fanin_pool.used(),
+            orch->scheduler ? orch->scheduler->ring_sched_states[r].dep_pool.used() : 0
+        );
+    }
+    scope_stats_update_tensormap_peak(orch->tensor_map.current_used());
+#endif
 }
 
 void PTO2OrchestratorState::end_scope() {
@@ -498,6 +549,14 @@ void PTO2OrchestratorState::end_scope() {
     }
     assert(orch->scope_stack_top >= 0 && "Scope stack underflow");
 
+    // Snapshot peak intra-scope queue fill BEFORE the orchestrator drains
+    // pending tasks via scheduler->on_scope_end. The user is measuring how
+    // much ring/heap the work submitted inside this scope holds at its peak,
+    // not the residual after teardown.
+#if PTO2_PROFILING
+    scope_stats_on_end();
+#endif
+
 #if PTO2_ORCH_PROFILING
     uint64_t _se0 = get_sys_cnt_aicpu();
 #endif
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
index 5a3e3d3d3..e98b8aafa 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
@@ -180,6 +180,10 @@ class PTO2TaskAllocator {
 
     uint64_t heap_top() const { return heap_top_; }
     uint64_t heap_capacity() const { return heap_size_; }
+    uint64_t heap_used_bytes() const {  // NOTE(review): returns 0 when top == tail — confirm ring is never 100% full
+        if (heap_size_ == 0) return 0;  // avoid modulo by zero on an unsized heap
+        return (heap_top_ + heap_size_ - heap_tail_) % heap_size_;
+    }
 
 private:
     // --- Task Ring ---
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
index c801d5c15..122611e3f 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
@@ -28,6 +28,9 @@
 
 #include "aicpu/device_time.h"
 #include "common/unified_log.h"
+#if PTO2_PROFILING
+#include "aicpu/scope_stats_collector_aicpu.h"
+#endif
 
 // Weak fallback for HOST .so builds (never called, but satisfies linker).
 // The AICPU build links the strong symbol from platform/.../device_time.cpp.
@@ -231,6 +234,14 @@ void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, cons
     memcpy(ptr, &value, elem_size);
 }
 
+// Ops-table entry that hands the call-site captured by PTO2ScopeGuard to the
+// [ScopeStats] collector. The slot is always present in the struct to keep
+// the layout stable; at PTO2_PROFILING=0 we fill nullptr so the orchestration
+// .so's null-check skips it.
+#if PTO2_PROFILING
+static void scope_set_site_impl(const char *file, int line) { scope_stats_set_pending_site(file, line); }
+#endif
+
 static const PTO2RuntimeOps s_runtime_ops = {
     .submit_task = submit_task_impl,
     .scope_begin = rt_scope_begin,
@@ -246,6 +257,11 @@ static const PTO2RuntimeOps s_runtime_ops = {
     .set_tensor_data = set_tensor_data,
     .alloc_tensors = alloc_tensors_impl,
     .submit_dummy_task = submit_dummy_task_impl,
+#if PTO2_PROFILING
+    .scope_set_site = scope_set_site_impl,
+#else
+    .scope_set_site = nullptr,
+#endif
 };
 
 // =============================================================================
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index 5709a85b7..b0848a9f6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -89,6 +89,10 @@ struct PTO2RuntimeOps {
     );
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
     TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args);
+    // Stash the call-site captured by PTO2ScopeGuard into the [ScopeStats]
+    // collector. Always present in the struct to keep ops-table layout stable
+    // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0.
+    void (*scope_set_site)(const char *file, int line);
 };
 
 /**
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index cf1f2d28d..cce6725b9 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -375,6 +375,13 @@ struct PTO2TensorMap {
         return task_local_id & (task_window_sizes[ring_id] - 1);
     }
 
+    // Accessors read by scope_stats_collector. Declared unconditionally so the
+    // collector .cpp compiles at PTO2_PROFILING=0 (collector is unconditional —
+    // setter symbols must export for host dlsym; the probe call sites that use
+    // these accessors stay gated by PTO2_PROFILING).
+    int32_t current_used() const { return next_entry_idx - free_num; }
+    int32_t pool_capacity() const { return pool_size; }
+
     // new_entry only allocates memory, does not assign attributes
     PTO2TensorMapEntry *new_entry() {
         if (free_num > 0) {
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
index 8d50681ba..2db12e9e6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
@@ -49,6 +49,10 @@
     } while (0)
 #endif
 
+#if PTO2_PROFILING
+#include "aicpu/scope_stats_collector_aicpu.h"
+#endif
+
 // =============================================================================
 // Ready Queue (Lock-free bounded MPMC — Vyukov design)
 // =============================================================================
@@ -722,6 +726,9 @@ struct PTO2SchedulerState {
                     early_finished++;
                 } else {
                     producer->fanout_head = rss.dep_pool.prepend(producer->fanout_head, ws);
+#if PTO2_PROFILING
+                    scope_stats_update_pool_peaks(ws->ring_id, 0, rss.dep_pool.used());
+#endif
                 }
                 producer->unlock_fanout();
             });
diff --git a/src/a5/platform/include/aicpu/scope_stats_collector_aicpu.h b/src/a5/platform/include/aicpu/scope_stats_collector_aicpu.h
new file mode 100644
index 000000000..f3d944165
--- /dev/null
+++ b/src/a5/platform/include/aicpu/scope_stats_collector_aicpu.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#include "common/scope_stats_buffer.h"
+
+// Scope-stats collector — platform-owned, runtime-agnostic.
+//
+// Platform owns all collector state and peak-tracking logic. Runtime calls
+// pure-value APIs to report resource usage; no runtime types cross the
+// boundary.
+//
+// Setter symbols (set_scope_stats_enabled, set_platform_scope_stats_base)
+// are exported unconditionally so the host-side sim DeviceRunner's dlsym
+// always resolves.
+
+extern "C" {
+
+// --- Scope lifecycle probes (called by orchestrator begin_scope/end_scope) ---
+
+void scope_stats_on_begin();
+void scope_stats_on_end();
+void scope_stats_on_fatal();
+
+// --- Site tracking ---
+
+void scope_stats_set_pending_site(const char *file, int line);
+
+// --- Setter symbols (always exported) ---
+
+void set_scope_stats_enabled(bool enable);
+void set_platform_scope_stats_base(uint64_t scope_stats_data_base);
+
+// --- Initial sampling callback ---
+
+typedef void (*ScopeStatsInitialSampleFn)(int32_t depth);
+void scope_stats_register_initial_sampler(ScopeStatsInitialSampleFn fn);
+
+// --- Pure-value peak update APIs (called by runtime at instrumentation points) ---
+// Single-producer assumption: peak updates use non-atomic read-max-write.
+// Safe when the orchestrator is single-threaded; concurrent callers may
+// lose peaks silently (acceptable for diagnostic data).
+
+void scope_stats_update_allocator_peaks(int ring_id, uint64_t heap_bytes, int32_t tasks_in_flight);
+void scope_stats_update_tensormap_peak(int32_t tensormap_used);
+void scope_stats_update_pool_peaks(int ring_id, int32_t fanin_used, int32_t dep_used);
+
+// --- Capacity registration (called by runtime at init) ---
+
+void scope_stats_set_ring_capacity(int ring_id, int32_t window_cap, uint64_t heap_cap, int32_t dep_cap);
+void scope_stats_set_tensormap_capacity(int32_t cap);
+
+}  // extern "C"
diff --git a/src/a5/platform/include/common/kernel_args.h b/src/a5/platform/include/common/kernel_args.h
index 9a5416806..e66ddc58f 100644
--- a/src/a5/platform/include/common/kernel_args.h
+++ b/src/a5/platform/include/common/kernel_args.h
@@ -77,10 +77,13 @@ struct KernelArgs {
     // indexes by block_idx and forwards into per-core platform state.
     uint64_t aicore_l2_perf_ring_addrs{0};  // L2PerfAicoreRing* per core; 0 when L2 swimlane is off
     uint64_t aicore_pmu_ring_addrs{0};      // PmuAicoreRing* per core; 0 when PMU is off
+    uint64_t scope_stats_data_base{0};      // ScopeStatsBuffer device pointer; 0 when scope_stats is off.
+                                            // a5 has no halHostRegister — host keeps a separate shadow and
+                                            // refreshes it via rtMemcpy DEVICE_TO_HOST at dump time.
     uint32_t log_level{1};                  // Severity floor: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR, 4=NUL
     uint32_t log_info_v{5};                 // INFO verbosity threshold (0..9); default V5
-    uint32_t enable_profiling_flag{0};      // Profiling umbrella bitmask; bit0=dump_tensor, bit1=l2_swimlane, bit2=pmu
-    uint32_t _pad{0};                       // Alignment padding
+    uint32_t enable_profiling_flag{0};  // Profiling umbrella bitmask; dump_tensor|l2_swimlane|pmu|dep_gen|scope_stats
+    uint32_t _pad{0};                   // Alignment padding
 
     // Device pointer to an 8-byte buffer that the platform AICPU entry writes
     // the run-wall (ns) into. Allocated once at simpler_init, kept resident.
diff --git a/src/a5/platform/include/common/platform_config.h b/src/a5/platform/include/common/platform_config.h
index 5e740bae8..8fdb40c2b 100644
--- a/src/a5/platform/include/common/platform_config.h
+++ b/src/a5/platform/include/common/platform_config.h
@@ -177,6 +177,7 @@ inline double cycles_to_us(uint64_t cycles) {
 #define PROFILING_FLAG_DUMP_TENSOR (1u << 0)
 #define PROFILING_FLAG_L2_SWIMLANE (1u << 1)
 #define PROFILING_FLAG_PMU (1u << 2)
+#define PROFILING_FLAG_SCOPE_STATS (1u << 4)
 #define GET_PROFILING_FLAG(flags, bit) ((((uint32_t)(flags)) & ((uint32_t)(bit))) != 0u)
 #define SET_PROFILING_FLAG(flags, bit) ((flags) |= (uint32_t)(bit))
 #define CLEAR_PROFILING_FLAG(flags, bit) ((flags) &= ~((uint32_t)(bit)))
diff --git a/src/a5/platform/include/common/scope_stats_buffer.h b/src/a5/platform/include/common/scope_stats_buffer.h
new file mode 100644
index 000000000..7780fc333
--- /dev/null
+++ b/src/a5/platform/include/common/scope_stats_buffer.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef PLATFORM_COMMON_SCOPE_STATS_BUFFER_H_
+#define PLATFORM_COMMON_SCOPE_STATS_BUFFER_H_
+
+#include <cstdint>
+
+// Layout shared between AICPU writer (scope_stats_collector_aicpu.cpp on
+// device) and the host reader (ScopeStatsHostBuffer on host). The whole block lives in a
+// host-allocated device-visible memory region; AICPU mutates `header.write_count`
+// and `records[i]` during the run, host snapshots both after the run to write
+// `<output_prefix>/scope_stats.json`.
+//
+// Hot-path semantics: AICPU appends one record per scope_end into the ring
+// using `idx = header.write_count % header.cap`, then increments
+// `write_count`. No locking — single-producer (orchestrator thread) /
+// single-consumer (host post-run). Host never reads while AICPU writes.
+//
+// Capacity (PTO2_SCOPE_STATS_LOG_CAP) is fixed at build time so the layout is
+// stable across host/device builds. 16 384 records × 136 B ≈ 2.1 MiB; the
+// host opts in via `--enable-scope-stats` and the allocation is skipped when
+// the flag is off.
+
+#define PTO2_SCOPE_STATS_LOG_CAP 16384u
+#define PTO2_SCOPE_STATS_MAX_RING_DEPTH 4
+#define PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH 64
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// One record per scope_end. Layout MUST stay in sync with the device-side
+// writer in platform/src/aicpu/scope_stats_collector_aicpu.cpp.
+struct ScopeStatsRecord {
+    uint64_t site_file_addr;      // device-side const char *; for diagnostics the host
+                                  // only logs the raw pointer (string table lives in
+                                  // the orchestration .so, not in shared memory).
+                                  // AICPU also writes a basename copy into site_file_basename.
+    char site_file_basename[32];  // NUL-terminated basename of site_file, captured
+                                  // at append time so the host JSON contains a
+                                  // human-readable path without dereferencing a
+                                  // device pointer.
+    int32_t site_line;
+    int16_t depth;
+    int16_t _pad0;
+    uint64_t heap_bytes[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    int32_t task_in_flight[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    int32_t dep_used[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    int32_t fanin_used[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    int32_t tensormap_used;
+    int32_t _pad1;
+};
+
+struct ScopeStatsHeader {
+    uint64_t write_count;    // Total append count. write_count > cap means the
+                             // ring wrapped; host reports `dropped = write_count - cap`
+                             // and emits `min(cap, write_count)` records starting
+                             // from `(write_count - kept) % cap`.
+    uint32_t cap;            // Fixed at PTO2_SCOPE_STATS_LOG_CAP; copied in by host
+                             // at init so device and host see the same value
+                             // without needing a separate sync.
+    uint32_t fatal_latched;  // AICPU sets to 1 on first fatal. Host uses this
+                             // to stamp the JSON `fatal` field — no separate
+                             // device→host channel needed.
+    // Per-ring capacities — written by AICPU through scope_stats_set_ring_capacity /
+    // scope_stats_set_tensormap_capacity at runtime init (constant for the run, so
+    // writing them once is fine). Host needs them to render the "used/cap" ratio in
+    // JSON without re-introducing a separate device→host query.
+    int32_t task_window_cap[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    uint64_t heap_cap[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    int32_t dep_cap[PTO2_SCOPE_STATS_MAX_RING_DEPTH];
+    int32_t tensormap_cap;
+    int32_t _pad;
+};
+
+struct ScopeStatsBuffer {
+    ScopeStatsHeader header;
+    ScopeStatsRecord records[PTO2_SCOPE_STATS_LOG_CAP];
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // PLATFORM_COMMON_SCOPE_STATS_BUFFER_H_
diff --git a/src/a5/platform/include/host/scope_stats_dump.h b/src/a5/platform/include/host/scope_stats_dump.h
new file mode 100644
index 000000000..e0b9b17e7
--- /dev/null
+++ b/src/a5/platform/include/host/scope_stats_dump.h
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A5_PLATFORM_INCLUDE_HOST_SCOPE_STATS_DUMP_H_
+#define SRC_A5_PLATFORM_INCLUDE_HOST_SCOPE_STATS_DUMP_H_
+
+#include <cinttypes>
+#include <cstdio>
+#include <cstring>
+#include <filesystem>
+#include <string>
+#include <utility>
+
+#include "common/scope_stats_buffer.h"
+#include "common/unified_log.h"
+#include "host/profiling_common/profiler_base.h"
+#include "host/profiling_copy.h"
+
+// Header-only host-side helper for scope_stats. Intentionally NOT modeled on
+// L2PerfCollector / DepGenCollector — scope_stats is a single end-of-run
+// snapshot (no streaming, no mgmt thread, no reconcile), so this whole
+// feature collapses into one class with three short methods. Keeping every
+// host-side scope_stats line in one file isolates the feature from the
+// general device_runner flow: hooking the feature on adds three call sites
+// (init / dump / finalize) and zero protocol details to device_runner.
+//
+// AICPU side is symmetric: layout in `scope_stats_buffer.h`, device probes /
+// writer in `platform/src/aicpu/scope_stats_collector_aicpu.cpp`. The shared
+// layout header is the only file both sides include.
+
+// Memory callbacks — thin aliases for the canonical profiling_common shapes
+// (same pattern as dep_gen_collector / l2_perf_collector / pmu_collector /
+// tensor_dump_collector). On a5 there is no halHostRegister; the host shadow
+// + rtMemcpy DEVICE_TO_HOST pair is wired through ``BufferPoolManager``'s
+// default_host_shadow_register + ops_.copy_from_device, so callers do not
+// need to supply a copy callback themselves.
+using ScopeStatsAllocCallback = profiling_common::ProfAllocCallback;
+using ScopeStatsRegisterCallback = profiling_common::ProfRegisterCallback;
+using ScopeStatsUnregisterCallback = profiling_common::ProfUnregisterCallback;
+using ScopeStatsFreeCallback = profiling_common::ProfFreeCallback;
+
+// BufferPoolManager template stub. ScopeStatsHostBuffer drives the manager
+// purely for its single-buffer alloc_and_register / copy_buffer_from_device /
+// free_buffer methods (the canonical dev↔host shadow + rtMemcpy pair on a5);
+// the streaming ready/done queue side is never touched, so the layout-trait
+// aliases below are unused placeholders required only by the manager's static
+// checks.
+struct ScopeStatsModule {  // Trait bundle handed to BufferPoolManager<> as its template argument.
+    using DataHeader = int;       // placeholder type; scope_stats has no streaming data header
+    using ReadyEntry = int;       // placeholder type; the ready queue is never used here
+    using ReadyBufferInfo = int;  // placeholder type; the ready queue is never used here
+    using FreeQueue = int;        // placeholder type; the free queue is never used here
+    static constexpr int kBufferKinds = 1;  // presumably "one kind of buffer" (the single snapshot) -- confirm
+};
+
+class ScopeStatsHostBuffer {
+public:
+    // Allocate the device-side buffer and set up the host-side view. Returns 0
+    // on success, -1 if allocation fails (the object then stays uninitialized
+    // and every other method is a no-op, so callers can chain without guarding).
+    // Re-initializing releases the previous buffer first instead of leaking it.
+    //
+    // Everything goes through profiling_common::BufferPoolManager so this
+    // collector's dev↔host shadow + rtMemcpy setup matches every pool-based
+    // collector on a5. ``register_cb`` defaults to ``default_host_shadow_register``
+    // (the same fallback ProfilerBase::start installs) when callers leave it null;
+    // the manager allocs the device buffer, mallocs + zeros a host shadow, and pushes it to device.
+    int init(
+        const ScopeStatsAllocCallback &alloc_cb, ScopeStatsRegisterCallback register_cb,
+        const ScopeStatsFreeCallback &free_cb, int device_id
+    ) {
+        finalize(/*unregister_cb=*/nullptr, free_cb);  // no-op unless a previous init() is still live
+        device_id_ = device_id;
+
+        profiling_common::MemoryOps ops;
+        ops.alloc = alloc_cb;
+        ops.free_ = free_cb;
+        ops.reg = register_cb != nullptr ? register_cb : &profiling_common::default_host_shadow_register;
+        ops.copy_to_device = [](void *dev_dst, const void *host_src, std::size_t size) {
+            return profiling_copy_to_device(dev_dst, host_src, size);
+        };
+        ops.copy_from_device = [](void *host_dst, const void *dev_src, std::size_t size) {
+            return profiling_copy_from_device(host_dst, dev_src, size);
+        };
+        manager_.set_memory_context(
+            std::move(ops), /*shared_mem_dev=*/nullptr, /*shared_mem_host=*/nullptr,
+            /*shm_size=*/0, device_id
+        );
+        dev_ptr_ = manager_.alloc_and_register(sizeof(ScopeStatsBuffer), &host_ptr_);
+        if (dev_ptr_ == nullptr) {
+            return -1;
+        }
+        initialized_ = true;
+        return 0;
+    }
+
+    bool is_initialized() const { return initialized_; }
+
+    void *device_ptr() const { return dev_ptr_; }
+
+    // Snapshot the shared region as JSON at <output_dir>/scope_stats.json.
+    // Assumes the device stream has already been synced (matches dep_gen /
+    // l2_perf export ordering), so AICPU writes are fully visible.
+    int dump(const std::string &output_dir) {
+        if (!initialized_ || host_ptr_ == nullptr) return 0;
+        // a5 — refresh host shadow before reading (no halHostRegister, so the
+        // shadow is stale until we rtMemcpy DEVICE_TO_HOST).
+        int rc = manager_.copy_buffer_from_device(host_ptr_, dev_ptr_, sizeof(ScopeStatsBuffer));
+        if (rc != 0) {
+            LOG_ERROR("scope_stats: copy_from_device failed: %d", rc);
+            return rc;
+        }
+        const std::string path = make_path(output_dir);
+        const auto *buf = static_cast<const ScopeStatsBuffer *>(host_ptr_);
+        return write_json(buf, path);
+    }
+
+    // unregister_cb / free_cb are accepted for signature symmetry with a2a3 and
+    // with the other collectors' finalize hooks; the manager already owns the
+    // free path via the MemoryOps stashed at init() time, so they go unused on
+    // a5 (a5 has no halHostRegister, hence no unregister either).
+    void finalize(ScopeStatsUnregisterCallback /*unregister_cb*/, const ScopeStatsFreeCallback & /*free_cb*/) {
+        if (!initialized_) return;
+        manager_.free_buffer(dev_ptr_);  // frees dev + paired host shadow
+        dev_ptr_ = nullptr;
+        host_ptr_ = nullptr;
+        initialized_ = false;
+    }
+
+private:
+    static std::string make_path(const std::string &output_dir) {
+        std::filesystem::path dir(output_dir);
+        std::error_code ec;
+        std::filesystem::create_directories(dir, ec);
+        if (ec) {
+            LOG_WARN("scope_stats: failed to create output dir %s: %s", output_dir.c_str(), ec.message().c_str());
+        }
+        return (dir / "scope_stats.json").string();
+    }
+
+    // Schema (version 2) — flat, not Chrome-trace, because scope_stats is a
+    // list of per-scope_end snapshots, not a timeline. Each metric is rendered
+    // as a `"used/cap"` string so the JSON reads the same as the original
+    // `[ScopeStats]` log line:
+    //   { "version": 2, "fatal": bool,
+    //     "write_count": uint, "cap": uint, "dropped": uint,
+    //     "records": [
+    //       { "site": "file:line", "depth": int,
+    //         "task_window":    ["used/cap", "used/cap", "used/cap", "used/cap"],
+    //         "heap":           ["used/cap", "used/cap", "used/cap", "used/cap"],
+    //         "dep":            ["used/cap", "used/cap", "used/cap", "used/cap"],
+    //         "fanin_used":     [int, int, int, int],
+    //         "tensormap":      "used/cap" },
+    //       ...
+    //     ] }
+    // Returns 0 on success, -1 on a null buffer or any file open/write failure.
+    static int write_json(const ScopeStatsBuffer *buf, const std::string &path) {
+        if (buf == nullptr) {
+            LOG_ERROR("scope_stats: null buffer");
+            return -1;
+        }
+        std::FILE *fp = std::fopen(path.c_str(), "w");
+        if (fp == nullptr) {
+            LOG_ERROR("scope_stats: failed to open %s", path.c_str());
+            return -1;
+        }
+
+        const ScopeStatsHeader &hdr = buf->header;
+        const std::uint64_t write_count = hdr.write_count;
+        // Clamp to the array we own so a corrupt header cannot cause an out-of-bounds read.
+        constexpr std::uint32_t kMaxCap = static_cast<std::uint32_t>(PTO2_SCOPE_STATS_LOG_CAP);
+        const std::uint32_t cap = hdr.cap < kMaxCap ? hdr.cap : kMaxCap;
+        const std::uint64_t kept = write_count > cap ? cap : write_count;
+        const std::uint64_t dropped = write_count > cap ? (write_count - cap) : 0;
+        const std::uint64_t start = write_count - kept;
+
+        std::fprintf(fp, "{\n  \"version\": 2,\n");
+        std::fprintf(fp, "  \"fatal\": %s,\n", hdr.fatal_latched ? "true" : "false");
+        std::fprintf(fp, "  \"write_count\": %" PRIu64 ",\n  \"cap\": %u,\n", write_count, cap);
+        std::fprintf(fp, "  \"dropped\": %" PRIu64 ",\n  \"records\": [", dropped);
+        for (std::uint64_t i = 0; i < kept; i++) {
+            const ScopeStatsRecord &rec = buf->records[(start + i) % cap];
+            if (i) std::fputc(',', fp);
+            std::fprintf(fp, "\n    {");
+            // Bound the print to the field size in case a writer ever drops the NUL terminator.
+            const std::size_t site_len = strnlen(rec.site_file_basename, sizeof(rec.site_file_basename));
+            std::fprintf(
+                fp, "\"site\": \"%.*s:%d\", ", static_cast<int>(site_len), rec.site_file_basename, rec.site_line
+            );
+            std::fprintf(fp, "\"depth\": %d, \"task_window\": ", rec.depth);
+            write_i32_over_i32_array(fp, rec.task_in_flight, hdr.task_window_cap, PTO2_SCOPE_STATS_MAX_RING_DEPTH);
+            std::fprintf(fp, ", \"heap\": ");
+            write_u64_over_u64_array(fp, rec.heap_bytes, hdr.heap_cap, PTO2_SCOPE_STATS_MAX_RING_DEPTH);
+            std::fprintf(fp, ", \"dep\": ");
+            write_i32_over_i32_array(fp, rec.dep_used, hdr.dep_cap, PTO2_SCOPE_STATS_MAX_RING_DEPTH);
+            std::fprintf(fp, ", \"fanin_used\": ");
+            write_i32_array(fp, rec.fanin_used, PTO2_SCOPE_STATS_MAX_RING_DEPTH);
+            std::fprintf(fp, ", \"tensormap\": \"%d/%d\"}", rec.tensormap_used, hdr.tensormap_cap);
+        }
+        std::fprintf(fp, "\n  ]\n}\n");
+        // Buffered output can fail late (e.g. disk full); never report a truncated file as success.
+        const int io_err = std::ferror(fp);
+        if (std::fclose(fp) != 0 || io_err != 0) {
+            LOG_ERROR("scope_stats: failed to write %s", path.c_str());
+            return -1;
+        }
+        LOG_INFO_V1("scope_stats: wrote %" PRIu64 " records (dropped=%" PRIu64 ") to %s", kept, dropped, path.c_str());
+        return 0;
+    }
+
+    static void write_i32_array(std::FILE *fp, const std::int32_t *arr, std::size_t n) {
+        std::fputc('[', fp);
+        for (std::size_t i = 0; i < n; i++) {
+            if (i) std::fputc(',', fp);
+            std::fprintf(fp, "%d", arr[i]);
+        }
+        std::fputc(']', fp);
+    }
+
+    static void
+    write_i32_over_i32_array(std::FILE *fp, const std::int32_t *used, const std::int32_t *cap, std::size_t n) {
+        std::fputc('[', fp);
+        for (std::size_t i = 0; i < n; i++) {
+            if (i) std::fputc(',', fp);
+            std::fprintf(fp, "\"%d/%d\"", used[i], cap[i]);
+        }
+        std::fputc(']', fp);
+    }
+
+    static void
+    write_u64_over_u64_array(std::FILE *fp, const std::uint64_t *used, const std::uint64_t *cap, std::size_t n) {
+        std::fputc('[', fp);
+        for (std::size_t i = 0; i < n; i++) {
+            if (i) std::fputc(',', fp);
+            std::fprintf(fp, "\"%" PRIu64 "/%" PRIu64 "\"", used[i], cap[i]);
+        }
+        std::fputc(']', fp);
+    }
+
+    bool initialized_ = false;
+    int device_id_ = -1;
+    void *dev_ptr_ = nullptr;
+    void *host_ptr_ = nullptr;
+    // Drives the dev↔host shadow + rtMemcpy DEVICE_TO_HOST pair on a5 so
+    // scope_stats's data-copy path matches every pool-based collector's.
+    profiling_common::BufferPoolManager<ScopeStatsModule> manager_;
+};
+
+#endif  // SRC_A5_PLATFORM_INCLUDE_HOST_SCOPE_STATS_DUMP_H_
diff --git a/src/a5/platform/onboard/aicpu/kernel.cpp b/src/a5/platform/onboard/aicpu/kernel.cpp
index 64bf019b3..c9e16628d 100644
--- a/src/a5/platform/onboard/aicpu/kernel.cpp
+++ b/src/a5/platform/onboard/aicpu/kernel.cpp
@@ -19,6 +19,7 @@
 #include "aicpu/platform_regs.h"
 #include "aicpu/platform_aicpu_affinity.h"
 #include "aicpu/pmu_collector_aicpu.h"
+#include "aicpu/scope_stats_collector_aicpu.h"
 #include "aicpu/tensor_dump_aicpu.h"
 #include "runtime.h"
 
@@ -106,6 +107,8 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *a
     set_l2_swimlane_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE));
     set_platform_pmu_base(k_args->pmu_data_base);
     set_pmu_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_PMU));
+    set_scope_stats_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_SCOPE_STATS));
+    set_platform_scope_stats_base(k_args->scope_stats_data_base);
 
     // Affinity gate: drop excess threads before entering runtime
     if (!platform_aicpu_affinity_gate(runtime->aicpu_thread_num, PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH)) {
diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp
index 38242555d..1a72ac8f7 100644
--- a/src/a5/platform/onboard/host/device_runner.cpp
+++ b/src/a5/platform/onboard/host/device_runner.cpp
@@ -544,6 +544,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     if (enable_pmu_) {
         SET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_PMU);
     }
+    if (enable_scope_stats_) {
+        SET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_SCOPE_STATS);
+    }
 
     for (int i = 0; i < num_aicore; i++) {
         runtime.workers[i].aicpu_ready = 0;
@@ -608,6 +611,14 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         }
     }
 
+    if (enable_scope_stats_) {
+        rc = init_scope_stats(device_id_);
+        if (rc != 0) {
+            LOG_ERROR("init_scope_stats failed: %d", rc);
+            return rc;
+        }
+    }
+
     // Cleanup guard for early returns: stops all started collectors so
     // their mgmt + poll threads exit cleanly. stop() is idempotent and a
     // no-op on collectors that never started.
@@ -743,6 +754,12 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         pmu_collector_.reconcile_counters();
     }
 
+    if (enable_scope_stats_ && scope_stats_buf_.is_initialized()) {
+        // Stream sync has already completed; refresh the host shadow from
+        // device memory and write <output_prefix>/scope_stats.json.
+        scope_stats_buf_.dump(output_prefix_);
+    }
+
     // Print handshake results (reads from device memory, must be before free)
     print_handshake_results();
 
@@ -1038,6 +1055,10 @@ int DeviceRunner::finalize() {
     if (pmu_collector_.is_initialized()) {
         pmu_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb);
     }
+    if (scope_stats_buf_.is_initialized()) {
+        scope_stats_buf_.finalize(/*unregister_cb=*/nullptr, prof_free_cb);
+        kernel_args_.args.scope_stats_data_base = 0;
+    }
 
     // Release per-Worker static arena (GM heap + PTO2 SM in a single backing
     // device allocation). Must precede mem_alloc_.finalize() so the arena
@@ -1244,3 +1265,17 @@ int DeviceRunner::init_pmu(
     }
     return rc;
 }
+
+int DeviceRunner::init_scope_stats(int device_id) {
+    // a5: setup is delegated to ScopeStatsHostBuffer, which drives BufferPoolManager
+    // with the default host_shadow_register. That allocs the dev buffer + host
+    // shadow and pushes the zeroed shadow to device, so no explicit
+    // copy_from_device wire-up or pre-zero is needed here, and the JSON's
+    // `write_count` will read 0 even if AICPU never runs (kernel launch failure).
+    const int status = scope_stats_buf_.init(prof_alloc_cb, /*register_cb=*/nullptr, prof_free_cb, device_id);
+    if (status == 0) {
+        // Publish the device address to the kernel args only once the buffer exists.
+        kernel_args_.args.scope_stats_data_base = reinterpret_cast<uint64_t>(scope_stats_buf_.device_ptr());
+    }
+    return status;
+}
diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h
index a07ab28bb..8869df7df 100644
--- a/src/a5/platform/onboard/host/device_runner.h
+++ b/src/a5/platform/onboard/host/device_runner.h
@@ -49,6 +49,7 @@
 #include "host/memory_allocator.h"
 #include "host/l2_perf_collector.h"
 #include "host/pmu_collector.h"
+#include "host/scope_stats_dump.h"
 #include "host/tensor_dump_collector.h"
 #include "load_aicpu_op.h"
 #include "runtime.h"
@@ -300,6 +301,7 @@ class DeviceRunner {
         enable_pmu_ = (enable_pmu > 0);
         pmu_event_type_ = resolve_pmu_event_type(enable_pmu);
     }
+    void set_scope_stats_enabled(bool enable) { enable_scope_stats_ = enable; }
     // Directory under which all diagnostic artifacts (l2_perf_records.json /
     // tensor_dump/ / pmu.csv) land. Required (non-empty) when any diagnostic
     // is enabled; CallConfig::validate() enforces this contract upstream.
@@ -699,11 +701,16 @@ class DeviceRunner {
     bool enable_l2_swimlane_{false};
     bool enable_dump_tensor_{false};
     bool enable_pmu_{false};
+    bool enable_scope_stats_{false};
+    // scope_stats: single end-of-run snapshot, no streaming. All host-side
+    // logic in ScopeStatsHostBuffer; this is the only hook device_runner needs.
+    ScopeStatsHostBuffer scope_stats_buf_;
     L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED};             // resolved from set_l2_swimlane_enabled()
     PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION};  // resolved from set_pmu_enabled()
     std::string output_prefix_{};                                  // diagnostic artifact root directory
 
     int init_pmu(int num_cores, int num_threads, const std::string &csv_path, PmuEventType event_type, int device_id);
+    int init_scope_stats(int device_id);
 
     // Per-run collector teardown: stops mgmt + poll threads on every collector
     // whose init succeeded, in the only safe order (stop() joins mgmt before
diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
index 0cc17c81f..1114f647d 100644
--- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
@@ -389,7 +389,7 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int /*enable_dep_gen*/,
-    const char *output_prefix, PtoRunTiming *out_timing
+    int enable_scope_stats, const char *output_prefix, PtoRunTiming *out_timing
 ) {
     if (out_timing != NULL) {
         out_timing->host_wall_ns = 0;
@@ -455,6 +455,7 @@ int run_prepared(
         runner->set_l2_swimlane_enabled(enable_l2_swimlane);
         runner->set_dump_tensor_enabled(enable_dump_tensor != 0);
         runner->set_pmu_enabled(enable_pmu);
+        runner->set_scope_stats_enabled(enable_scope_stats != 0);
         runner->set_output_prefix(output_prefix);
 
         rc = runner->run(*r, block_dim, aicpu_thread_num);
diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp
index c0d26fbe1..969667530 100644
--- a/src/a5/platform/sim/host/device_runner.cpp
+++ b/src/a5/platform/sim/host/device_runner.cpp
@@ -254,6 +254,20 @@ int DeviceRunner::ensure_binaries_loaded() {
             return -1;
         }
 
+        set_scope_stats_enabled_func_ =
+            reinterpret_cast<void (*)(bool)>(dlsym(aicpu_so_handle_, "set_scope_stats_enabled"));
+        if (set_scope_stats_enabled_func_ == nullptr) {
+            LOG_ERROR("dlsym failed for set_scope_stats_enabled: %s", dlerror());
+            return -1;
+        }
+
+        set_platform_scope_stats_base_func_ =
+            reinterpret_cast<void (*)(uint64_t)>(dlsym(aicpu_so_handle_, "set_platform_scope_stats_base"));
+        if (set_platform_scope_stats_base_func_ == nullptr) {
+            LOG_ERROR("dlsym failed for set_platform_scope_stats_base: %s", dlerror());
+            return -1;
+        }
+
         // Log config travels via the RTLD_GLOBAL HostLogger singleton in
         // libsimpler_log.so — already seeded by simpler_log_init() before the
         // AICPU sim SO was dlopen'd, so no per-SO setter forwarding is needed.
@@ -399,6 +413,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     if (enable_pmu_) {
         SET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_PMU);
     }
+    if (enable_scope_stats_) {
+        SET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_SCOPE_STATS);
+    }
 
     for (int i = 0; i < num_aicore; i++) {
         runtime.workers[i].aicpu_ready = 0;
@@ -460,6 +477,14 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         }
     }
 
+    if (enable_scope_stats_) {
+        rc = init_scope_stats();
+        if (rc != 0) {
+            LOG_ERROR("init_scope_stats failed: %d", rc);
+            return rc;
+        }
+    }
+
     // Cleanup guard for early returns: stops all started collectors so
     // their mgmt + poll threads exit cleanly. stop() is idempotent and a
     // no-op on collectors that never started.
@@ -507,7 +532,8 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     // Check if executors are loaded
     if (aicpu_execute_func_ == nullptr || aicore_execute_func_ == nullptr || set_platform_regs_func_ == nullptr ||
         set_platform_dump_base_func_ == nullptr || set_dump_tensor_enabled_func_ == nullptr ||
-        set_platform_pmu_base_func_ == nullptr || set_pmu_enabled_func_ == nullptr) {
+        set_platform_pmu_base_func_ == nullptr || set_pmu_enabled_func_ == nullptr ||
+        set_scope_stats_enabled_func_ == nullptr || set_platform_scope_stats_base_func_ == nullptr) {
         LOG_ERROR("Executor functions not loaded. Call ensure_binaries_loaded first.");
         return -1;
     }
@@ -519,6 +545,8 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     set_l2_swimlane_enabled_func_(enable_l2_swimlane_);
     set_platform_pmu_base_func_(kernel_args_.pmu_data_base);
     set_pmu_enabled_func_(enable_pmu_);
+    set_scope_stats_enabled_func_(enable_scope_stats_);
+    set_platform_scope_stats_base_func_(kernel_args_.scope_stats_data_base);
 
     // No per-SO log-config push: HostLogger lives in libsimpler_log.so
     // (RTLD_GLOBAL singleton) and the AICPU sim SO reads it directly via the
@@ -634,6 +662,10 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         pmu_collector_.reconcile_counters();
     }
 
+    if (enable_scope_stats_ && scope_stats_buf_.is_initialized()) {
+        scope_stats_buf_.dump(output_prefix_);
+    }
+
     // Print handshake results at end of run
     print_handshake_results();
 
@@ -680,6 +712,8 @@ void DeviceRunner::unload_executor_binaries() {
         set_l2_swimlane_enabled_func_ = nullptr;
         set_platform_pmu_base_func_ = nullptr;
         set_pmu_enabled_func_ = nullptr;
+        set_scope_stats_enabled_func_ = nullptr;
+        set_platform_scope_stats_base_func_ = nullptr;
         aicpu_so_loaded_ = false;
     }
     if (!aicpu_so_path_.empty()) {
@@ -890,6 +924,10 @@ int DeviceRunner::finalize() {
     if (pmu_collector_.is_initialized()) {
         pmu_collector_.finalize(/*unregister_cb=*/nullptr, prof_free_cb);
     }
+    if (scope_stats_buf_.is_initialized()) {
+        scope_stats_buf_.finalize(/*unregister_cb=*/nullptr, prof_free_cb);
+        kernel_args_.scope_stats_data_base = 0;
+    }
 
     // Release any chip callable buffers uploaded via upload_chip_callable_buffer.
     // Pool semantics mirror per-fid binaries: never freed until finalize.
@@ -1106,3 +1144,16 @@ int DeviceRunner::init_pmu(
     }
     return rc;
 }
+
+int DeviceRunner::init_scope_stats() {
+    // a5 sim: the buffer setup is delegated to ScopeStatsHostBuffer, which drives
+    // BufferPoolManager with the default host_shadow_register. Sim's
+    // profiling_copy_* are plain memcpys, so the dev/host shadow path collapses
+    // to one allocation pair without any address-space tricks.
+    // The device address is published to kernel_args_ only when init succeeds.
+    const int status = scope_stats_buf_.init(prof_alloc_cb, /*register_cb=*/nullptr, prof_free_cb, /*device_id=*/-1);
+    if (status == 0) {
+        kernel_args_.scope_stats_data_base = reinterpret_cast<uint64_t>(scope_stats_buf_.device_ptr());
+    }
+    return status;
+}
diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h
index 0aa6e6fa1..772c1663a 100644
--- a/src/a5/platform/sim/host/device_runner.h
+++ b/src/a5/platform/sim/host/device_runner.h
@@ -54,6 +54,7 @@
 #include "host/memory_allocator.h"
 #include "host/l2_perf_collector.h"
 #include "host/pmu_collector.h"
+#include "host/scope_stats_dump.h"
 #include "host/tensor_dump_collector.h"
 #include "runtime.h"
 
@@ -182,6 +183,7 @@ class DeviceRunner {
         enable_pmu_ = (enable_pmu > 0);
         pmu_event_type_ = resolve_pmu_event_type(enable_pmu);
     }
+    void set_scope_stats_enabled(bool enable) { enable_scope_stats_ = enable; }
     // Directory under which all diagnostic artifacts (l2_perf_records.json /
     // tensor_dump/ / pmu.csv) land. Required (non-empty) when any diagnostic
     // is enabled; CallConfig::validate() enforces this contract upstream.
@@ -361,6 +363,8 @@ class DeviceRunner {
     void (*set_platform_l2_perf_base_func_)(uint64_t){nullptr};
     void (*set_l2_swimlane_enabled_func_)(bool){nullptr};
     void (*set_pmu_enabled_func_)(bool){nullptr};
+    void (*set_scope_stats_enabled_func_)(bool){nullptr};
+    void (*set_platform_scope_stats_base_func_)(uint64_t){nullptr};
     std::string aicpu_so_path_;
     std::string aicore_so_path_;
 
@@ -372,6 +376,7 @@ class DeviceRunner {
 
     // PMU profiling (per-task AICore hardware counters)
     PmuCollector pmu_collector_;
+    ScopeStatsHostBuffer scope_stats_buf_;
 
     // Private helper methods — read aicpu_so_binary_ / aicore_kernel_binary_
     // off the runner (populated by set_executors during simpler_init).
@@ -416,11 +421,13 @@ class DeviceRunner {
     bool enable_l2_swimlane_{false};
     bool enable_dump_tensor_{false};
     bool enable_pmu_{false};
+    bool enable_scope_stats_{false};
     L2PerfLevel l2_perf_level_{L2PerfLevel::DISABLED};             // resolved from set_l2_swimlane_enabled()
     PmuEventType pmu_event_type_{PmuEventType::PIPE_UTILIZATION};  // resolved from set_pmu_enabled()
     std::string output_prefix_{};                                  // diagnostic artifact root directory
 
     int init_pmu(int num_cores, int num_threads, const std::string &csv_path, PmuEventType event_type, int device_id);
+    int init_scope_stats();
 
     // Per-run collector teardown: stops mgmt + poll threads on every collector
     // whose init succeeded. Idempotent. Mirrors the onboard helper.
diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp
index 81e9b138f..9c08b30f7 100644
--- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp
@@ -300,7 +300,7 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int /*enable_dep_gen*/,
-    const char *output_prefix, PtoRunTiming *out_timing
+    int enable_scope_stats, const char *output_prefix, PtoRunTiming *out_timing
 ) {
     if (out_timing != NULL) {
         out_timing->host_wall_ns = 0;
@@ -353,6 +353,7 @@ int run_prepared(
         runner->set_l2_swimlane_enabled(enable_l2_swimlane);
         runner->set_dump_tensor_enabled(enable_dump_tensor != 0);
         runner->set_pmu_enabled(enable_pmu);
+        runner->set_scope_stats_enabled(enable_scope_stats != 0);
         runner->set_output_prefix(output_prefix);
 
         rc = runner->run(*r, block_dim, aicpu_thread_num);
diff --git a/src/a5/platform/src/aicpu/scope_stats_collector_aicpu.cpp b/src/a5/platform/src/aicpu/scope_stats_collector_aicpu.cpp
new file mode 100644
index 000000000..a00ac3988
--- /dev/null
+++ b/src/a5/platform/src/aicpu/scope_stats_collector_aicpu.cpp
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+// Platform-layer scope_stats collector.
+//
+// Owns all collector state (depth, peak arrays, shared buffer) and exposes
+// pure-value APIs for runtime to report resource usage. No runtime-specific
+// types cross the boundary.
+
+#include "aicpu/scope_stats_collector_aicpu.h"
+
+#include <cstring>
+
+#include "common/scope_stats_buffer.h"
+
+// ---------------------------------------------------------------------------
+// Collector state
+// ---------------------------------------------------------------------------
+
+// Index of the innermost tracked scope level; -1 while no scope is open.
+int32_t scope_stats_depth = -1;
+// Master switch: on_begin/on_end/on_fatal and the peak-update probes return early while false.
+bool scope_stats_enabled = false;
+
+// Running peaks per open scope level, indexed [level][ring] (tensormap: [level]).
+static uint64_t scope_stats_peak_heap_bytes[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH][PTO2_SCOPE_STATS_MAX_RING_DEPTH] = {};
+static int32_t scope_stats_peak_task_in_flight[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH][PTO2_SCOPE_STATS_MAX_RING_DEPTH] = {};
+static int32_t scope_stats_peak_fanin_used[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH][PTO2_SCOPE_STATS_MAX_RING_DEPTH] = {};
+static int32_t scope_stats_peak_dep_used[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH][PTO2_SCOPE_STATS_MAX_RING_DEPTH] = {};
+static int32_t scope_stats_peak_tensormap_used[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH] = {};
+// Record buffer installed by set_platform_scope_stats_base(); nullptr until then.
+static ScopeStatsBuffer *scope_stats_shared_buf = nullptr;
+
+namespace {
+
+// Optional hook invoked on scope entry instead of zeroing the new level's peaks.
+ScopeStatsInitialSampleFn g_initial_sample_fn = nullptr;
+
+// Call site stashed by scope_stats_set_pending_site() for the next scope_stats_on_begin().
+const char *s_pending_site_file = nullptr;
+int32_t s_pending_site_line = 0;
+
+// Call site of each open tracked scope, indexed by level.
+const char *s_scope_site_file[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH] = {};
+int32_t s_scope_site_line[PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH] = {};
+
+// Number of scopes currently open beyond PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH.
+// They own no peak slot, but each of their scope_end calls must be paired with
+// its own scope_begin rather than popping an enclosing tracked level early.
+int32_t s_untracked_scope_depth = 0;
+
+// Zero every peak slot of scope level `level`.
+inline void reset_level_peaks(int32_t level) {
+    for (int r = 0; r < PTO2_SCOPE_STATS_MAX_RING_DEPTH; r++) {
+        scope_stats_peak_heap_bytes[level][r] = 0;
+        scope_stats_peak_task_in_flight[level][r] = 0;
+        scope_stats_peak_fanin_used[level][r] = 0;
+        scope_stats_peak_dep_used[level][r] = 0;
+    }
+    scope_stats_peak_tensormap_used[level] = 0;
+}
+
+// Return the component after the last '/' or '\\' of `path`, or "(unknown)" for nullptr.
+inline const char *basename_of(const char *path) {
+    if (!path) return "(unknown)";
+    const char *base = path;
+    for (const char *p = path; *p; ++p) {
+        if (*p == '/' || *p == '\\') base = p + 1;
+    }
+    return base;
+}
+
+// Copy the basename of `src` into `dst`, truncating to fit; `dst` is always NUL-terminated.
+inline void copy_basename(char (&dst)[32], const char *src) {
+    const char *base = basename_of(src);
+    size_t i = 0;
+    for (; i + 1 < sizeof(dst) && base[i]; i++)
+        dst[i] = base[i];
+    dst[i] = '\0';
+}
+
+}  // namespace
+
+// ---------------------------------------------------------------------------
+// Setter symbols — always exported, unconditionally compiled
+// ---------------------------------------------------------------------------
+
+extern "C" void set_scope_stats_enabled(bool enable) { scope_stats_enabled = enable; }
+
+// Install the shared record buffer for a run and reset all collector-local state.
+extern "C" void set_platform_scope_stats_base(uint64_t scope_stats_data_base) {
+    scope_stats_shared_buf = reinterpret_cast<ScopeStatsBuffer *>(scope_stats_data_base);
+    // Reset collector-local statics so a prior run that crashed mid-scope (or
+    // reused the same AICPU .so process) can't leak stale depth/peak data into
+    // the new run's records.
+    scope_stats_depth = -1;
+    s_untracked_scope_depth = 0;
+    s_pending_site_file = nullptr;
+    s_pending_site_line = 0;
+    memset(scope_stats_peak_heap_bytes, 0, sizeof(scope_stats_peak_heap_bytes));
+    memset(scope_stats_peak_task_in_flight, 0, sizeof(scope_stats_peak_task_in_flight));
+    memset(scope_stats_peak_fanin_used, 0, sizeof(scope_stats_peak_fanin_used));
+    memset(scope_stats_peak_dep_used, 0, sizeof(scope_stats_peak_dep_used));
+    memset(scope_stats_peak_tensormap_used, 0, sizeof(scope_stats_peak_tensormap_used));
+    memset(s_scope_site_file, 0, sizeof(s_scope_site_file));
+    memset(s_scope_site_line, 0, sizeof(s_scope_site_line));
+    if (scope_stats_shared_buf) {
+        memset(&scope_stats_shared_buf->header, 0, sizeof(scope_stats_shared_buf->header));
+        scope_stats_shared_buf->header.cap = PTO2_SCOPE_STATS_LOG_CAP;
+    }
+}
+
+extern "C" void scope_stats_register_initial_sampler(ScopeStatsInitialSampleFn fn) { g_initial_sample_fn = fn; }
+
+// ---------------------------------------------------------------------------
+// Scope lifecycle probes
+// ---------------------------------------------------------------------------
+
+// Stash the call site that the next scope_stats_on_begin() attaches to its scope.
+extern "C" void scope_stats_set_pending_site(const char *file, int line) {
+    s_pending_site_file = file;
+    s_pending_site_line = line;
+}
+
+// Open a scope: push a tracked level, or count the scope as untracked when the
+// level table is already full.
+extern "C" void scope_stats_on_begin() {
+    if (!scope_stats_enabled) return;
+    // Consume the pending site even for untracked scopes so it cannot leak into
+    // a later scope that begins without setting one.
+    const char *site_file = s_pending_site_file;
+    const int32_t site_line = s_pending_site_line;
+    s_pending_site_file = nullptr;
+    s_pending_site_line = 0;
+    if (scope_stats_depth + 1 >= PTO2_SCOPE_STATS_MAX_SCOPE_DEPTH) {
+        ++s_untracked_scope_depth;
+        return;
+    }
+    ++scope_stats_depth;
+    s_scope_site_file[scope_stats_depth] = site_file;
+    s_scope_site_line[scope_stats_depth] = site_line;
+    if (g_initial_sample_fn) {
+        g_initial_sample_fn(scope_stats_depth);
+    } else {
+        reset_level_peaks(scope_stats_depth);
+    }
+}
+
+// Close a scope: emit one record for the innermost tracked level and pop it.
+// Records are stored at write_count modulo PTO2_SCOPE_STATS_LOG_CAP, so the
+// oldest entries are overwritten once the log is full.
+extern "C" void scope_stats_on_end() {
+    if (!scope_stats_enabled) return;
+    if (s_untracked_scope_depth > 0) {
+        // This end belongs to a scope on_begin could not track; nothing to emit.
+        --s_untracked_scope_depth;
+        return;
+    }
+    if (scope_stats_depth < 0) return;
+    if (scope_stats_shared_buf) {
+        uint64_t idx = scope_stats_shared_buf->header.write_count % PTO2_SCOPE_STATS_LOG_CAP;
+        ScopeStatsRecord &rec = scope_stats_shared_buf->records[idx];
+        rec.site_file_addr = reinterpret_cast<uint64_t>(s_scope_site_file[scope_stats_depth]);
+        copy_basename(rec.site_file_basename, s_scope_site_file[scope_stats_depth]);
+        rec.site_line = s_scope_site_line[scope_stats_depth];
+        rec.depth = static_cast<int16_t>(scope_stats_depth);
+        for (int r = 0; r < PTO2_SCOPE_STATS_MAX_RING_DEPTH; r++) {
+            rec.heap_bytes[r] = scope_stats_peak_heap_bytes[scope_stats_depth][r];
+            rec.task_in_flight[r] = scope_stats_peak_task_in_flight[scope_stats_depth][r];
+            rec.dep_used[r] = scope_stats_peak_dep_used[scope_stats_depth][r];
+            rec.fanin_used[r] = scope_stats_peak_fanin_used[scope_stats_depth][r];
+        }
+        rec.tensormap_used = scope_stats_peak_tensormap_used[scope_stats_depth];
+        ++scope_stats_shared_buf->header.write_count;
+    }
+    reset_level_peaks(scope_stats_depth);
+    s_scope_site_file[scope_stats_depth] = nullptr;
+    s_scope_site_line[scope_stats_depth] = 0;
+    --scope_stats_depth;
+}
+
+// Set the fatal flag in the shared buffer header; repeated calls are harmless.
+extern "C" void scope_stats_on_fatal() {
+    if (!scope_stats_enabled) return;
+    if (!scope_stats_shared_buf) return;
+    scope_stats_shared_buf->header.fatal_latched = 1;
+}
+
+// ---------------------------------------------------------------------------
+// Pure-value peak update APIs — called by runtime at instrumentation points
+// ---------------------------------------------------------------------------
+
+// Raise the heap / task-in-flight peaks of every open tracked level for `ring_id`.
+extern "C" void scope_stats_update_allocator_peaks(int ring_id, uint64_t heap_bytes, int32_t tasks_in_flight) {
+    if (!scope_stats_enabled || scope_stats_depth < 0) return;
+    if (ring_id < 0 || ring_id >= PTO2_SCOPE_STATS_MAX_RING_DEPTH) return;
+    for (int d = 0; d <= scope_stats_depth; d++) {
+        if (heap_bytes > scope_stats_peak_heap_bytes[d][ring_id]) scope_stats_peak_heap_bytes[d][ring_id] = heap_bytes;
+        if (tasks_in_flight > scope_stats_peak_task_in_flight[d][ring_id])
+            scope_stats_peak_task_in_flight[d][ring_id] = tasks_in_flight;
+    }
+}
+
+// Raise the tensormap peak of every open tracked level.
+extern "C" void scope_stats_update_tensormap_peak(int32_t tensormap_used) {
+    if (!scope_stats_enabled || scope_stats_depth < 0) return;
+    for (int d = 0; d <= scope_stats_depth; d++) {
+        if (tensormap_used > scope_stats_peak_tensormap_used[d]) scope_stats_peak_tensormap_used[d] = tensormap_used;
+    }
+}
+
+// Raise the fanin / dep pool peaks of every open tracked level for `ring_id`.
+extern "C" void scope_stats_update_pool_peaks(int ring_id, int32_t fanin_used, int32_t dep_used) {
+    if (!scope_stats_enabled || scope_stats_depth < 0) return;
+    if (ring_id < 0 || ring_id >= PTO2_SCOPE_STATS_MAX_RING_DEPTH) return;
+    for (int d = 0; d <= scope_stats_depth; d++) {
+        if (fanin_used > scope_stats_peak_fanin_used[d][ring_id]) scope_stats_peak_fanin_used[d][ring_id] = fanin_used;
+        if (dep_used > scope_stats_peak_dep_used[d][ring_id]) scope_stats_peak_dep_used[d][ring_id] = dep_used;
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Capacity registration — called by runtime at init
+// ---------------------------------------------------------------------------
+
+// Publish one ring's capacities in the shared buffer header (no-op without a buffer).
+extern "C" void scope_stats_set_ring_capacity(int ring_id, int32_t window_cap, uint64_t heap_cap, int32_t dep_cap) {
+    if (!scope_stats_shared_buf) return;
+    if (ring_id < 0 || ring_id >= PTO2_SCOPE_STATS_MAX_RING_DEPTH) return;
+    scope_stats_shared_buf->header.task_window_cap[ring_id] = window_cap;
+    scope_stats_shared_buf->header.heap_cap[ring_id] = heap_cap;
+    scope_stats_shared_buf->header.dep_cap[ring_id] = dep_cap;
+}
+
+// Publish the tensormap pool capacity in the shared buffer header (no-op without a buffer).
+extern "C" void scope_stats_set_tensormap_capacity(int32_t cap) {
+    if (!scope_stats_shared_buf) return;
+    scope_stats_shared_buf->header.tensormap_cap = cap;
+}
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
index f71766618..cf448bba1 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
@@ -123,6 +123,11 @@ typedef struct PTO2RuntimeOps {
     );
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
     TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args);
+
+    // Stash the call-site of the next PTO2ScopeGuard so the [ScopeStats]
+    // collector can log it. Always present to keep ops-table layout stable
+    // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0.
+    void (*scope_set_site)(const char *file, int line);
 } PTO2RuntimeOps;
 
 /**
@@ -361,10 +366,13 @@ static inline void set_tensor_data(const Tensor &tensor, uint32_t ndims, const u
  */
 class PTO2ScopeGuard {
 public:
-    explicit PTO2ScopeGuard(PTO2ScopeMode mode = PTO2ScopeMode::AUTO) :
+    explicit PTO2ScopeGuard(
+        PTO2ScopeMode mode = PTO2ScopeMode::AUTO, const char *file = __builtin_FILE(), int line = __builtin_LINE()
+    ) :
         rt_(current_runtime()) {
         if (!rt_->ops->is_fatal(rt_)) {
             rt_->pending_scope_mode = mode;
+            if (rt_->ops->scope_set_site) rt_->ops->scope_set_site(file, line);
             rt_->ops->scope_begin(rt_);
         }
     }
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index 05ac105a8..36555a40b 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -37,6 +37,10 @@
 extern "C" void set_dump_tensor_selective_mode(bool enable);
 extern "C" void set_dump_tensor_task_mask(uint64_t task_id, uint64_t mask);
 
+#if PTO2_PROFILING
+#include "aicpu/scope_stats_collector_aicpu.h"
+#endif
+
 // =============================================================================
 // Orchestrator Profiling (compile-time toggle)
 // =============================================================================
@@ -146,6 +150,12 @@ static int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code)
 static void
 orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *func, const char *fmt, va_list args) {
     int32_t latched_code = orch_mark_fatal(orch, error_code);
+#if PTO2_PROFILING
+    // Latch the fatal flag in the scope-stats buffer header before the FATAL
+    // line is reported. The call only sets a flag (idempotent), so it is safe
+    // to make from every cascaded report_fatal.
+    scope_stats_on_fatal();
+#endif
 
     if (fmt == nullptr || fmt[0] == '\0') {
         if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) {
@@ -232,6 +242,11 @@ static bool append_fanin_or_fail(
     }
     entry->slot_state = prod_state;
     fanin_builder->count++;
+#if PTO2_PROFILING
+    scope_stats_update_pool_peaks(
+        ring_id, fanin_pool.used(), orch->scheduler ? orch->scheduler->ring_sched_states[ring_id].dep_pool.used() : 0
+    );
+#endif
     return true;
 }
 
@@ -321,6 +336,10 @@ static bool prepare_task(
         orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK);
         return false;
     }
+#if PTO2_PROFILING
+    scope_stats_update_allocator_peaks(ring_id, allocator.heap_used_bytes(), allocator.active_count());
+    scope_stats_update_tensormap_peak(orch->tensor_map.current_used());
+#endif
 
     out->task_id = PTO2TaskId::make(ring_id, static_cast<uint32_t>(out->alloc_result.task_id));
     out->slot_state = &orch->sm_header->rings[ring_id].get_slot_state_by_slot(out->alloc_result.slot);
@@ -415,6 +434,14 @@ bool PTO2OrchestratorState::init_from_layout(
     orch->scope_stack_capacity = layout.scope_stack_capacity;
     orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
 
+#if PTO2_PROFILING
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto &alloc = orch->rings[r].task_allocator;
+        scope_stats_set_ring_capacity(r, alloc.window_size(), alloc.heap_capacity(), 0);
+    }
+    scope_stats_set_tensormap_capacity(orch->tensor_map.pool_capacity());
+#endif
+
     return true;
 }
 
@@ -428,7 +455,17 @@ void PTO2OrchestratorState::destroy() {
     orch->scope_begins = nullptr;
 }
 
-void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }
+void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) {
+    this->scheduler = scheduler;
+#if PTO2_PROFILING
+    // Publish every ring's capacities again, now with the scheduler's dep-pool size (0 when detached).
+    for (int ring = 0; ring < PTO2_MAX_RING_DEPTH; ring++) {
+        auto &allocator = rings[ring].task_allocator;
+        const int32_t dep_capacity = scheduler ? scheduler->ring_sched_states[ring].dep_pool.capacity : 0;
+        scope_stats_set_ring_capacity(ring, allocator.window_size(), allocator.heap_capacity(), dep_capacity);
+    }
+#endif
+}
 
 // =============================================================================
 // Scope Management
@@ -467,6 +504,18 @@ void PTO2OrchestratorState::begin_scope(PTO2ScopeMode mode) {
     if (mode == PTO2ScopeMode::MANUAL && !already_in_manual_scope) {
         orch->manual_begin_depth = orch->scope_stack_top;
     }
+#if PTO2_PROFILING
+    scope_stats_on_begin();
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto &alloc = orch->rings[r].task_allocator;
+        scope_stats_update_allocator_peaks(r, alloc.heap_used_bytes(), alloc.active_count());
+        scope_stats_update_pool_peaks(
+            r, orch->rings[r].fanin_pool.used(),
+            orch->scheduler ? orch->scheduler->ring_sched_states[r].dep_pool.used() : 0
+        );
+    }
+    scope_stats_update_tensormap_peak(orch->tensor_map.current_used());
+#endif
 }
 
 void PTO2OrchestratorState::end_scope() {
@@ -476,6 +525,14 @@ void PTO2OrchestratorState::end_scope() {
     }
     assert(orch->scope_stack_top >= 0 && "Scope stack underflow");
 
+    // Snapshot peak intra-scope queue fill BEFORE the orchestrator drains
+    // pending tasks via scheduler->on_scope_end. The user is measuring how
+    // much ring/heap the work submitted inside this scope holds at its peak,
+    // not the residual after teardown.
+#if PTO2_PROFILING
+    scope_stats_on_end();
+#endif
+
 #if PTO2_ORCH_PROFILING
     uint64_t _se0 = get_sys_cnt_aicpu();
 #endif
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
index 5a3e3d3d3..2d6493595 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
@@ -181,6 +181,11 @@ class PTO2TaskAllocator {
     uint64_t heap_top() const { return heap_top_; }
     uint64_t heap_capacity() const { return heap_size_; }
 
+    uint64_t heap_used_bytes() const {  // wrapped distance from heap_tail_ to heap_top_
+        if (heap_size_ == 0) return 0;  // zero-sized heap: avoid the modulo by zero below
+        return (heap_top_ + heap_size_ - heap_tail_) % heap_size_;  // NOTE(review): top == tail reads as 0
+    }
+
 private:
     // --- Task Ring ---
     PTO2TaskDescriptor *descriptors_ = nullptr;
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
index c801d5c15..122611e3f 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
@@ -28,6 +28,9 @@
 
 #include "aicpu/device_time.h"
 #include "common/unified_log.h"
+#if PTO2_PROFILING
+#include "aicpu/scope_stats_collector_aicpu.h"
+#endif
 
 // Weak fallback for HOST .so builds (never called, but satisfies linker).
 // The AICPU build links the strong symbol from platform/.../device_time.cpp.
@@ -231,6 +234,14 @@ void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, cons
     memcpy(ptr, &value, elem_size);
 }
 
+// Ops-table entry that hands the call-site captured by PTO2ScopeGuard to the
+// [ScopeStats] collector. The slot is always present in the struct to keep
+// the layout stable; at PTO2_PROFILING=0 we fill nullptr so the orchestration
+// .so's null-check skips it.
+#if PTO2_PROFILING
+static void scope_set_site_impl(const char *file, int line) { scope_stats_set_pending_site(file, line); }
+#endif
+
 static const PTO2RuntimeOps s_runtime_ops = {
     .submit_task = submit_task_impl,
     .scope_begin = rt_scope_begin,
@@ -246,6 +257,11 @@ static const PTO2RuntimeOps s_runtime_ops = {
     .set_tensor_data = set_tensor_data,
     .alloc_tensors = alloc_tensors_impl,
     .submit_dummy_task = submit_dummy_task_impl,
+#if PTO2_PROFILING
+    .scope_set_site = scope_set_site_impl,
+#else
+    .scope_set_site = nullptr,
+#endif
 };
 
 // =============================================================================
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index 1da622407..51bb95248 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -89,6 +89,11 @@ struct PTO2RuntimeOps {
     );
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
     TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args);
+
+    // Stash the call-site captured by PTO2ScopeGuard into the [ScopeStats]
+    // collector. Always present to keep ops-table layout stable across
+    // PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0.
+    void (*scope_set_site)(const char *file, int line);
 };
 
 /**
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index 39d6e4ad2..2e13fdbb3 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -373,6 +373,13 @@ struct PTO2TensorMap {
         return task_local_id & (task_window_sizes[ring_id] - 1);
     }
 
+    // Occupancy accessors for the scope-stats probes. The collector itself only
+    // receives plain values; the probe call sites in pto_orchestrator.cpp read
+    // these and stay gated by PTO2_PROFILING. The accessors are declared
+    // unconditionally: they are trivial and need nothing profiling-specific.
+    int32_t current_used() const { return next_entry_idx - free_num; }
+    int32_t pool_capacity() const { return pool_size; }
+
     // new_entry only allocates memory, does not assign attributes
     PTO2TensorMapEntry *new_entry() {
         if (free_num > 0) {
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
index 32887d0be..2aff666c1 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
@@ -49,6 +49,10 @@
     } while (0)
 #endif
 
+#if PTO2_PROFILING
+#include "aicpu/scope_stats_collector_aicpu.h"
+#endif
+
 // =============================================================================
 // Ready Queue (Lock-free bounded MPMC — Vyukov design)
 // =============================================================================
@@ -720,6 +724,9 @@ struct PTO2SchedulerState {
                     early_finished++;
                 } else {
                     producer->fanout_head = rss.dep_pool.prepend(producer->fanout_head, ws);
+#if PTO2_PROFILING
+                    scope_stats_update_pool_peaks(ws->ring_id, 0, rss.dep_pool.used());
+#endif
                 }
                 producer->unlock_fanout();
             });
diff --git a/src/common/task_interface/call_config.h b/src/common/task_interface/call_config.h
index b8afbd32b..58ca0076b 100644
--- a/src/common/task_interface/call_config.h
+++ b/src/common/task_interface/call_config.h
@@ -11,9 +11,13 @@
 
 /**
  * CallConfig — per-NEXT_LEVEL-task config. Carries execution knobs
- * (block_dim, aicpu_thread_num) plus the four parallel diagnostics
+ * (block_dim, aicpu_thread_num) plus the five parallel diagnostics
  * sub-features under the profiling umbrella: `enable_l2_swimlane` (swimlane),
- * `enable_dump_tensor`, `enable_pmu`, and `enable_dep_gen`.
+ * `enable_dump_tensor`, `enable_pmu`, `enable_dep_gen`, and
+ * `enable_scope_stats`. All five require `output_prefix` because they each
+ * write a sibling artifact into that directory
+ * (`l2_perf_records.json` / `tensor_dump/` / `pmu.csv` / `deps.json` /
+ * `scope_stats.json`).
  *
  * `block_dim == 0` is a sentinel for "auto" — DeviceRunner resolves it at
  * run() time to the max block_dim the AICore stream allows
@@ -32,9 +36,9 @@
  *
  * `output_prefix` is a NUL-terminated directory path under which all
  * diagnostic artifacts (l2_perf_records.json / tensor_dump/ / pmu.csv /
- * submit_trace.bin) are written. The caller is responsible for filling it
- * whenever any diagnostic flag is enabled — `validate()` enforces this
- * contract at every submit/run entry point so the runtime never has to
+ * deps.json / scope_stats.json) are written. The caller is responsible for
+ * filling it whenever any diagnostic flag is enabled — `validate()` enforces
+ * this contract at every submit/run entry point so the runtime never has to
  * invent a path.
  */
 
@@ -52,10 +56,12 @@ struct CallConfig {
     int32_t enable_dump_tensor = 0;
     int32_t enable_pmu = 0;  // 0 = disabled; >0 = enabled, value selects event type
     int32_t enable_dep_gen = 0;
+    int32_t enable_scope_stats = 0;  // writes <output_prefix>/scope_stats.json
     char output_prefix[1024] = {};
 
     bool diagnostics_any() const noexcept {
-        return enable_l2_swimlane != 0 || enable_dump_tensor != 0 || enable_pmu != 0 || enable_dep_gen != 0;
+        return enable_l2_swimlane != 0 || enable_dump_tensor != 0 || enable_pmu != 0 || enable_dep_gen != 0 ||
+               enable_scope_stats != 0;
     }
 
     bool output_prefix_set() const noexcept { return output_prefix[0] != '\0'; }
@@ -67,10 +73,11 @@ struct CallConfig {
         if (diagnostics_any() && !output_prefix_set()) {
             throw std::invalid_argument(
                 "CallConfig: output_prefix must be set whenever any of "
-                "enable_l2_swimlane / enable_dump_tensor / enable_pmu / enable_dep_gen is enabled"
+                "enable_l2_swimlane / enable_dump_tensor / enable_pmu / enable_dep_gen / "
+                "enable_scope_stats is enabled"
             );
         }
     }
 };
 #pragma pack(pop)
-static_assert(sizeof(CallConfig) == 6 * sizeof(int32_t) + 1024, "CallConfig wire layout drift");
+static_assert(sizeof(CallConfig) == 7 * sizeof(int32_t) + 1024, "CallConfig wire layout drift");
diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp
index 2ee392ab1..a3dd90eb7 100644
--- a/src/common/worker/chip_worker.cpp
+++ b/src/common/worker/chip_worker.cpp
@@ -323,7 +323,8 @@ RunTiming ChipWorker::run(int32_t callable_id, const ChipStorageTaskArgs *args,
     PtoRunTiming timing{0, 0};
     int rc = run_prepared_fn_(
         device_ctx_, rt, callable_id, args, config.block_dim, config.aicpu_thread_num, config.enable_l2_swimlane,
-        config.enable_dump_tensor, config.enable_pmu, config.enable_dep_gen, config.output_prefix, &timing
+        config.enable_dump_tensor, config.enable_pmu, config.enable_dep_gen, config.enable_scope_stats,
+        config.output_prefix, &timing
     );
     if (rc != 0) {
         throw std::runtime_error("run_prepared failed with code " + std::to_string(rc));
diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h
index e1632eb2a..e4fb958c5 100644
--- a/src/common/worker/chip_worker.h
+++ b/src/common/worker/chip_worker.h
@@ -143,7 +143,7 @@ class ChipWorker {
         int (*)(void *, int, const uint8_t *, size_t, const uint8_t *, size_t, const uint8_t *, size_t);
     using PrepareCallableFn = int (*)(void *, int32_t, const void *);
     using RunPreparedFn =
-        int (*)(void *, void *, int32_t, const void *, int, int, int, int, int, int, const char *, PtoRunTiming *);
+        int (*)(void *, void *, int32_t, const void *, int, int, int, int, int, int, int, const char *, PtoRunTiming *);
     using UnregisterCallableFn = int (*)(void *, int32_t);
     using GetAicpuDlopenCountFn = size_t (*)(void *);
     using FinalizeDeviceFn = int (*)(void *);
diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h
index c4f6b7adf..e3fa4eb68 100644
--- a/src/common/worker/pto_runtime_c_api.h
+++ b/src/common/worker/pto_runtime_c_api.h
@@ -194,7 +194,7 @@ int prepare_callable(DeviceContextHandle ctx, int32_t callable_id, const void *c
 int run_prepared(
     DeviceContextHandle ctx, RuntimeHandle runtime, int32_t callable_id, const void *args, int block_dim,
     int aicpu_thread_num, int enable_l2_swimlane, int enable_dump_tensor, int enable_pmu, int enable_dep_gen,
-    const char *output_prefix, PtoRunTiming *out_timing
+    int enable_scope_stats, const char *output_prefix, PtoRunTiming *out_timing
 );
 
 /**
diff --git a/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py
index 9a43d54ec..39cfd778c 100644
--- a/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py
+++ b/tests/st/a2a3/host_build_graph/prepared_callable/test_prepared_callable.py
@@ -96,7 +96,7 @@ def compute_golden(self, args, params):
         a, b = args.a, args.b
         args.f[:] = (a + b + 1) * (a + b + 2)
 
-    def _run_and_validate_l2(
+    def _run_and_validate_l2(  # noqa: PLR0913
         self,
         worker,
         callable_obj,
@@ -107,6 +107,7 @@ def _run_and_validate_l2(
         enable_dump_tensor=False,
         enable_pmu=0,
         enable_dep_gen=False,
+        enable_scope_stats=False,
         output_prefix="",
     ):
         params = case.get("params", {})
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/scope_stats/test_scope_stats.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/scope_stats/test_scope_stats.py
new file mode 100644
index 000000000..9756f90fd
--- /dev/null
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/scope_stats/test_scope_stats.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""scope_stats smoke — capture pipeline produces a usable ``scope_stats.json``.
+
+Re-uses ``vector_example`` (outer executor scope + one inner ``PTO2_SCOPE()``).
+With ``--enable-scope-stats`` the platform collector
+(``scope_stats_collector_aicpu.h``) appends one record per scope_end into
+the host-allocated buffer, and the host dumps it as JSON. Enabling the
+flag is the entire user surface for the new API — the runtime takes care
+of the ``set_pending_site`` / ``on_begin`` / ``update_*_peaks`` / ``on_end``
+calls. Schema lives in ``docs/dfx/scope-stats.md`` §5.
+"""
+
+import json
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.scene_test import _outputs_dir, _sanitize_for_filename
+
+KERNELS_BASE = "../../../../../../examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels"
+_REQUIRED_RECORD_FIELDS = {"site", "depth", "task_window", "heap", "dep", "fanin_used", "tensormap"}
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestScopeStats(SceneTestCase):
+    """Vector example with --enable-scope-stats, then assert scope_stats.json."""
+
+    CALLABLE = {
+        "orchestration": {
+            "source": f"{KERNELS_BASE}/orchestration/example_orchestration.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": f"{KERNELS_BASE}/aiv/kernel_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": f"{KERNELS_BASE}/aiv/kernel_add_scalar.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": f"{KERNELS_BASE}/aiv/kernel_mul.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "default",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {},
+        },
+    ]
+
+    def generate_args(self, params):
+        SIZE = 128 * 128
+        return TaskArgsBuilder(
+            Tensor("a", torch.full((SIZE,), 2.0, dtype=torch.float32)),
+            Tensor("b", torch.full((SIZE,), 3.0, dtype=torch.float32)),
+            Tensor("f", torch.zeros(SIZE, dtype=torch.float32)),
+        )
+
+    def compute_golden(self, args, params):
+        args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b)
+
+    def test_run(self, st_platform, st_worker, request):
+        """Run the scene, then check the scope-stats artifact when the flag is on."""
+        super().test_run(st_platform, st_worker, request)
+        if request.config.getoption("--enable-scope-stats", default=False):
+            # Every case that targets this platform must have produced its artifact.
+            for scenario in (c for c in self.CASES if st_platform in c["platforms"]):
+                self._validate_scope_stats_artifact(scenario)
+
+    def _validate_scope_stats_artifact(self, case):
+        """Validate the newest per-case ``scope_stats.json`` (v2, not fatal, ≥2 records: outer + inner)."""
+        label = _sanitize_for_filename(f"TestScopeStats_{case['name']}")
+        run_dirs = sorted(_outputs_dir().glob(f"{label}_*"), key=lambda d: d.stat().st_mtime)
+        assert run_dirs, (
+            f"no output directory under {_outputs_dir()} matching {label}_* — "
+            f"--enable-scope-stats was on but the run produced no per-case output dir"
+        )
+        artifact = run_dirs[-1] / "scope_stats.json"
+        assert artifact.exists(), f"scope_stats.json missing under {run_dirs[-1]} — collector finalize failed?"
+        payload = json.loads(artifact.read_text())
+        assert payload.get("version") == 2, f"unexpected schema version: {payload!r}"
+        assert payload.get("fatal") is False, f"run latched fatal: {payload!r}"
+        records = payload.get("records", [])
+        assert len(records) >= 2, f"expected ≥2 records (inner + outer), got {records!r}"
+        first_bad = next((rec for rec in records if not _REQUIRED_RECORD_FIELDS <= rec.keys()), None)
+        assert first_bad is None, f"record missing fields: {first_bad!r}"
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py
index b40ffcde4..fbd0e6d09 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py
@@ -97,7 +97,7 @@ def generate_args(self, params):
     def compute_golden(self, args, params):
         args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b)
 
-    def _run_and_validate_l2(
+    def _run_and_validate_l2(  # noqa: PLR0913
         self,
         worker,
         callable_obj,
@@ -108,6 +108,7 @@ def _run_and_validate_l2(
         enable_dump_tensor=False,
         enable_pmu=0,
         enable_dep_gen=False,
+        enable_scope_stats=False,
         output_prefix="",
     ):
         params = case.get("params", {})
diff --git a/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py b/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py
index 2a0158dc7..1b0810714 100644
--- a/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py
+++ b/tests/st/a5/host_build_graph/prepared_callable/test_prepared_callable.py
@@ -85,7 +85,7 @@ def compute_golden(self, args, params):
         # dump_tensor orchestration computes f = (a + b) + 1
         args.f[:] = (args.a + args.b) + 1
 
-    def _run_and_validate_l2(
+    def _run_and_validate_l2(  # noqa: PLR0913
         self,
         worker,
         callable_obj,
@@ -96,6 +96,7 @@ def _run_and_validate_l2(
         enable_dump_tensor=False,
         enable_pmu=0,
         enable_dep_gen=False,
+        enable_scope_stats=False,
         output_prefix="",
     ):
         params = case.get("params", {})
diff --git a/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py
index dbede6ac7..a2bf0b2a1 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py
+++ b/tests/st/a5/tensormap_and_ringbuffer/prepared_callable/test_prepared_callable.py
@@ -98,7 +98,7 @@ def generate_args(self, params):
     def compute_golden(self, args, params):
         args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b)
 
-    def _run_and_validate_l2(
+    def _run_and_validate_l2(  # noqa: PLR0913
         self,
         worker,
         callable_obj,
@@ -109,6 +109,7 @@ def _run_and_validate_l2(
         enable_dump_tensor=False,
         enable_pmu=0,
         enable_dep_gen=False,
+        enable_scope_stats=False,
         output_prefix="",
     ):
         params = case.get("params", {})
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index 89314d800..374179754 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -90,6 +90,7 @@ add_library(a2a3_rt_objs OBJECT
     ${A2A3_RUNTIME_DIR}/shared/pto_shared_memory.cpp
     ${A2A3_RUNTIME_DIR}/scheduler/pto_scheduler.cpp
     ${A2A3_RUNTIME_DIR}/shared/pto_tensormap.cpp
+    ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/src/aicpu/scope_stats_collector_aicpu.cpp
     ${CMAKE_SOURCE_DIR}/stubs/test_stubs.cpp
 )
 target_include_directories(a2a3_rt_objs PUBLIC