diff --git a/docs/dfx/dep_gen.md b/docs/dfx/dep_gen.md
index 867472b7c..c1e83f8d4 100644
--- a/docs/dfx/dep_gen.md
+++ b/docs/dfx/dep_gen.md
@@ -1,4 +1,4 @@
-# dep_gen — Complete Per-Submit Dependency Graph (v2, Tensor-Annotated)
+# dep_gen — Complete Per-Submit Dependency Graph (Tensor-Annotated)
 
 ## 1. Background & Motivation
 
@@ -61,8 +61,8 @@ appear too.
   This is the guarantee against silent shotgun modifications — anyone
   who changes `compute_task_fanin` semantics will trip the gate
   immediately and know to update the annotated mirror.
-- **Output.** `<output_prefix>/deps.json` — v2 schema with `tasks[]`,
-  `tensors[]`, and tensor-annotated `edges[]` (see §4).
+- **Output.** `<output_prefix>/deps.json` — strided-Tensor schema with
+  `tasks[]`, `tensors[]`, and tensor-annotated `edges[]` (see §4).
 
 ---
 
@@ -97,29 +97,35 @@ The standard SceneTest path
 
 ---
 
-## 4. Output: `deps.json` (v2)
+## 4. Output: `deps.json`
 
 ```json
 {
-  "version": 2,
   "tasks": [
-    {"task_id": "0",          "scope": "auto"},
-    {"task_id": "4294967296", "scope": "auto"}
+    {"task_id": "0",          "scope": "auto", "args": []},
+    {"task_id": "4294967296", "scope": "auto", "args": [
+      {"idx": 0, "type": "INPUT", "tensor_id": "13451765318376212391",
+       "dtype": "FLOAT32", "shape": [16384],
+       "start_offset": "0", "strides": [1]}
+    ]}
   ],
   "tensors": [
     {"tensor_id": "13451765318376212391",
      "buffer_addr": "29204938752", "version": 0,
-     "dtype": "FLOAT32", "ndims": 1, "raw_shapes": [16384]}
+     "dtype": "FLOAT32", "buffer_numel": "16384"}
   ],
   "edges": [
     {"pred": "0", "succ": "4294967296", "arg": 0, "source": "creator",
      "tensor_id": "13451765318376212391", "consumer_dtype": "FLOAT32",
-     "consumer_shape": [16384], "consumer_offset": [0]},
+     "consumer_shape": [16384],
+     "consumer_start_offset": "0", "consumer_strides": [1]},
     {"pred": "4294967296", "succ": "4294967298", "arg": 0, "source": "tensormap",
      "overlap": "covered",
      "tensor_id": "9514117477438350967", "consumer_dtype": "FLOAT32",
-     "consumer_shape": [16384], "consumer_offset": [0],
-     "producer_shape": [16384], "producer_offset": [0]}
+     "consumer_shape": [16384],
+     "consumer_start_offset": "0", "consumer_strides": [1],
+     "producer_shape": [16384],
+     "producer_start_offset": "0", "producer_strides": [1]}
   ]
 }
 ```
@@ -153,8 +159,9 @@ this block.
 One entry per unique `(buffer_addr, version)` pair touched by the trace.
 `tensor_id` is a stable FNV-1a 64-bit hash of that pair — identical
 inputs across runs yield the same id, making `deps.json` files diffable.
-`raw_shapes` describes the **underlying buffer**, not the slice;
-per-edge slice information lives in the `edges[]` entries.
+`buffer_numel` is the element count of the **underlying buffer**, not the
+slice; per-edge slice geometry (`*_shape` + `*_start_offset` + `*_strides`,
+prefixed `consumer_` / `producer_`) lives in the `edges[]` entries.
 
 ### `edges[]`
 
@@ -168,8 +175,12 @@ Each edge is `{pred, succ}` plus annotation. Fields:
 | `overlap` | string | `source=tensormap` | `covered` (producer slice fully contains consumer slice) or `other` |
 | `tensor_id` | uint64 (string) | not `explicit` | Identity of the underlying tensor; cross-references `tensors[]` |
 | `consumer_dtype` | string | not `explicit` | Element type the consumer reads as |
-| `consumer_shape`, `consumer_offset` | uint32 array | not `explicit` | The slice the consumer actually reads |
-| `producer_shape`, `producer_offset` | uint32 array | `source=tensormap` | The slice the producer wrote (recovered from the live tensormap entry) |
+| `consumer_shape` | uint32 array | not `explicit` | Per-dim element count of the consumer slice |
+| `consumer_start_offset` | uint64 (string) | not `explicit` | Element offset of the consumer slice into the buffer |
+| `consumer_strides` | uint32 array | not `explicit` | Per-dim stride (in elements) of the consumer slice; runtime invariant > 0 |
+| `producer_shape` | uint32 array | `source=tensormap` | Per-dim element count of the producer slice |
+| `producer_start_offset` | uint64 (string) | `source=tensormap` | Element offset of the producer slice |
+| `producer_strides` | uint32 array | `source=tensormap` | Per-dim stride of the producer slice; runtime invariant > 0 |
 
 A single `(pred, succ)` pair can appear in `edges[]` multiple times if
 the producer drives the consumer through multiple slots, multiple
@@ -222,9 +233,10 @@ Each arg row carries a 4-line block:
 
 ```text
 arg<i> <ARG_TYPE>[ ?] <Tname>:<dtype>
-raw:    [...]    # underlying buffer (from tensors[].raw_shapes)
-shape:  [...]    # slice this slot accesses
-offset: [...]    # slice start in the raw buffer
+storage:      <buffer_numel> elems   # underlying buffer size
+shape:        [...]                  # slice this slot accesses
+strides:      [...]                  # per-dim element strides
+start_offset: <N> (elem)             # slice start in the underlying buffer
 ```
 
 `<Tname>` is `T<idx>` from `tensors[]` order, so two slots referencing
@@ -270,7 +282,7 @@ for this tool.
 
 ## 6. Relationship to `fanout[]` + Validation Gate
 
-When checking fanout coverage, project v2 edges down to a
+When checking fanout coverage, project annotated edges down to a
 `{(pred, succ)}` set first — the per-edge annotation distinguishes
 sources / args / slices, so the raw `edges[]` count is a superset of the
 underlying task-pair count.
@@ -342,7 +354,7 @@ list; only the dep_gen replay graph loses the tail.
 | AICPU writer | `src/a2a3/platform/{include,src}/aicpu/dep_gen_collector_aicpu.{h,cpp}` | Single-instance write path; weak-fallback exported to host build |
 | Host collector | `src/a2a3/platform/{include/host,src/host}/dep_gen_collector.{h,cpp}` | `ProfilerBase<DepGenCollector, DepGenModule>` — drains ring → `records_` vector |
 | Capture call site | `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp` `submit_task_common` | One conditional block that snapshots inputs into the ring when `is_dep_gen_enabled()`; fires for both `submit_task` and `submit_dummy_task`. Dep-only tasks land in the record stream with valid tensor/dep info but no kernel_id field (the schema does not carry kernel_id), so replay treats them as ordinary dep nodes — viewers do not currently distinguish dummy from real tasks. |
-| Replay | `src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.{h,cpp}` | Pure CPU; runs dual-pass differential replay — `compute_task_fanin` (oracle) + inlined STEP A/B mirror (annotated) against two `PTO2TensorMap` instances. Emits v2 `deps.json` when both passes agree per record. |
+| Replay | `src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.{h,cpp}` | Pure CPU; runs dual-pass differential replay — `compute_task_fanin` (oracle) + inlined STEP A/B mirror (annotated) against two `PTO2TensorMap` instances. Emits `deps.json` when both passes agree per record. |
 | Device-runner hookup | `src/a2a3/platform/{onboard,sim}/host/device_runner.cpp` | post-`reconcile_counters` calls `dep_gen_replay_emit_deps_json(records.data(), records.size(), deps_path, nullptr)` |
 | Viewer | `simpler_setup/tools/deps_to_graph.py` | `deps.json` → pan/zoom HTML |
 | Test | `tests/st/a2a3/tensormap_and_ringbuffer/dep_gen_capture/test_dep_gen_capture.py` | Smoke test + `fanout ⊆ deps` validation gate |
diff --git a/docs/dfx/l2-swimlane-profiling.md b/docs/dfx/l2-swimlane-profiling.md
index 6f1dfe680..4f7cc24f3 100644
--- a/docs/dfx/l2-swimlane-profiling.md
+++ b/docs/dfx/l2-swimlane-profiling.md
@@ -25,14 +25,13 @@ available.
 
 ## 2. Overview
 
-- **Per-task AICore timing** — `start_time`, `end_time`,
-  `duration`, plus AICPU-stamped `dispatch_time` / `finish_time`.
+- **Per-task AICore timing** — `start_time_us`, `end_time_us`,
+  `duration_us`, plus AICPU-stamped `dispatch_time_us` / `finish_time_us`.
 - **Per-task fanout chain** — successor `task_id`s recorded in
   the L2 record so dependency arrows show up in the Perfetto
   view.
 - **AICPU scheduler phases** — per-iteration breakdown into
-  `SCHED_COMPLETE` / `SCHED_DISPATCH` / `SCHED_SCAN` /
-  `SCHED_IDLE_WAIT`.
+  `complete` / `dispatch` / `scan` / `idle`.
 - **Orchestrator phase summary** — cumulative cycle counts for
   the orchestrator's nine sub-steps (sync / alloc / params /
   lookup / heap / insert / fanin / finalize / scope_end).
@@ -57,10 +56,10 @@ backward-compatible with the old boolean behavior).
 | Level | Collects | Notes |
 | ----- | -------- | ----- |
 | 0 | Nothing (disabled) | Default when flag is absent |
-| 1 | AICore timing only (start/end/task_id/func_id/core_type) | No AICPU timestamps, no fanout |
-| 2 | + dispatch_time, finish_time, fanout | Full per-task record |
-| 3 | + Scheduler phases (`SCHED_*`) | Skips orchestrator phases |
-| 4 | + Orchestrator phases | Full collection |
+| 1 | AICore timing only (start_time_us/end_time_us/task_id/func_id/core_type) | No AICPU timestamps, no fanout |
+| 2 | + dispatch_time_us, finish_time_us, fanout | Full per-task record |
+| 3 | + scheduler phases (`aicpu_scheduler_phases[]`) | Skips orchestrator phases |
+| 4 | + orchestrator phases (`aicpu_orchestrator_phases[]`) | Full collection |
 
 ```bash
 # Standalone runner
@@ -88,8 +87,8 @@ dispatch/finish timestamps and fanout are recorded only at
 level >= 2, scheduler phase records only at level >= 3, and
 orchestrator phase records only at level >= 4.
 
-The JSON output `"version"` field directly reflects the
-perf_level: `1` = AICore timing only, `2` = +dispatch/fanout,
+The JSON output `"l2_perf_level"` field is the captured perf_level:
+`1` = AICore timing only, `2` = +dispatch/fanout,
 `3` = +scheduler phases, `4` = +orchestrator phases.
 
 `--rounds > 1` collects only on the **first** round so warm-up
@@ -118,22 +117,29 @@ you pass to `swimlane_converter`. Important fields per task:
 
 | Field | Meaning |
 | ----- | ------- |
-| `task_id` | Runtime task id, hex (low 32 bits = AICore register token; full 64 bits filled by AICPU) |
+| `task_id` | Runtime task id (`(ring_id << 32) \| local_id`); the ring component is also exposed separately as `ring_id` |
 | `func_id` | Kernel function id |
-| `core_type` | `0` = AIC, `1` = AIV |
-| `start_time` / `end_time` / `duration` | AICore device-clock cycles (`get_sys_cnt`) |
-| `dispatch_time` | AICPU timestamp when this task was dispatched |
-| `finish_time` | AICPU timestamp when AICPU observed FIN |
-| `fanout[]` / `fanout_count` | Successor task ids, used by Perfetto dependency arrows |
+| `core_id` / `core_type` | Physical core index and `"aic"` / `"aiv"` string |
+| `start_time_us` / `end_time_us` / `duration_us` | AICore execution window in microseconds |
+| `dispatch_time_us` | AICPU timestamp when this task was dispatched (filled at level >= 2) |
+| `finish_time_us` | AICPU timestamp when AICPU observed FIN (filled at level >= 2) |
+| `fanout[]` / `fanout_count` | Successor task ids (level >= 2), used by Perfetto dependency arrows |
 
-Phase records (per scheduler thread):
+Phase records (per scheduler thread, level >= 3 for
+`aicpu_scheduler_phases[]` and level >= 4 for
+`aicpu_orchestrator_phases[]`):
 
 | Field | Meaning |
 | ----- | ------- |
-| `start_time` / `end_time` | Phase start / end timestamps |
-| `loop_iter` | Scheduler loop iteration number |
-| `phase_id` | One of `SCHED_COMPLETE` / `SCHED_DISPATCH` / `SCHED_SCAN` / `SCHED_IDLE_WAIT`, or `ORCH_*` for orchestrator phases |
+| `start_time_us` / `end_time_us` | Phase start / end timestamps in microseconds |
+| `phase` | Lowercase phase name. Scheduler: `complete` / `dispatch` / `scan` / `idle`. Orchestrator: `orch_*` (sync / alloc / params / lookup / heap / insert / fanin / finalize / scope_end). |
+| `loop_iter` (scheduler) / `submit_idx` (orchestrator) | Iteration / submit-call counter for the producing thread |
 | `tasks_processed` (scheduler) / `task_id` (orchestrator) | Phase-specific union field |
+| `pop_hit` / `pop_miss` (dispatch only) | Ready-queue pop deltas since the previous dispatch emit |
+
+`core_to_thread[]` (level >= 3) maps `core_id` (array index) to the
+scheduler thread index that retired that core's tasks (`-1` =
+unassigned).
 
 ### 3.3 Convert and view in Perfetto
 
@@ -162,7 +168,7 @@ in. The trace contains:
   channel). Each task shows `func_name(t<task_id>)`; dependency
   arrows follow `fanout[]`.
 - **AICPU View** — scheduler thread lanes with per-iteration
-  phase blocks coloured by `phase_id`.
+  phase blocks coloured by `phase`.
 - **AICPU Scheduler** — orchestrator phase summary at the top.
 
 When the run also emitted a device log (`device-*` file under
@@ -206,12 +212,13 @@ schema and L3 example.
 What the swimlane shows:
 
 - **Per-task wall-clock placement.** Where each task ran on which
-  AICore, with start / end / duration in device cycles.
-- **Dispatch and finish overhead.** `dispatch_time` and
-  `finish_time` come from AICPU, so the gap between
-  `dispatch_time` and `start_time` is the AICPU→AICore
-  hand-off latency, and the gap between `end_time` and
-  `finish_time` is the FIN-observation latency.
+  AICore, with `start_time_us` / `end_time_us` / `duration_us` in
+  microseconds (converted from device cycles).
+- **Dispatch and finish overhead.** `dispatch_time_us` and
+  `finish_time_us` come from AICPU, so the gap between
+  `dispatch_time_us` and `start_time_us` is the AICPU→AICore
+  hand-off latency, and the gap between `end_time_us` and
+  `finish_time_us` is the FIN-observation latency.
 - **Dependency chains.** `fanout[]` lets Perfetto draw arrows
   between predecessor and successor tasks.
 - **Scheduler-loop time decomposition.** Per-iteration AICPU
@@ -279,7 +286,7 @@ platform-owned AICore state, and never reassigned — so AICore is
 fully decoupled from any AICPU-side records-buffer rotation. AICPU,
 on observing FIN, validates the slot's register token, copies the slot
 record into the current `L2PerfBuffer::records[count]`, fills
-`func_id` / `core_type` / `dispatch_time` / `finish_time` / `fanout`,
+`func_id` / `core_type` / `dispatch_time_us` / `finish_time_us` / `fanout`,
 advances `count`, and rotates the records buffer in place when it
 fills up. The ring is sized to the runtime's in-flight issue depth
 (2 for dual-issue today; raise to the next power of two when issue
@@ -619,7 +626,7 @@ data (only `tensormap_and_ringbuffer` does, and only when
 `AicpuPhaseHeader` was not initialized. Verify the runtime sets
 the magic in its scheduler init path.
 
-**`dispatch_time` < `finish_time` mismatch.** Verify the runtime
+**`dispatch_time_us` < `finish_time_us` mismatch.** Verify the runtime
 overwrites `task_id` with the full encoding on FIN
 (`tensormap_and_ringbuffer` does
 `(ring_id << 32) | local_id`); a half-filled record means AICore
diff --git a/simpler_setup/tools/README.md b/simpler_setup/tools/README.md
index 49817aa53..a1a548440 100644
--- a/simpler_setup/tools/README.md
+++ b/simpler_setup/tools/README.md
@@ -133,7 +133,7 @@ Analyze AICPU scheduler overhead and quantitatively decompose the sources of Tai
 
 `sched_overhead_analysis` reads two artifacts produced by the runtime:
 
-1. **Perf profiling data** (`l2_perf_records_*.json`, v2): per-task Exec / Head OH / Tail OH time breakdowns plus `aicpu_scheduler_phases` — per-thread, per-loop-iteration phase records carrying scan / complete / dispatch / idle timings and per-emit pop_hit / pop_miss deltas.
+1. **Perf profiling data** (`l2_perf_records_*.json`, l2_perf_level >= 3): per-task Exec / Head OH / Tail OH time breakdowns plus `aicpu_scheduler_phases` — per-thread, per-loop-iteration phase records carrying scan / complete / dispatch / idle timings and per-emit pop_hit / pop_miss deltas.
 2. **`deps.json`** (optional, dep_gen replay output): structural task DAG. When colocated with the perf JSON, Part 2 prints per-thread fanout / fanin aggregates derived from it.
 
 ### Basic Usage
@@ -167,7 +167,7 @@ Output is emitted in three parts:
 - **Part 2: AICPU scheduler loop breakdown** — per-scheduler-thread loop statistics, per-phase (scan / complete / dispatch / idle) time ratios, pop_hit / pop_miss totals, and (when deps.json is available) per-thread fanout / fanin aggregates
 - **Part 3: Tail OH distribution & cause analysis** — Tail OH quantile distribution (P10–P99), correlation between scheduler loop iteration time and Tail OH, and data-driven insights into the dominant phase
 
-The perf JSON must have non-empty `aicpu_scheduler_phases` (rerun the case with `--enable-l2-swimlane` if the tool reports the field is missing).
+The perf JSON must be captured at l2_perf_level >= 3 so that `aicpu_scheduler_phases` is non-empty (rerun the case with `--enable-l2-swimlane <N>`, N >= 3, if the tool reports the field is missing).
 
 ---
 
@@ -283,15 +283,35 @@ The analysis tools share the same input format - the `l2_perf_records_*.json` fi
 
 ```json
 {
+  "l2_perf_level": 4,
   "tasks": [
     {
       "task_id": 0,
       "func_id": 0,
-      "core_id": 0,
-      "core_type": "aic",
-      "start_time_us": 100.0,
-      "end_time_us": 250.5,
-      "duration_us": 150.5
+      "core_id": 7,
+      "core_type": "aiv",
+      "ring_id": 0,
+      "start_time_us": 47.46,
+      "end_time_us": 55.9,
+      "duration_us": 8.44,
+      "dispatch_time_us": 45.94,
+      "finish_time_us": 60.52,
+      "fanout": [4294967299, 4294967297, 4294967296],
+      "fanout_count": 3
+    },
+    {
+      "task_id": 4294967296,
+      "func_id": 1,
+      "core_id": 7,
+      "core_type": "aiv",
+      "ring_id": 1,
+      "start_time_us": 68.68,
+      "end_time_us": 70.42,
+      "duration_us": 1.74,
+      "dispatch_time_us": 68.24,
+      "finish_time_us": 71.2,
+      "fanout": [4294967298],
+      "fanout_count": 1
     }
   ]
 }
@@ -300,6 +320,15 @@ The analysis tools share the same input format - the `l2_perf_records_*.json` fi
 Dependency edges come from `deps.json` (dep_gen replay) at post-process time —
 not from the perf JSON. See [`swimlane_converter --deps-json`](#swimlane_converter).
 
+Top-level layout depends on `l2_perf_level`:
+
+- All levels: `l2_perf_level`, `tasks[]` (per-task fields above).
+- `>= 3`: also `aicpu_scheduler_phases[]` (per-thread phase records:
+  scan / complete / dispatch / idle) and `core_to_thread[]` (core_id →
+  scheduler thread index).
+- `>= 4`: also `aicpu_orchestrator_phases[]` (per-task orchestrator
+  phase records).
+
 ### Kernel Config Format
 
 To display meaningful function names in the output, provide a `kernel_config.py` file:
@@ -379,10 +408,11 @@ For batch-run hardware regression, see the dev-only script
 - Check the kernel_config.py file format
 - Make sure every KERNELS entry has a 'func_id' and 'name' field
 
-### Error: Unsupported version
+### Error: Unsupported l2_perf_level
 
-- The tools only support version 1 of the profiling data format
-- Regenerate the profiling data with the latest runtime
+- The tools accept l2_perf_level 1–4 (the integer captured at runtime
+  via `--enable-l2-swimlane <N>`)
+- Regenerate the profiling data with a supported level
 
 ### Error: Perf JSON missing required fields for scheduler overhead analysis
 
@@ -407,7 +437,7 @@ For batch-run hardware regression, see the dev-only script
 | ---- | ---- | ------- | ------ |
 | `l2_perf_records_*.json` | Runtime | Raw timing profiling data | JSON |
 | `merged_swimlane_*.json` | swimlane_converter | Perfetto visualization | Chrome Trace Event JSON |
-| `deps.json` | Runtime (dep_gen replay) | Structural task dependency graph + per-edge tensor info | JSON (v2) |
+| `deps.json` | Runtime (dep_gen replay) | Structural task dependency graph + per-edge tensor info | JSON |
 | `deps_graph.html` | deps_to_graph | Pan/zoom dependency graph viewer | HTML (self-contained) |
 
 ---
diff --git a/simpler_setup/tools/deps_to_graph.py b/simpler_setup/tools/deps_to_graph.py
index a5f1f917f..ec185e6ce 100644
--- a/simpler_setup/tools/deps_to_graph.py
+++ b/simpler_setup/tools/deps_to_graph.py
@@ -46,7 +46,7 @@
 def _normalize_task_id(v):
     """Unsigned 64-bit task id (matches deps.json edges and l2_perf task_id).
 
-    Accepts ints (legacy) and strings (current schema): deps.json v2 emits all
+    Accepts ints (legacy) and strings (current schema): deps.json emits all
     uint64 fields as quoted strings to dodge JSON-number precision loss in
     JavaScript-based consumers, since tensor_ids (FNV hashes) and buffer
     addresses routinely exceed Number.MAX_SAFE_INTEGER (2^53 - 1)."""
@@ -107,31 +107,26 @@ def fmt(task_id):
 
 
 def _load_deps_edges(deps_path):
-    """Parse deps.json (v2) into renderer-friendly pieces.
+    """Parse deps.json into renderer-friendly pieces.
 
     Returns a 5-tuple:
         edges: sorted list of unique (pred, succ) pairs — what the graph
-            renders as arrows. v2 may have multiple annotated edges sharing
-            the same (pred, succ) (distinct arg / source / slice); they
-            collapse to one arrow here.
+            renders as arrows. Multiple annotated edges sharing the same
+            (pred, succ) (distinct arg / source / slice) collapse to one
+            arrow here.
         nodes: sorted list of all referenced task ids.
         annotations: dict[(pred, succ) -> list[dict]] of annotation rows
-            (one per annotated edge in v2), keyed in insertion order so
+            (one per annotated edge), keyed in insertion order so
             ``--show-tensor-info`` can resolve per-edge tensor identities
             and target the right input port on the consumer node.
-        tensor_table: dict[tensor_id -> dict] from the v2 tensors[] block.
-        task_table: dict[task_id -> dict] from the v2 tasks[] block,
+        tensor_table: dict[tensor_id -> dict] from the tensors[] block.
+        task_table: dict[task_id -> dict] from the tasks[] block,
             carrying the per-arg input/output slot info that the
             ``--show-tensor-info`` view renders as compartments inside each
             task node.
-
-    Raises ValueError if ``version`` is not 3 — older versions are no longer supported.
     """
     with open(deps_path) as f:
         data = json.load(f)
-    version = data.get("version")
-    if version != 3:
-        raise ValueError(f"deps.json version={version!r}; only v3 is supported (regenerate with current dep_gen)")
     edges_raw = data.get("edges", [])
     seen: set[tuple[int, int]] = set()
     edges: list[tuple[int, int]] = []
@@ -400,9 +395,8 @@ def _arg_row_html(arg, tensor_table, side):
     tname = _short_tensor_label(tid, tensor_table)
     dtype = arg.get("dtype")
     shape = arg.get("shape")
-    # v3 schema: per-slot strided descriptor (start_offset is uint64 quoted as
-    # string, stride is per-dim int32 array). Older v2 args used a multi-dim
-    # `offset` array — viewers consuming older logs should bump producer first.
+    # Strided-Tensor per-slot descriptor: `start_offset` is a uint64 quoted as a
+    # string; `strides` is a per-dim array of element strides.
     start_offset = arg.get("start_offset")
     strides = arg.get("strides")
     buffer_numel = None
@@ -573,7 +567,7 @@ def emit_dot(edges, nodes, meta, direction="LR", annotations=None, tensor_table=
         # port; pick one representative annotation per (pred, succ) for the
         # producer-port match (multiple annotations sharing the pair all
         # target the same arg in practice — distinct args produce distinct
-        # edges in v2).
+        # edges).
         rows = annotations.get((pred, succ), [])
         if not rows:
             lines.append(f"  {_node_id(pred)} -> {_node_id(succ)};")
diff --git a/simpler_setup/tools/sched_overhead_analysis.py b/simpler_setup/tools/sched_overhead_analysis.py
index 5ab4e7b36..1ac737070 100644
--- a/simpler_setup/tools/sched_overhead_analysis.py
+++ b/simpler_setup/tools/sched_overhead_analysis.py
@@ -11,7 +11,8 @@
 
 Inputs:
   1. Per-task perf profiling data (l2_perf_records_*.json) with
-     ``aicpu_scheduler_phases`` populated by ``--enable-l2-swimlane``.
+     ``aicpu_scheduler_phases`` populated by ``--enable-l2-swimlane`` at
+     level >= 3.
   2. deps.json (optional, dep_gen replay output) colocated with the perf JSON,
      used to derive per-thread fanout / fanin DAG stats.
 
@@ -29,8 +30,9 @@
 
 
 def _to_uint64(v):
-    """Coerce JSON-encoded uint64 (int or string after the deps.json v2 schema
-    bump in #769) to a Python int. Returns None when unparseable."""
+    """Coerce a JSON-encoded uint64 (int, or string — deps.json quotes uint64s
+    so JavaScript-based consumers don't lose precision past 2^53 - 1) to a
+    Python int. Returns None when unparseable."""
     try:
         n = int(v)
     except (TypeError, ValueError):
@@ -292,7 +294,7 @@ def run_analysis(  # noqa: PLR0912, PLR0915
         print_sources: Whether to print selected input files.
         perf_data: Optional pre-parsed perf JSON dict. When provided, skip
             re-reading from disk — main() already parses the file to probe
-            for v2 phase data, so passing the result through saves a second
+            for phase data, so passing the result through saves a second
             load on large artifacts.
         deps_json_path: Optional deps.json (dep_gen replay output) co-located
             with the perf JSON. When present, per-thread fanout / fanin
@@ -485,7 +487,7 @@ def run_analysis(  # noqa: PLR0912, PLR0915
     else:
         pop_hit = pop_miss = 0
         pop_hit_rate = 0.0
-        print("  Pop: (no per-emit pop deltas in input — needs --enable-l2-swimlane)")
+        print("  Pop: (no per-emit pop deltas in input — needs --enable-l2-swimlane at level >= 3)")
 
     print()
     print("=" * 90)
diff --git a/simpler_setup/tools/swimlane_converter.py b/simpler_setup/tools/swimlane_converter.py
index 5841617cb..ebe4c6820 100644
--- a/simpler_setup/tools/swimlane_converter.py
+++ b/simpler_setup/tools/swimlane_converter.py
@@ -92,7 +92,8 @@ def read_perf_data(filepath):
         filepath: Path to input JSON file
 
     Returns:
-        dict: Parsed performance data with key:
+        dict: Parsed performance data with keys:
+            - l2_perf_level
             - tasks (list)
 
     Raises:
@@ -101,8 +102,13 @@ def read_perf_data(filepath):
     with open(filepath) as f:
         data = json.load(f)
 
-    if "tasks" not in data:
-        raise ValueError("Missing required field: tasks")
+    required_fields = ["l2_perf_level", "tasks"]
+    for field in required_fields:
+        if field not in data:
+            raise ValueError(f"Missing required field: {field}")
+
+    if data["l2_perf_level"] not in [1, 2, 3, 4]:
+        raise ValueError(f"Unsupported l2_perf_level: {data['l2_perf_level']} (expected 1, 2, 3, or 4)")
 
     return data
 
@@ -135,13 +141,6 @@ def load_deps_json(deps_path):
     if not isinstance(edges, list):
         print(f"Warning: {deps_path} has no 'edges' array", file=sys.stderr)
         return None
-    version = data.get("version")
-    if version != 2:
-        print(
-            f"Warning: {deps_path} version={version!r}; only v2 is supported.",
-            file=sys.stderr,
-        )
-        return None
     # The converter only needs flow-event endpoints (not the per-edge tensor
     # annotations). Project annotated edges down to a (pred, succ) set and
     # dedup so multiple annotated edges sharing the same pair (distinct arg
@@ -394,15 +393,15 @@ def generate_chrome_trace_json(  # noqa: PLR0912, PLR0915
         output_path: Path to output JSON file
         func_id_to_name: Optional dict mapping func_id to function name
         verbose: Print progress information
-        scheduler_phases: Optional list of per-thread phase record lists (version 2)
-        orchestrator_phases: Optional list of per-task orchestrator phase records (version 2)
+        scheduler_phases: Optional list of per-thread phase record lists (l2_perf_level >= 3)
+        orchestrator_phases: Optional list of per-task orchestrator phase records (l2_perf_level >= 4)
         core_to_thread: Optional list mapping core_id (index) to scheduler thread index (-1 = unassigned)
 
     Generates processes in the trace:
         - pid=1 "AICore View": start_time_us to end_time_us (kernel execution)
         - pid=2 "AICPU View": dispatch_time_us to finish_time_us (AICPU perspective)
-        - pid=3 "AICPU Scheduler": scheduler phase bars (version 2)
-        - pid=4 "AICPU Orchestrator": orchestrator phase bars or summary (version 2)
+        - pid=3 "AICPU Scheduler": scheduler phase bars (l2_perf_level >= 3)
+        - pid=4 "AICPU Orchestrator": orchestrator phase bars or summary (l2_perf_level >= 4)
     """
     if verbose:
         print("Generating Chrome Trace JSON...")
@@ -699,7 +698,7 @@ def generate_chrome_trace_json(  # noqa: PLR0912, PLR0915
         if hb_violation_count > 0:
             print(f"  Happens-before violations: {hb_violation_count} edge(s) flagged as 'hb_violation'")
 
-    # AICPU Scheduler phase events (version 2)
+    # AICPU Scheduler phase events (l2_perf_level >= 3)
     if scheduler_phases:
         # Process metadata
         events.append(
@@ -756,7 +755,7 @@ def generate_chrome_trace_json(  # noqa: PLR0912, PLR0915
                 }
                 events.append(event)
 
-    # AICPU Orchestrator lane (version 2)
+    # AICPU Orchestrator lane (l2_perf_level >= 4)
     #
     # Per-event AicpuPhaseRecord[] is the single source of truth for
     # orchestrator timing. There is no separate aggregate summary — the
@@ -1147,10 +1146,12 @@ def _resolve_output_path(args, input_path):
 
 
 def _print_verbose_data_info(data, verbose):
-    """Print verbose summary of loaded performance data including phase counts."""
+    """Print verbose summary of loaded performance data, including phase counts
+    when present (l2_perf_level >= 3)."""
     if not verbose:
         return
     print("\n=== Performance Data ===")
+    print(f"  L2 perf level: {data['l2_perf_level']}")
     print(f"  Task Count: {len(data['tasks'])}")
     if data["tasks"]:
         start_times = [t["start_time_us"] for t in data["tasks"]]
diff --git a/src/a2a3/platform/src/host/l2_perf_collector.cpp b/src/a2a3/platform/src/host/l2_perf_collector.cpp
index 745dab8b6..715ee66d6 100644
--- a/src/a2a3/platform/src/host/l2_perf_collector.cpp
+++ b/src/a2a3/platform/src/host/l2_perf_collector.cpp
@@ -595,7 +595,9 @@ int L2PerfCollector::export_swimlane_json() {
     // Fanout fields are emitted as empty/zero — the device-side hot path no
     // longer carries them. Downstream (swimlane_converter.py) joins fanout
     // from the sibling deps.json (dep_gen output).
+    int l2_perf_level = static_cast<int>(l2_perf_level_);
     outfile << "{\n";
+    outfile << "  \"l2_perf_level\": " << l2_perf_level << ",\n";
     outfile << "  \"tasks\": [\n";
 
     for (size_t i = 0; i < tagged_records.size(); ++i) {
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
index 566249ac7..863299dbc 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
@@ -166,8 +166,9 @@ Thread X: Scheduler summary: total_time=XXXus, loops=XXX, tasks_scheduled=XXX
 ```
 
 Per-thread fanout / fanin edge counts and ready-queue pop hit / miss
-stats live in the v2 JSON `aicpu_scheduler_phases[]` and `deps.json`;
-consume them via `simpler_setup/tools/sched_overhead_analysis.py`.
+stats live in `aicpu_scheduler_phases[]` (in `l2_perf_records_*.json`
+captured at l2_perf_level >= 3) and `deps.json`; consume them via
+`simpler_setup/tools/sched_overhead_analysis.py`.
 
 ---
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
index 808fce372..027805918 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
@@ -11,9 +11,9 @@
 
 /**
  * @file dep_gen_replay.cpp
- * @brief Replay in-memory DepGenRecord stream → deps.json (v2, tensor-annotated)
- *        via a host-resident PTO2TensorMap, with a differential check against
- *        the runtime template `compute_task_fanin`.
+ * @brief Replay in-memory DepGenRecord stream → deps.json (strided tensor
+ *        representation, tensor-annotated) via a host-resident PTO2TensorMap,
+ *        with a differential check against the runtime template `compute_task_fanin`.
  *
  * Two passes run per record against two parallel PTO2TensorMap instances that
  * evolve in lockstep:
@@ -113,7 +113,7 @@ int32_t count_outputs(const DepGenRecord *records, size_t n) {
 }
 
 // ---------------------------------------------------------------------------
-// v2 schema accumulators
+// JSON output accumulators (in-memory tables that get serialized at the end)
 // ---------------------------------------------------------------------------
 
 // Edge categories — matches the three places a runtime fanin edge is born.
@@ -240,7 +240,7 @@ uint64_t make_tensor_id(uint64_t buffer_addr, int32_t version) {
     return h;
 }
 
-// Register a tensor in the v2 tensors[] table on first sight of (addr,
+// Register a tensor in the tensors[] table on first sight of (addr,
 // version). buffer_numel describes the underlying storage size in elements;
 // per-edge fields describe the slice via (start_offset, strides[]). Subsequent
 // sightings of the same (addr, version) are no-ops.
@@ -288,7 +288,7 @@ void fill_producer(EdgeAnnot &e, const PTO2TensorMapEntry &entry) {
 }
 
 // ---------------------------------------------------------------------------
-// JSON writer (v2)
+// JSON writer
 // ---------------------------------------------------------------------------
 
 void write_uint_array(std::ofstream &out, const uint32_t *data, uint32_t n) {
@@ -300,7 +300,7 @@ void write_uint_array(std::ofstream &out, const uint32_t *data, uint32_t n) {
     out << ']';
 }
 
-bool write_deps_json_v2(
+bool write_deps_json(
     const char *path, const std::vector<TaskTableEntry> &tasks, const std::vector<TensorTableEntry> &tensors,
     const std::vector<EdgeAnnot> &edges
 ) {
@@ -309,13 +309,11 @@ bool write_deps_json_v2(
         LOG_ERROR("dep_gen replay: failed to open '%s' for write", path);
         return false;
     }
-    // Schema v3: strided tensor representation.
-    //   tensors[*]:   buffer_numel replaces raw_shapes (storage size in elements)
-    //   edges[*]:     consumer/producer_offset[]  ->  start_offset (uint64) + strides[] (int32)
-    //   tasks[*].args[*]: offset[]  ->  start_offset + strides[]
-    out << "{\"version\":3";
-
-    out << ",\"tasks\":[";
+    // Strided tensor representation. tensors[].buffer_numel is the underlying
+    // storage element count; tasks[].args[] and edges[] carry per-slice
+    // geometry as (start_offset: uint64, strides[]: uint32). Strides are
+    // unsigned: the runtime invariant forbids zero / negative strides (see runtime/tensor.h).
+    out << "{\"tasks\":[";
     for (size_t i = 0; i < tasks.size(); i++) {
         if (i > 0) out << ',';
         const auto &t = tasks[i];
@@ -448,7 +446,7 @@ dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, c
         LOG_ERROR("dep_gen replay: num_records=%zu but records pointer is null", num_records);
         return -1;
     }
-    LOG_INFO_V0("dep_gen replay: processing %zu in-memory records (v2, dual-pass)", num_records);
+    LOG_INFO_V0("dep_gen replay: processing %zu in-memory records (dual-pass)", num_records);
 
     // Per-ring task window sizes — tensormap masks slot indices and requires
     // each to be a power of two. Auto-size from the records themselves so each
@@ -495,7 +493,7 @@ dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, c
         return -3;
     }
 
-    // v2 accumulators.
+    // JSON output accumulators.
     std::vector<TaskTableEntry> task_table;
     std::vector<TensorTableEntry> tensor_table;
     std::unordered_map<uint64_t, size_t> tensor_index;  // tensor_id → table idx
@@ -763,12 +761,12 @@ dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, c
     tm_oracle.destroy();
     tm_annot.destroy();
 
-    if (!write_deps_json_v2(deps_json_path, task_table, tensor_table, annot_edges)) {
+    if (!write_deps_json(deps_json_path, task_table, tensor_table, annot_edges)) {
         return -5;
     }
     LOG_INFO_V0(
-        "dep_gen replay: wrote deps.json v2 to %s (tasks=%zu, tensors=%zu, edges=%zu)", deps_json_path,
-        task_table.size(), tensor_table.size(), annot_edges.size()
+        "dep_gen replay: wrote deps.json to %s (tasks=%zu, tensors=%zu, edges=%zu)", deps_json_path, task_table.size(),
+        tensor_table.size(), annot_edges.size()
     );
     return 0;
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.h b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.h
index ea39bf7ea..daef4dfdd 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.h
@@ -29,26 +29,33 @@
  * 1 KB GM store off the scheduler critical path). Replay sees every
  * submit and reconstructs the complete dependency graph.
  *
- * Output format (deps.json, v2):
+ * Output format (deps.json, strided tensor representation):
  *
- *   {"version":2,
- *    "tasks":   [{"task_id":<u64>, "scope":"auto|manual"}, ...],
+ *   {"tasks":   [{"task_id":<u64>, "scope":"auto|manual",
+ *                 "args":[{"idx":<i32>, "type":"<arg_type>",
+ *                          "tensor_id":<u64>, "dtype":"...", "shape":[...],
+ *                          "start_offset":<u64>, "strides":[...]}, ...]}, ...],
  *    "tensors": [{"tensor_id":<u64>, "buffer_addr":<u64>, "version":<i32>,
- *                 "dtype":"FLOAT32", "ndims":<u32>, "raw_shapes":[...]}, ...],
+ *                 "dtype":"FLOAT32", "buffer_numel":<u64>}, ...],
  *    "edges":   [{"pred":<u64>, "succ":<u64>, "arg":<i32>,
  *                 "source":"explicit|creator|tensormap",
  *                 "overlap":"covered|other" (tensormap only),
  *                 "tensor_id":<u64> (non-explicit),
- *                 "consumer_dtype":"...", "consumer_shape":[...], "consumer_offset":[...],
- *                 "producer_shape":[...] (tensormap), "producer_offset":[...] (tensormap)},
+ *                 "consumer_dtype":"...", "consumer_shape":[...],
+ *                 "consumer_start_offset":<u64>, "consumer_strides":[...],
+ *                 "producer_shape":[...] (tensormap),
+ *                 "producer_start_offset":<u64> (tensormap),
+ *                 "producer_strides":[...] (tensormap)},
  *                ...]}
  *
  *   - All task ids are ``PTO2TaskId::raw`` values (``(ring_id << 32) | local_id``).
  *   - ``tensor_id`` is a stable FNV-1a hash of ``(buffer_addr, version)``.
+ *   - ``buffer_numel`` is the underlying storage element count; tensor shapes
+ *     are carried per-arg / per-edge alongside ``start_offset`` + ``strides``.
  *   - Distinct producers / arg indices / sources keep their own edges; per-record
  *     deduplication of producer ids mirrors the runtime
  *     ``PTO2FaninBuilder::append_fanin_or_fail`` semantics so the set of
- *     ``(pred, succ)`` pairs in v2 is identical to what the runtime would have
+ *     ``(pred, succ)`` pairs is identical to what the runtime would have
  *     recorded.
  *
  * Self-checking: the replay runs two parallel tensormap instances per record —
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
index fd356ccc6..ffa6fd335 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -431,9 +431,10 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa
             cycles_to_us(l2_perf.sched_complete_perf_cycle), l2_perf.sched_complete_perf_cycle * 100.0 / c_parent
         );
 
-        // pop_hit / pop_miss per-emit deltas live in each v2 JSON dispatch
-        // record's extras; sum-of-deltas equals the run-cumulative tracked
-        // in this struct (final-drain emit covers the trailing-idle tail).
+        // pop_hit / pop_miss per-emit deltas live in each dispatch-phase
+        // record's extras in aicpu_scheduler_phases[]; sum-of-deltas equals
+        // the run-cumulative tracked in this struct (final-drain emit covers
+        // the trailing-idle tail).
         LOG_INFO_V9(
             "Thread %d:   dispatch       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_dispatch_cycle),
             l2_perf.sched_dispatch_cycle * 100.0 / sched_total
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index 439ff2e61..6304c99df 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -92,7 +92,8 @@ int SchedulerContext::pop_ready_tasks_batch(
     int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
 #endif
     // pop_hit / pop_miss are PTO2_PROFILING-gated (not the inner verbose tier)
-    // so the v2 JSON dispatch records carry queue-health stats on default builds.
+    // so dispatch-phase records in aicpu_scheduler_phases[] carry queue-health
+    // stats on default builds.
     if (count > 0) {
         l2_perf.pop_hit += count;
     } else {
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
index a20454615..53cd87b8d 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
@@ -348,9 +348,10 @@ struct alignas(64) SchedL2PerfCounters {
     uint64_t sched_loop_count{0};
     uint32_t phase_complete_count{0};
     uint32_t phase_dispatch_count{0};
-    // Run-cumulative pop counters; the v2 JSON dispatch-record emitter writes
-    // per-emit deltas computed as (current - pop_*_at_last_emit) and the
-    // end-of-run cold-path log reads the cumulatives directly.
+    // Run-cumulative pop counters; the dispatch-phase record emitter
+    // (aicpu_scheduler_phases[]) writes per-emit deltas computed as
+    // (current - pop_*_at_last_emit) and the end-of-run cold-path log reads
+    // the cumulatives directly.
     uint64_t pop_hit{0};
     uint64_t pop_miss{0};
     uint64_t pop_hit_at_last_emit{0};
diff --git a/src/a5/platform/src/host/l2_perf_collector.cpp b/src/a5/platform/src/host/l2_perf_collector.cpp
index 93bb9f83d..028d374a6 100644
--- a/src/a5/platform/src/host/l2_perf_collector.cpp
+++ b/src/a5/platform/src/host/l2_perf_collector.cpp
@@ -579,9 +579,9 @@ int L2PerfCollector::export_swimlane_json() {
         return -1;
     }
 
-    int version = static_cast<int>(l2_perf_level_);
+    int l2_perf_level = static_cast<int>(l2_perf_level_);
     outfile << "{\n";
-    outfile << "  \"version\": " << version << ",\n";
+    outfile << "  \"l2_perf_level\": " << l2_perf_level << ",\n";
     outfile << "  \"tasks\": [\n";
 
     for (size_t i = 0; i < tagged_records.size(); ++i) {
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
index 34e17cc86..79ae71b24 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
+++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
@@ -166,8 +166,9 @@ Thread X: Scheduler summary: total_time=XXXus, loops=XXX, tasks_scheduled=XXX
 ```
 
 Per-thread fanout / fanin edge counts and ready-queue pop hit / miss
-stats live in the v2 JSON `aicpu_scheduler_phases[]` and `deps.json`;
-consume them via `simpler_setup/tools/sched_overhead_analysis.py`.
+stats live in `aicpu_scheduler_phases[]` (in `l2_perf_records.json`
+captured at l2_perf_level >= 3) and `deps.json`; consume them via
+`simpler_setup/tools/sched_overhead_analysis.py`.
 
 ---
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
index aab3f22d4..ca01eaa0c 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -431,9 +431,10 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa
             cycles_to_us(l2_perf.sched_complete_perf_cycle), l2_perf.sched_complete_perf_cycle * 100.0 / c_parent
         );
 
-        // pop_hit / pop_miss per-emit deltas live in each v2 JSON dispatch
-        // record's extras; sum-of-deltas equals the run-cumulative tracked
-        // in this struct (final-drain emit covers the trailing-idle tail).
+        // pop_hit / pop_miss per-emit deltas live in each dispatch-phase
+        // record's extras in aicpu_scheduler_phases[]; sum-of-deltas equals
+        // the run-cumulative tracked in this struct (final-drain emit covers
+        // the trailing-idle tail).
         LOG_INFO_V9(
             "Thread %d:   dispatch       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_dispatch_cycle),
             l2_perf.sched_dispatch_cycle * 100.0 / sched_total
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index 6bd212f84..5af5c95ed 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -87,7 +87,8 @@ int SchedulerContext::pop_ready_tasks_batch(
     int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
 #endif
     // pop_hit / pop_miss are PTO2_PROFILING-gated (not the inner verbose tier)
-    // so the v2 JSON dispatch records carry queue-health stats on default builds.
+    // so dispatch-phase records in aicpu_scheduler_phases[] carry queue-health
+    // stats on default builds.
     if (count > 0) {
         l2_perf.pop_hit += count;
     } else {
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
index 84991ae8e..ca099982f 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
@@ -350,9 +350,10 @@ struct alignas(64) SchedL2PerfCounters {
     uint64_t sched_loop_count{0};
     uint32_t phase_complete_count{0};
     uint32_t phase_dispatch_count{0};
-    // Run-cumulative pop counters; the v2 JSON dispatch-record emitter writes
-    // per-emit deltas computed as (current - pop_*_at_last_emit) and the
-    // end-of-run cold-path log reads the cumulatives directly.
+    // Run-cumulative pop counters; the dispatch-phase record emitter
+    // (aicpu_scheduler_phases[]) writes per-emit deltas computed as
+    // (current - pop_*_at_last_emit) and the end-of-run cold-path log reads
+    // the cumulatives directly.
     uint64_t pop_hit{0};
     uint64_t pop_miss{0};
     uint64_t pop_hit_at_last_emit{0};
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen.py
index 35a9d3276..428e6efbd 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen.py
@@ -142,15 +142,15 @@ def _post_validate(self, case):
             return
         with deps_path.open() as f:
             deps = json.load(f)
-        # v3 schema: annotated edges with tasks[] / tensors[] sidecars carrying
-        # strided slice descriptors (start_offset + stride[]). Project annotated
-        # edges down to a (pred, succ) set for the existing structural checks;
-        # the annotation sanity check below verifies the tensor metadata path.
-        assert deps.get("version") == 3, f"deps.json version {deps.get('version')} != 3"
+        # Strided-Tensor schema: annotated edges with tasks[] / tensors[]
+        # sidecars carrying strided slice descriptors (start_offset +
+        # strides[]). Project annotated edges down to a (pred, succ) set for
+        # the existing structural checks; the annotation sanity check below
+        # verifies the tensor metadata path.
         raw_edges = deps.get("edges", [])
         deps_edges = set()
         for e in raw_edges:
-            assert isinstance(e, dict), f"v2 edge must be an object, got {type(e).__name__}: {e!r}"
+            assert isinstance(e, dict), f"deps.json edge must be an object, got {type(e).__name__}: {e!r}"
             pred, succ = e.get("pred"), e.get("succ")
             if pred is None or succ is None:
                 continue
@@ -174,13 +174,13 @@ def _post_validate(self, case):
         bad = {e for e in deps_edges if e[0] not in valid_ids or e[1] not in valid_ids}
         assert not bad, f"deps.json contains edges referencing unknown task ids: {bad}"
 
-        # ---- v2 annotated-edge sanity ----
-        # Replay always emits the v2 schema with the tensor-info sidecar; the
-        # differential check inside the replay would have failed the run before
-        # we got here if the annotated pass disagreed with compute_task_fanin.
-        # These assertions just confirm the schema actually carries the
-        # expected blocks (so e.g. a future "always write empty arrays" bug
-        # would surface here, not silently in a downstream viewer).
+        # ---- Annotated-edge sanity ----
+        # Replay always emits the tensor-info sidecar; the differential check
+        # inside the replay would have failed the run before we got here if
+        # the annotated pass disagreed with compute_task_fanin. These
+        # assertions just confirm the schema actually carries the expected
+        # blocks (so e.g. a future "always write empty arrays" bug would
+        # surface here, not silently in a downstream viewer).
         tasks = deps.get("tasks", [])
         tensors = deps.get("tensors", [])
         task_ids = {int(t["task_id"]) for t in tasks if "task_id" in t}
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen_chain.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen_chain.py
index 90910bab6..cd2c3bc96 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen_chain.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/dep_gen/test_dep_gen_chain.py
@@ -161,7 +161,6 @@ def _post_validate(self, case):
 
         with deps_path.open() as f:
             deps = json.load(f)
-        assert deps.get("version") == 3, f"deps.json version {deps.get('version')} != 3"
 
         raw_edges = deps.get("edges", [])
         # Project annotated edges → (pred, succ) — we only care about graph
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/_swimlane_validate.py b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/_swimlane_validate.py
index 43e4d7daa..c2b3a18e1 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/_swimlane_validate.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/dfx/l2_swimlane/_swimlane_validate.py
@@ -53,6 +53,7 @@ def validate_perf_artifact(case_label: str, *, expected_task_count: int | None =
 
     with perf.open() as f:
         data = json.load(f)
+    assert data.get("l2_perf_level") in (1, 2, 3, 4), f"unexpected l2_perf_level: {data.get('l2_perf_level')}"
     tasks = data.get("tasks")
     assert isinstance(tasks, list), "tasks field missing or not a list"
     assert len(tasks) > 0, f"perf records empty under {perf}"