hw-native-sys · ChaoWao · May 27, 2026 · May 27, 2026
diff --git a/simpler_setup/tools/README.md b/simpler_setup/tools/README.md
@@ -47,8 +47,20 @@ python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_perf_rec
 
 # Verbose mode (for debugging)
 python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_perf_records.json -v
+
+# Reuse a deps.json captured in an earlier dep_gen run (different output dir)
+python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_perf_records.json \
+    --deps-json outputs/<case>_<earlier_ts>/deps.json
 ```
 
+> Dependency arrows in the Perfetto trace come from `deps.json` (dep_gen
+> replay). The device hot path no longer records fanout, so the typical
+> workflow is **two runs**: a one-time `--enable-dep-gen` capture per
+> topology to produce `deps.json`, then any number of
+> `--enable-l2-swimlane` runs that consume it. If no `deps.json` is found
+> alongside the perf JSON (and `--deps-json` isn't passed), the trace
+> still renders but has no arrows; the converter prints a warning.
+
 ### Command-Line Options
 
 | Option | Short | Description |
@@ -57,6 +69,7 @@ python -m simpler_setup.tools.swimlane_converter outputs/<case>_<ts>/l2_perf_rec
 | `--output` | `-o` | Output JSON file (default: outputs/merged_swimlane_`<timestamp>`.json) |
 | `--kernel-config` | `-k` | Path to kernel_config.py, used for function name mapping |
 | `--func-names` | | Path to func_id_names_*.json (SceneTest format) for function name mapping |
+| `--deps-json` | | Path to a dep_gen `deps.json` (defaults to `deps.json` in the same directory as the input file). Without one, no dependency arrows are drawn. |
 | `--verbose` | `-v` | Enable verbose output |
 
 ### Outputs
@@ -154,7 +167,7 @@ Output is emitted in three parts:
 - **Part 2: AICPU scheduler loop breakdown** — per-scheduler-thread loop statistics, per-phase (scan / complete / dispatch / idle) time ratios, pop_hit / pop_miss totals, and (when deps.json is available) per-thread fanout / fanin aggregates
 - **Part 3: Tail OH distribution & cause analysis** — Tail OH quantile distribution (P10–P99), correlation between scheduler loop iteration time and Tail OH, and data-driven insights into the dominant phase
 
-The perf JSON must be a v2 capture with non-empty `aicpu_scheduler_phases` (rerun the case with `--enable-l2-swimlane` if the tool reports the field is missing).
+The perf JSON must contain a non-empty `aicpu_scheduler_phases` field (rerun the case with `--enable-l2-swimlane` if the tool reports the field is missing).
 
 ---
 
@@ -270,7 +283,6 @@ The analysis tools share the same input format - the `l2_perf_records_*.json` fi
 
 ```json
 {
-  "version": 1,
   "tasks": [
     {
       "task_id": 0,
@@ -279,14 +291,15 @@ The analysis tools share the same input format - the `l2_perf_records_*.json` fi
       "core_type": "aic",
       "start_time_us": 100.0,
       "end_time_us": 250.5,
-      "duration_us": 150.5,
-      "fanout": [1, 2],
-      "fanout_count": 2
+      "duration_us": 150.5
     }
   ]
 }
 ```
 
+Dependency edges come from `deps.json` (dep_gen replay) at post-process time —
+not from the perf JSON. See [`swimlane_converter --deps-json`](#swimlane_converter).
+
 ### Kernel Config Format
 
 To display meaningful function names in the output, provide a `kernel_config.py` file:

diff --git a/simpler_setup/tools/sched_overhead_analysis.py b/simpler_setup/tools/sched_overhead_analysis.py
@@ -10,7 +10,7 @@
 """Scheduler overhead analysis for PTO2.
 
 Inputs:
-  1. Per-task perf profiling data (l2_perf_records_*.json), v2 schema with
+  1. Per-task perf profiling data (l2_perf_records_*.json) with
      ``aicpu_scheduler_phases`` populated by ``--enable-l2-swimlane``.
   2. deps.json (optional, dep_gen replay output) colocated with the perf JSON,
      used to derive per-thread fanout / fanin DAG stats.
@@ -153,18 +153,16 @@ def auto_select_l2_perf_records_json():
 
 
 def parse_scheduler_from_json_phases(data):
-    """Extract scheduler Phase breakdown from l2_perf_records JSON (version >= 2).
+    """Extract scheduler Phase breakdown from l2_perf_records JSON.
 
     Computes per-thread loop counts, task counts, and phase totals
-    from aicpu_scheduler_phases records.
+    from aicpu_scheduler_phases records (present at l2_perf_level >= 3).
 
     Returns:
         dict: Thread data keyed by thread index, with per-phase us / pct,
               pop_hit / pop_miss, loops, completed, tasks_per_loop. Returns
               empty dict if phase data is not available.
     """
-    if data.get("version", 1) < 2:
-        return {}
     phases_by_thread = data.get("aicpu_scheduler_phases", [])
     if not phases_by_thread:
         return {}
@@ -487,7 +485,7 @@ def run_analysis(  # noqa: PLR0912, PLR0915
     else:
         pop_hit = pop_miss = 0
         pop_hit_rate = 0.0
-        print("  Pop: (no per-emit pop deltas in input — needs --enable-l2-swimlane on a v2 JSON capture)")
+        print("  Pop: (no per-emit pop deltas in input — needs --enable-l2-swimlane)")
 
     print()
     print("=" * 90)

diff --git a/simpler_setup/tools/swimlane_converter.py b/simpler_setup/tools/swimlane_converter.py
@@ -92,8 +92,7 @@ def read_perf_data(filepath):
         filepath: Path to input JSON file
 
     Returns:
-        dict: Parsed performance data with keys:
-            - version
+        dict: Parsed performance data; the only key validated here is:
             - tasks (list)
 
     Raises:
@@ -102,50 +101,44 @@ def read_perf_data(filepath):
     with open(filepath) as f:
         data = json.load(f)
 
-    # Validate required fields
-    required_fields = ["version", "tasks"]
-    for field in required_fields:
-        if field not in data:
-            raise ValueError(f"Missing required field: {field}")
-
-    # Validate version
-    if data["version"] not in [1, 2, 3, 4]:
-        raise ValueError(f"Unsupported version: {data['version']} (expected 1, 2, 3, or 4)")
+    if "tasks" not in data:
+        raise ValueError("Missing required field: tasks")
 
     return data
 
 
-def load_deps_json(perf_records_path):
-    """Load deps.json (dep_gen replay output) co-located with ``l2_perf_records.json``.
+def load_deps_json(deps_path):
+    """Load a dep_gen replay output (``deps.json``).
 
-    deps.json supersedes ``task["fanout"]``: fanout is sealed at the moment the
-    producer's L2PerfRecord retires, so consumers submitted after a fast producer
-    completes can never get attributed to it. dep_gen's replay reconstructs the
-    complete graph by replaying every captured ``submit_task`` through a host
-    PTO2TensorMap.
+    deps.json is the sole source of truth for the task graph in this tool:
+    the device hot path no longer records per-task fanout (see PR #863). The
+    typical workflow is a dep_gen run once per topology (``--enable-dep-gen``)
+    to produce ``deps.json``, then any number of ``--enable-l2-swimlane`` runs
+    that join their per-task timing against that captured graph.
 
     Returns:
-        dict[int, list[int]] mapping ``pred_raw → [succ_raw, ...]`` (i.e. the
-        same shape as ``task["fanout"]``), or ``None`` if no deps.json is present.
-        Tasks with no successors are absent from the dict (mirrors ``defaultdict``
-        semantics on lookup miss).
+        dict[int, list[int]] mapping ``pred_raw → [succ_raw, ...]``, or
+        ``None`` if the file is missing, unreadable, or not v2-shaped. Tasks
+        with no successors are absent from the dict (``defaultdict``-like
+        lookup-miss semantics).
     """
-    deps_path = Path(perf_records_path).parent / "deps.json"
+    deps_path = Path(deps_path)
     if not deps_path.exists():
         return None
     try:
         with deps_path.open() as f:
             data = json.load(f)
     except (OSError, ValueError) as e:
-        print(f"Warning: failed to read {deps_path}: {e}; falling back to fanout", file=sys.stderr)
+        print(f"Warning: failed to read {deps_path}: {e}", file=sys.stderr)
         return None
     edges = data.get("edges")
     if not isinstance(edges, list):
+        print(f"Warning: {deps_path} has no 'edges' array", file=sys.stderr)
         return None
     version = data.get("version")
     if version != 2:
         print(
-            f"Warning: deps.json version={version!r}; only v2 is supported. Falling back to fanout[].",
+            f"Warning: {deps_path} version={version!r}; only v2 is supported.",
             file=sys.stderr,
         )
         return None
@@ -396,7 +389,6 @@ def generate_chrome_trace_json(  # noqa: PLR0912, PLR0915
         tasks: List of task dicts with fields:
             - task_id, func_id, core_id, core_type
             - start_time_us, end_time_us, duration_us
-            - fanout, fanout_count
             - dispatch_time_us (optional, AICPU dispatch timestamp)
             - finish_time_us (optional, AICPU finish timestamp)
         output_path: Path to output JSON file
@@ -477,9 +469,6 @@ def generate_chrome_trace_json(  # noqa: PLR0912, PLR0915
         ts = task["start_time_us"]
         dur = task["duration_us"]
 
-        # Build fanout hint string (packed ids → rXtY / tY for readability)
-        fanout_str = "[" + ", ".join(format_task_display(x) for x in task["fanout"]) + "]"
-
         # Get function name if available
         func_id = task["func_id"]
         tdisp = format_task_display(task["task_id"])
@@ -489,6 +478,14 @@ def generate_chrome_trace_json(  # noqa: PLR0912, PLR0915
         else:
             task_name = f"func_{_func_id_to_letter(func_id)}({tdisp})"
 
+        # Build fanout hint string (packed ids → rXtY / tY for readability)
+        # from deps.json — the device hot path no longer carries fanout.
+        fanout_str = (
+            "["
+            + ", ".join(format_task_display(x) for x in (deps_edges.get(task["task_id"], []) if deps_edges else []))
+            + "]"
+        )
+
         events.append(
             {
                 "args": {
@@ -620,9 +617,9 @@ def generate_chrome_trace_json(  # noqa: PLR0912, PLR0915
             task_to_aicpu_event_id[(task["task_id"], task["core_id"])] = event_id
             event_id += 1
 
-    # Flow events (Flow events "s" and "f" for dependencies). When deps.json
-    # was produced by dep_gen replay, prefer its edges over task["fanout"] —
-    # fanout is the truncated, race-prone view (see load_deps_json's docstring).
+    # Flow events (phase "s" = start, "f" = finish) for dependencies. Edges come from
+    # deps.json (dep_gen replay); without one we emit no flow events at all,
+    # since the device hot path no longer carries fanout (PR #863).
     # Edges where the predecessor's end_time outlives the successor's start_time
     # are flagged as happens-before violations and emitted with a distinct flow
     # name so Perfetto colors them differently from clean dependency arrows.
@@ -631,11 +628,7 @@ def generate_chrome_trace_json(  # noqa: PLR0912, PLR0915
         task_map[t["task_id"]].append(t)
     flow_id = 0
     hb_violation_count = 0
-
-    def _succs_for(task):
-        if deps_edges is not None:
-            return deps_edges.get(task["task_id"], [])
-        return task["fanout"]
+    edges_by_pred = deps_edges or {}
 
     for task in tasks:
         src_tid = core_to_tid[task["core_id"]]
@@ -646,7 +639,7 @@ def _succs_for(task):
         # Use a small offset (0.01 us) for visual clarity
         flow_start_us = src_ts_end - 0.01
 
-        for succ_task_id in _succs_for(task):
+        for succ_task_id in edges_by_pred.get(task["task_id"], []):
             if succ_task_id not in task_map:
                 if verbose:
                     print(
@@ -699,8 +692,10 @@ def _succs_for(task):
                 flow_id += 1
 
     if verbose:
-        edge_source = "deps.json" if deps_edges is not None else "task.fanout"
-        print(f"  Flow events: {flow_id} edges (source: {edge_source})")
+        if deps_edges is not None:
+            print(f"  Flow events: {flow_id} edges (source: deps.json)")
+        else:
+            print("  Flow events: 0 (no deps.json — re-run dep_gen and pass --deps-json to add arrows)")
         if hb_violation_count > 0:
             print(f"  Happens-before violations: {hb_violation_count} edge(s) flagged as 'hb_violation'")
 
@@ -841,7 +836,7 @@ def _succs_for(task):
             src_tid = task_to_aicpu_tid.get((task["task_id"], task["core_id"]), core_to_tid[task["core_id"]])
             src_aicpu_eid = task_to_aicpu_event_id.get((task["task_id"], task["core_id"]))
 
-            for succ_task_id in _succs_for(task):
+            for succ_task_id in edges_by_pred.get(task["task_id"], []):
                 if succ_task_id not in task_map:
                     continue
 
@@ -1108,6 +1103,13 @@ def _build_parser():
         "--func-names",
         help="Path to func_id_names_*.json (SceneTest format) for func_id to function name mapping",
     )
+    parser.add_argument(
+        "--deps-json",
+        help=(
+            "Path to a dep_gen replay deps.json (defaults to sibling of the perf JSON). "
+            "Without one the trace has no dependency arrows — re-run with --enable-dep-gen first."
+        ),
+    )
     parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
     return parser
 
@@ -1145,11 +1147,10 @@ def _resolve_output_path(args, input_path):
 
 
 def _print_verbose_data_info(data, verbose):
-    """Print verbose summary of loaded performance data including v2 phase counts."""
+    """Print verbose summary of loaded performance data including phase counts."""
     if not verbose:
         return
     print("\n=== Performance Data ===")
-    print(f"  Version: {data['version']}")
     print(f"  Task Count: {len(data['tasks'])}")
     if data["tasks"]:
         start_times = [t["start_time_us"] for t in data["tasks"]]
@@ -1158,8 +1159,6 @@ def _print_verbose_data_info(data, verbose):
         max_time = max(end_times)
         print(f"  Time Range: {min_time:.3f} us - {max_time:.3f} us (span: {max_time - min_time:.3f} us)")
     print()
-    if data["version"] != 2:
-        return
     scheduler_phases = data.get("aicpu_scheduler_phases")
     orchestrator_phases = data.get("aicpu_orchestrator_phases")
     core_to_thread = data.get("core_to_thread")
@@ -1227,9 +1226,17 @@ def main():
 
         output_path = _resolve_output_path(args, input_path)
 
-        deps_edges = load_deps_json(input_path)
-        if args.verbose and deps_edges is not None:
-            print(f"  Using deps.json edges ({sum(len(v) for v in deps_edges.values())} total)")
+        deps_path = Path(args.deps_json) if args.deps_json else Path(input_path).parent / "deps.json"
+        deps_edges = load_deps_json(deps_path)
+        if deps_edges is not None:
+            if args.verbose:
+                print(f"  Using deps.json edges ({sum(len(v) for v in deps_edges.values())} total) from {deps_path}")
+        else:
+            print(
+                f"Warning: no usable deps.json at {deps_path}; Perfetto trace will have no dependency arrows. "
+                f"Run a dep_gen capture (--enable-dep-gen) and pass --deps-json <path> to add them.",
+                file=sys.stderr,
+            )
 
         generate_chrome_trace_json(
             data["tasks"],
@@ -1250,13 +1257,15 @@ def main():
 
         print_task_statistics(data["tasks"], func_names)
 
-        # The deep-dive reads only the perf JSON and (optionally) the colocated
-        # deps.json — sibling auto-discovery happens inside run_sched_overhead_analysis.
+        # The deep-dive reads the perf JSON plus deps.json (for per-thread
+        # fanout / fanin aggregates). Forward the resolved deps path so an
+        # explicit --deps-json overrides sibling auto-discovery there too.
         print("\n=== Scheduler Overhead Deep Dive ===")
         deep_dive_rc = run_sched_overhead_analysis(
             input_path,
             print_sources=True,
             perf_data=data,
+            deps_json_path=deps_path if deps_edges is not None else None,
         )
         if deep_dive_rc != 0:
             print(

diff --git a/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h b/src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h
@@ -73,8 +73,9 @@ void l2_perf_aicpu_init(int worker_count);
  * `dual_issue_slots[expected_reg_task_id % PLATFORM_L2_AICORE_RING_SIZE]`,
  * validates the task_id match, fills all AICPU-side fields, commits into
  * the current records buffer, and rotates the records buffer internally
- * once it fills up. Callers must pre-extract fanout into a plain uint64_t
- * array (platform layer cannot depend on runtime linked-list types).
+ * once it fills up. Fanout edges live in the static DAG (deps.json from
+ * dep_gen) and are joined by the host's swimlane converter post-run, so
+ * this commit path does not touch fanout.
  *
  * Per-core counter accounting:
  *   total_record_count++       — every commit attempt (success or failure)
@@ -95,12 +96,10 @@ void l2_perf_aicpu_init(int worker_count);
  * @param core_type             Core type (AIC/AIV)
  * @param dispatch_time         AICPU timestamp when task was dispatched
  * @param finish_time           AICPU timestamp when task completion was observed
- * @param fanout                Pre-extracted successor task ID array (nullptr if none)
- * @param fanout_count          Number of entries in fanout array (0 if none)
  */
 int l2_perf_aicpu_complete_record(
     int core_id, int thread_idx, uint32_t expected_reg_task_id, uint64_t task_id, uint32_t func_id, CoreType core_type,
-    uint64_t dispatch_time, uint64_t finish_time, const uint64_t *fanout, int32_t fanout_count
+    uint64_t dispatch_time, uint64_t finish_time
 );
 
 /**

diff --git a/src/a2a3/platform/include/common/dep_gen.h b/src/a2a3/platform/include/common/dep_gen.h
@@ -15,9 +15,9 @@
  *
  * Captures the inputs to every Orchestrator::submit_task call into a streaming
  * ring of DepGenRecord. The host side replays these records offline to
- * reconstruct the full task dependency graph (deps.json), bypassing the race
- * window in L2PerfRecord::fanout[] (where an early-finishing producer would
- * have its record sealed before later-submitted consumers can register).
+ * reconstruct the full task dependency graph (deps.json). deps.json is the
+ * sole source of truth for fanout edges; the L2 swimlane hot path no longer
+ * carries fanout to keep AICPU off the per-task GM-store critical path.
  *
  * Streaming buffer design mirrors PMU / L2Perf / TensorDump (single source of
  * algorithmic truth in src/a2a3/platform/include/host/profiling_common/profiler_base.h):