Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions docs/dfx/l2-swimlane-profiling.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,12 @@ available.
the L2 record so dependency arrows show up in the Perfetto
view.
- **AICPU scheduler phases** — per-iteration breakdown into
`complete` / `dispatch` / `scan` / `idle`.
`complete` / `dispatch`. Idle iterations no longer emit a record
on a2a3; the host tooling reconstructs idle spans from the gap
between consecutive work records on the same thread. Legacy
captures (and a5) may still carry `scan` / `idle` records — both
are silently dropped by the parser (idle is double-painted
by the gap reconstruction; `scan` was never emitted in a2a3).
- **Orchestrator phase summary** — cumulative cycle counts for
the orchestrator's nine sub-steps (sync / alloc / params /
lookup / heap / insert / fanin / finalize / scope_end).
Expand Down Expand Up @@ -132,7 +137,7 @@ Phase records (per scheduler thread, level >= 3 for
| Field | Meaning |
| ----- | ------- |
| `start_time_us` / `end_time_us` | Phase start / end timestamps in microseconds |
| `phase` | Lowercase phase name. Scheduler: `complete` / `dispatch` / `scan` / `idle`. Orchestrator: `orch_*` (sync / alloc / params / lookup / heap / insert / fanin / finalize / scope_end). |
| `phase` | Lowercase phase name. Scheduler: `complete` / `dispatch` (`scan` / `idle` may appear in legacy captures and a5; both are dropped by the parser). Orchestrator: `orch_*` (sync / alloc / params / lookup / heap / insert / fanin / finalize / scope_end). |
| `loop_iter` (scheduler) / `submit_idx` (orchestrator) | Iteration / submit-call counter for the producing thread |
| `tasks_processed` (scheduler) / `task_id` (orchestrator) | Phase-specific union field |
| `pop_hit` / `pop_miss` (dispatch only) | Ready-queue pop deltas since the previous dispatch emit |
Expand Down
75 changes: 47 additions & 28 deletions simpler_setup/tools/sched_overhead_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,42 +169,57 @@ def parse_scheduler_from_json_phases(data):
if not phases_by_thread:
return {}

# Map JSON phase names to internal names
phase_map = {
"complete": "complete",
"dispatch": "dispatch",
"scan": "scan",
"idle": "idle",
}

threads = {}
for tid, records in enumerate(phases_by_thread):
if not records:
continue

phase_us = {p: 0.0 for p in ["complete", "scan", "dispatch", "idle"]}
# Only "complete" and "dispatch" emit records on a2a3 post-#869.
# Legacy a2a3 captures (or current a5 captures, which still emit
# SCHED_IDLE_WAIT) may carry "idle" / "scan" records; both are
# dropped because idle is reconstructed from gaps between work
# records on the same thread — for the a5/legacy case the gap
# exactly equals the dropped idle records' total span, so the
# numeric idle_us is preserved (only the per-iter granularity is
# lost, which Part 2 doesn't surface).
work_recs = sorted(
(r for r in records if r.get("phase") in ("complete", "dispatch")),
key=lambda r: r.get("start_time_us", 0),
)
if not work_recs:
continue

phase_us = {"complete": 0.0, "dispatch": 0.0, "idle": 0.0}
total_tasks = 0
max_loop_iter = 0
pop_hit = 0
pop_miss = 0

for rec in records:
phase = phase_map.get(rec.get("phase", ""))
if phase is None:
continue
dur = rec.get("end_time_us", 0) - rec.get("start_time_us", 0)
prev_end = None

for rec in work_recs:
phase = rec["phase"]
start = rec.get("start_time_us", 0)
end = rec.get("end_time_us", 0)
# Idle = wall-clock gap between this record and the previous
# work record on this thread. Pre-first-work and post-last-work
# spans are not captured here (see device's cumulative
# sched_idle_cycle in the cold-path summary log for those).
if prev_end is not None and start > prev_end:
phase_us["idle"] += start - prev_end
dur = end - start
if dur > 0:
phase_us[phase] += dur
if rec.get("tasks_processed", 0) > 0 and phase == "complete":
total_tasks += rec["tasks_processed"]
loop_iter = rec.get("loop_iter", 0)
max_loop_iter = max(max_loop_iter, loop_iter)
if phase == "complete":
total_tasks += rec.get("tasks_processed", 0)
# Per-emit queue-health deltas; only present on dispatch records.
# Summing across records gives the run-cumulative pop_hit /
# pop_miss (the runtime's final-drain emit closes the tail).
if phase == "dispatch":
pop_hit += rec.get("pop_hit", 0)
pop_miss += rec.get("pop_miss", 0)
max_loop_iter = max(max_loop_iter, rec.get("loop_iter", 0))
if prev_end is None or end > prev_end:
prev_end = end

total_us = sum(phase_us.values())
loops = max_loop_iter
Expand Down Expand Up @@ -431,13 +446,14 @@ def run_analysis( # noqa: PLR0912, PLR0915
print(fmt2.format("SUM", total_loops, total_completed, f"{avg_tpl:.1f}", f"{total_us:.1f}"))
print()

# Phase breakdown
phases = ["complete", "scan", "dispatch", "idle"]
# Phase breakdown. Idle is reconstructed from gaps between work
# records on the same thread (no explicit idle record is emitted by
# the device anymore).
phases = ["complete", "dispatch", "idle"]
phase_labels = {
"complete": "Complete (poll handshake, resolve deps)",
"scan": "Scan (update perf header)",
"dispatch": "Dispatch (pop queue, build payload, flush)",
"idle": "Idle (spinning, no progress)",
"idle": "Idle (spinning, no progress — reconstructed from gaps)",
}

fmt3 = " {:<50} {:>11} {:>10} {:>14}"
Expand Down Expand Up @@ -514,14 +530,19 @@ def run_analysis( # noqa: PLR0912, PLR0915
avg_loop_us = total_us / total_loops if total_loops > 0 else 0
avg_tail_oh = sum(tails) / n
loop_ratio = avg_tail_oh / avg_loop_us if avg_loop_us > 0 else 0
print(f" Avg scheduler loop iteration: {avg_loop_us:.1f} us (approx avg polling interval per loop)")
# %.3f instead of %.1f: idle is now reconstructed from gaps between work
# records on the same thread, so threads that did no work (only spinning)
# contribute their loop count without any measured us — the SUM-based
# average lands in the sub-us range on small captures and would round to
# 0.0 under .1f.
print(f" Avg scheduler loop iteration: {avg_loop_us:.3f} us (approx avg polling interval per loop)")
print()
print(f" Avg Tail OH = {avg_tail_oh:.1f} us ~= {loop_ratio:.1f} x avg loop iteration ({avg_loop_us:.1f} us)")
print(f" Avg Tail OH = {avg_tail_oh:.1f} us ~= {loop_ratio:.1f} x avg loop iteration ({avg_loop_us:.3f} us)")
print(f" -> On average, a completed task waits ~{loop_ratio:.1f} loop iterations before being detected")
print()

# Data-driven insight: find the dominant phase (excluding idle which is not useful work)
work_phases = {p: phase_totals.get(p, 0) for p in ["scan", "complete", "dispatch"]}
work_phases = {p: phase_totals.get(p, 0) for p in ["complete", "dispatch"]}
dominant_phase = max(work_phases, key=lambda p: work_phases[p])
dominant_pct = work_phases[dominant_phase] / total_us * 100 if total_us > 0 else 0
key_phase_label = phase_labels[dominant_phase].split(" (")[0]
Expand All @@ -540,8 +561,6 @@ def run_analysis( # noqa: PLR0912, PLR0915
print(" Fanout traversal and atomic ops dominate the complete phase.")
else:
print(" DAG stats unavailable (no deps.json); cannot attribute complete-phase cost further.")
elif dominant_phase == "scan":
print(" Scan phase overhead indicates frequent perf header updates.")
print("=" * 90)

return 0
Expand Down
47 changes: 28 additions & 19 deletions simpler_setup/tools/swimlane_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,12 +708,15 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915
{"args": {"sort_index": 2}, "cat": "__metadata", "name": "process_sort_index", "ph": "M", "pid": 3}
)

# Phase color mapping
# Phase color mapping. The Perfetto sched lane only renders the
# two work phases (complete, dispatch). Idle is the wall-clock gap
# between consecutive work bars — Perfetto's empty-track regions
# already convey that visually, so we don't paint a synthetic bar
# for it. (Idle is still tallied numerically by
# sched_overhead_analysis.py Part 2 via gap reconstruction.)
phase_colors = {
"complete": "good", # green
"dispatch": "terrible", # red
"scan": "thread_state_running", # blue
"idle": "yellow", # yellow
}

for thread_idx, thread_records in enumerate(scheduler_phases):
Expand All @@ -731,29 +734,35 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915
}
)

# Render only the work phases (complete / dispatch). Legacy
# captures may carry "idle" / "scan" records from older builds;
# drop them so the lane shows only actual work.
for record in thread_records:
phase = record.get("phase", "unknown")
if phase not in ("complete", "dispatch"):
continue
start_us = record["start_time_us"]
end_us = record["end_time_us"]
dur = end_us - start_us
tasks_processed = record.get("tasks_processed", 0)

event = {
"args": {
"phase": phase,
"loop_iter": record.get("loop_iter", 0),
"tasks_processed": tasks_processed,
},
"cat": "scheduler",
"cname": phase_colors.get(phase, "generic_work"),
"name": f"{phase}({tasks_processed})",
"ph": "X",
"pid": 3,
"tid": tid,
"ts": start_us,
"dur": dur,
}
events.append(event)
events.append(
{
"args": {
"phase": phase,
"loop_iter": record.get("loop_iter", 0),
"tasks_processed": tasks_processed,
},
"cat": "scheduler",
"cname": phase_colors.get(phase, "generic_work"),
"name": f"{phase}({tasks_processed})",
"ph": "X",
"pid": 3,
"tid": tid,
"ts": start_us,
"dur": dur,
}
)
Comment thread
ChaoWao marked this conversation as resolved.

# AICPU Orchestrator lane (l2_perf_level >= 4)
#
Expand Down
22 changes: 14 additions & 8 deletions src/a2a3/platform/include/common/l2_perf_profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -306,16 +306,21 @@ struct L2PerfDataHeader {
/**
* AICPU phase identifier
*
* Scheduler phases (0-3): four phases in each scheduler loop iteration.
* Scheduler phases (0-1): the two work-emitting phases per scheduler loop
* iteration. Idle iterations no longer emit a record — host tooling recovers
* idle spans from the gap between consecutive sched records on the same
* thread (see swimlane_converter.py / sched_overhead_analysis.py).
*
* Orchestrator phases (16-24): sub-steps within each submit_task() call.
*
* IDs 2-3 were SCHED_SCAN (unused in this runtime) and SCHED_IDLE_WAIT in
* older builds. Legacy capture JSON may still carry phase_id=3 records; the
* host parser skips them since idle is now reconstructed from record gaps.
*/
enum class AicpuPhaseId : uint32_t {
// Scheduler phases (0-3)
SCHED_COMPLETE = 0, // Process completed tasks (fanout traversal)
SCHED_DISPATCH = 1, // Dispatch ready tasks to idle cores
SCHED_SCAN = 2, // Incremental scan for root tasks
SCHED_IDLE_WAIT = 3, // Idle/spinning (no progress)
SCHED_PHASE_COUNT = 4, // Sentinel: number of scheduler phases
// Scheduler phases
SCHED_COMPLETE = 0, // Process completed tasks (fanin traversal)
SCHED_DISPATCH = 1, // Dispatch ready tasks to idle cores
// Orchestrator phases (16-24)
ORCH_SYNC = 16, // tensormap sync
ORCH_ALLOC = 17, // task_ring_alloc
Expand All @@ -337,7 +342,8 @@ enum class AicpuPhaseId : uint32_t {
* extra1 / extra2 carry phase-specific stats; meaning is keyed by phase_id:
* SCHED_DISPATCH: extra1 = pop_hit delta since last emit
* extra2 = pop_miss delta since last emit
* All other phases: extras are 0 (reserved for future per-phase metrics).
* SCHED_COMPLETE: extras are 0.
* Orchestrator phases: extras are 0 (reserved for future per-phase metrics).
*/
struct AicpuPhaseRecord {
uint64_t start_time; // Phase start timestamp
Expand Down
19 changes: 12 additions & 7 deletions src/a2a3/platform/src/host/l2_perf_collector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,16 @@

/**
* Check if a phase ID belongs to a scheduler phase (vs orchestrator phase).
* Scheduler phases: SCHED_COMPLETE(0), SCHED_DISPATCH(1), SCHED_SCAN(2), SCHED_IDLE_WAIT(3)
* Orchestrator phases: ORCH_SYNC(16) through ORCH_SCOPE_END(24)
* Scheduler phases: SCHED_COMPLETE(0), SCHED_DISPATCH(1).
* Legacy IDs 2 (SCHED_SCAN, never emitted in this runtime) and 3
* (SCHED_IDLE_WAIT, no longer emitted) may appear in old captures and are
* still classified as scheduler-side so the host parser routes them through
* the scheduler branch (where they are then dropped, since idle is
* reconstructed from record gaps).
* Orchestrator phases: ORCH_SYNC(16) through ORCH_SCOPE_END(24).
*/
static bool is_scheduler_phase(AicpuPhaseId id) {
return static_cast<uint32_t>(id) < static_cast<uint32_t>(AicpuPhaseId::SCHED_PHASE_COUNT);
return static_cast<uint32_t>(id) < static_cast<uint32_t>(AicpuPhaseId::ORCH_SYNC);
}

L2PerfCollector::~L2PerfCollector() {
Expand Down Expand Up @@ -642,11 +647,11 @@ int L2PerfCollector::export_swimlane_json() {
return "complete";
case AicpuPhaseId::SCHED_DISPATCH:
return "dispatch";
case AicpuPhaseId::SCHED_SCAN:
return "scan";
case AicpuPhaseId::SCHED_IDLE_WAIT:
return "idle";
default:
// Legacy SCHED_IDLE_WAIT (3) and SCHED_SCAN (2) land here on
// old captures; host tools skip "unknown" sched records and
// rebuild idle from gaps between known records on the
// same thread.
return "unknown";
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -363,8 +363,8 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa
cycles_to_us(sched_end_ts - l2_perf.sched_start_ts)
);

uint64_t sched_total = l2_perf.sched_wiring_cycle + l2_perf.sched_complete_cycle + l2_perf.sched_scan_cycle +
l2_perf.sched_dispatch_cycle + l2_perf.sched_idle_cycle;
uint64_t sched_total = l2_perf.sched_wiring_cycle + l2_perf.sched_complete_cycle + l2_perf.sched_dispatch_cycle +
l2_perf.sched_idle_cycle;
if (sched_total == 0) sched_total = 1;

#if PTO2_SCHED_PROFILING
Expand Down Expand Up @@ -466,11 +466,6 @@ void SchedulerContext::log_l2_perf_summary(int32_t thread_idx, int32_t cur_threa
cycles_to_us(l2_perf.sched_dispatch_setup_cycle), l2_perf.sched_dispatch_setup_cycle * 100.0 / d_parent
);

LOG_INFO_V9(
"Thread %d: scan : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_perf.sched_scan_cycle),
l2_perf.sched_scan_cycle * 100.0 / sched_total
);

#if PTO2_SCHED_PROFILING
LOG_INFO_V9(
"Thread %d: wiring : %.3fus (%.1f%%) tasks=%d", thread_idx,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -763,10 +763,13 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
}
#if PTO2_PROFILING
CYCLE_COUNT_LAP(l2_perf.sched_idle_cycle);
// Idle iterations no longer emit a phase record. Host tooling
// recovers idle spans from the gap between consecutive sched
// phase records on the same thread. _t0_phase still advances
// so the next emitted COMPLETE/DISPATCH gets the correct
// start_time (the iter it actually ran in), not the start of
// the preceding idle stretch.
if (l2_perf_level_ >= L2PerfLevel::SCHED_PHASES) {
l2_perf_aicpu_record_phase(
thread_idx, AicpuPhaseId::SCHED_IDLE_WAIT, _t0_phase, _t1, l2_perf.sched_loop_count, 0
);
_t0_phase = _t1;
}
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,6 @@ struct SlotTransition {
struct alignas(64) SchedL2PerfCounters {
bool l2_perf_enabled{false};
uint64_t sched_start_ts{0};
uint64_t sched_scan_cycle{0};
uint64_t sched_complete_cycle{0};
uint64_t sched_dispatch_cycle{0};
uint64_t sched_wiring_cycle{0};
Expand Down
Loading