diff --git a/.github/workflows/benchmarks-report.yml b/.github/workflows/benchmarks-report.yml
index 911e9f663..185cd66a2 100644
--- a/.github/workflows/benchmarks-report.yml
+++ b/.github/workflows/benchmarks-report.yml
@@ -51,15 +51,6 @@ jobs:
git fetch origin main --depth=1
git checkout origin/main -- tools/ci/bench/reporter/ 2>/dev/null || true
- # Overlay main's bench-history.json onto the PR checkout so the
- # reporter's peak-attribution runs against the freshest history,
- # not whatever version was on the PR's branch point. `|| true` keeps
- # the step non-fatal if main has no file yet (first run after D3a).
- - name: Fetch latest bench-history.json from main
- run: |
- git fetch origin main --depth=1
- git show origin/main:tools/ci/bench/reporter/bench-history.json > tools/ci/bench/reporter/bench-history.json 2>/dev/null || true
-
- name: Download bench artifacts
uses: dawidd6/action-download-artifact@v21
with:
@@ -106,6 +97,9 @@ jobs:
ENDED: ${{ github.event.workflow_run.updated_at }}
run: |
WALL_CLOCK=$(( $(date -d "$ENDED" +%s) - $(date -d "$STARTED" +%s) ))
+ # --scope pr: peak attribution uses PR-iteration history only.
+ # main-history is still loaded for drift quantification but excluded
+ # from the comparison set.
node tools/ci/bench/reporter/reporter.js \
--results results \
--sha '${{ github.event.workflow_run.head_sha }}' \
@@ -115,6 +109,7 @@ jobs:
--base-ref 'main' \
--repo '${{ github.repository }}' \
--pr-history pr-history.json \
+ --scope pr \
--wall-clock "$WALL_CLOCK" \
--out bench-report
@@ -191,18 +186,21 @@ jobs:
- name: Append history entry
run: |
- # The benched commit is the workflow_run's head_sha (the merge
- # commit on main). Parent comes from the git history we just
- # fetched. Timestamp is the bench run's completion time so the
- # history is ordered by when the measurement was taken, not
- # when the commit landed.
+ # Benched commit is the workflow_run's head_sha; parent comes from
+ # the depth-2 fetch above. Timestamp is the bench run's completion
+ # so history is ordered by measurement, not commit-land time.
+ # baseline-sha.txt is the sidecar uploaded next to each matrix
+ # cell's tachometer JSON. Any cell's value works — all cells in
+ # one workflow_run benched against the same baseline.
BENCHED_SHA='${{ github.event.workflow_run.head_sha }}'
PARENT_SHA=$(git rev-parse "$BENCHED_SHA^" 2>/dev/null || echo '')
+ BASELINE_SHA=$(find results -name baseline-sha.txt -type f -exec cat {} \; -quit)
node tools/ci/bench/reporter/append-history.js \
--results results \
--sha "$BENCHED_SHA" \
--msg '${{ github.event.workflow_run.display_title }}' \
--parent-sha "$PARENT_SHA" \
+ --baseline-sha "$BASELINE_SHA" \
--timestamp '${{ github.event.workflow_run.updated_at }}' \
--history tools/ci/bench/reporter/bench-history.json
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index f63b3b5a0..7db4662d8 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -127,19 +127,28 @@ jobs:
# PR: baseline = base branch tip.
# Push to main: baseline = this commit's parent (so the delta captures
# the merged commit's effect; bench-history indexes the current
- # commit's absolute CI).
+ # commit's absolute CI alongside the within-session percent-delta).
+ #
+ # Resolve baseline SHA inline so it can be written to the artifact as
+ # a sidecar (baseline-sha.txt). The reporter pins each metric's
+ # percent_delta_ci to that SHA — required for cross-iteration drift
+ # detection (see tools/ci/bench/reporter/reporter.js:computeBaselineDrift).
- name: Build baseline
run: |
if [ '${{ github.event_name }}' = 'push' ]; then
# Fetch enough history to reach the parent commit locally.
git fetch origin main --depth=2
+ BASELINE_SHA=$(git rev-parse HEAD~1)
git checkout HEAD~1 -- packages/*/src/
else
git fetch origin ${{ github.event.pull_request.base.ref }} --depth=1
+ BASELINE_SHA=$(git rev-parse FETCH_HEAD)
git checkout FETCH_HEAD -- packages/*/src/
fi
node packages/${{ matrix.entry.package }}/bench/tachometer/build-ci.js baseline
git checkout HEAD -- packages/*/src/
+ mkdir -p results
+ echo "$BASELINE_SHA" > results/baseline-sha.txt
# Run just this matrix cell's single config.
# Per-cell auto-sample tail is governed by the config's own `timeout`
@@ -156,4 +165,9 @@ jobs:
uses: actions/upload-artifact@v7
with:
name: results-${{ matrix.entry.name }}
- path: results/*.json
+ # Include baseline-sha.txt sidecar — the reporter and history
+ # archiver read it to pin percent_delta_ci entries to their
+ # baseline SHA.
+ path: |
+ results/*.json
+ results/baseline-sha.txt
diff --git a/ai/plans/ROADMAP.md b/ai/plans/ROADMAP.md
index 4eb281560..99c01588b 100644
--- a/ai/plans/ROADMAP.md
+++ b/ai/plans/ROADMAP.md
@@ -83,6 +83,7 @@ Plans with an open PR or live pair work. Updated as ceremony when a PR opens; en
- [Release 0.18.0](active/release-0-18-0.md) — [PR #122](https://github.com/Semantic-Org/Semantic-Next/pull/122) `docs/shippable` (menu trimming + audit pass pending). Ships the next tagged release; last was 0.17.0 in November.
- [Signal Performance](active/signal-performance.md) — [PR #150](https://github.com/Semantic-Org/Semantic-Next/pull/150) freeze-by-default. Perf story unresolved (see plan's Bench Results); release inclusion is the open call.
+- [Bench Peak Attribution](active/bench-peak-attribution.md) — [PR #178](https://github.com/Semantic-Org/Semantic-Next/pull/178) methodology fix for cross-session absolute-ms comparisons. Eliminates phantom "Regressions from peak" on PRs.
---
@@ -200,7 +201,7 @@ Slot in wherever there's a gap; not phase-gated.
| P12 | [Template Spread Syntax](template-spread-syntax.md) | 4-8h | pair | scoped | `{>card ...friend}` — object spread in data passing. Ship when component templates demonstrate need. |
| P13 | [Template Content Projection](template-wrapper-snippets.md) | 12-16h (1.5-2d) | pair | scoped | `{>content}` — content projection for snippets + subtemplates. Ship when component templates demonstrate need. |
| P14 | [Template Let Bindings](template-let-bindings.md) | 10-14h (1-2d) | pair | scoped | `{#let}...{/let}` — snippet-for-vars. Ship when component templates demonstrate need. |
-| P15 | [Bench Reporter Overhaul](bench-reporter-overhaul.md) | 16-24h (2-3d) | pair | initial | Two coordinated tracks. **A — peak attribution correctness**: schema_v2 stores within-session percent-delta + tip-of-tree SHA; reporter peak compares same-session deltas; `--scope pr` drops main-history from PR comments. Fixes PR #174's 23 phantom regressions. **B — suite rationalization remainder** (from `icebox/tachometer-overhaul.md`): story-driven config reorg, triplet collapses, `wake-count-single-key` + `nested-mutation` micros, `timeout` final pass. Four PRs under `workflow_run` constraint. Supersedes the icebox plan. |
+| P15 | [Bench Peak Attribution](active/bench-peak-attribution.md) | 9-11h (1.5d) | pair | scoped | Fix the live peak-attribution bug. PR #174 (test-only, no perf changes) currently surfaces 25 phantom "Regressions from peak"; active perf PRs carry partial false-flagging too. Schema_v2 persists `percent_delta_ci` + `baseline_sha` per metric; reporter switches peak compare to same-session percent-delta; `--scope pr` drops main-history overlay on PR comments; drift flag with chain-of-percent-deltas when baselines differ. `bench-history.json` wiped to empty v2 (v1 entries fed the bug). Two PRs: methodology fix + suite cleanup (`toggle-{first,last}-10` + conditional `timeout` 3→2). |
---
@@ -214,6 +215,6 @@ Plans drafted but not on the active roadmap. See `ai/plans/icebox/` for files.
- [Signals TC39 Integration](icebox/signals-tc39-integration.md) — adopt native `Signal.State`/`Signal.Computed` as backing primitives when TC39 ships. Blocked on TC39 Stage 3+.
- [Add Icon Stroke Width](icebox/add-icon-stroke-width.md) — power-user feature, post-1.0.
- [Audit Fix Continuation](icebox/audit-fix-continuation.md) — process work for follow-up audits.
-- [Tachometer Overhaul — PR B remainder](icebox/tachometer-overhaul.md) — suite rationalization + knob tuning + new benches. PR A (CI parallelization) and PR C (in-house Node reporter) shipped; PR B is the only outstanding piece.
+- [Bench Suite Expansion](icebox/bench-suite-expansion.md) — file-scoped hot-path micros (`micro-expression-evaluator`, `micro-signal`, etc.) + new end-to-end benches (`wake-count-single-key`, `nested-mutation`, `hydrate-1000-card`). Surgical adds; lands when underlying perf work needs them.
- [Contributing Surface](icebox/contributing-surface.md) — pre-1.0 stance + 1.0 graduation pass + post-1.0 triage flow (size + scope, GH-shaped vs md-shaped). Most icebox graduates at 1.0; the rest stays internal.
- [Registry](icebox/registry.md) — community registry for components and behaviors, runtime + compile-time consumption from one source, author-namespaced publishing under `@sui-hub` with editorial canonical aliases above. Post-Phase 4.
diff --git a/ai/plans/active/bench-peak-attribution.md b/ai/plans/active/bench-peak-attribution.md
new file mode 100644
index 000000000..a622c4221
--- /dev/null
+++ b/ai/plans/active/bench-peak-attribution.md
@@ -0,0 +1,187 @@
+# Bench Peak Attribution
+
+## Goal
+
+Eliminate phantom "Regressions from peak" produced by cross-session absolute-ms comparisons in the bench reporter. PR #174 (test-only, no perf-affecting code) currently surfaces 25 false REOPENED verdicts; the active perf PR #150 carries several false REOPENED among its 17 (`+1%`/`+2%` rows where main has drifted between iterations). Today the reporter cannot distinguish "iteration N regressed metric X" from "main moved between iteration N's bench and now."
+
+The fix: persist the same-session percent-delta tachometer already emits in `differences[].percentChange`, switch peak attribution to compare those instead of cross-session absolute ms, and drop the main-history overlay on PR comments. Within-session round-robin divides out environmental variance at each end of the comparison; absolute-ms compare across sessions does not.
+
+## Status
+
+`scoped` — design decisions made; implementation surface concrete (5 source files + tests across 2 PRs).
+
+## Background
+
+Cross-run peak attribution shipped in #146 (D3b), reading `bench-history.json` populated by #145 (D3a). `computeHistoryStatus` (`reporter.js:733`) picks peak per metric as the entry with lowest absolute `ci[1]` upper bound, then classifies current vs peak as WIN / TIED-PEAK / REOPENED.
+
+Tachometer's tight CIs are valid only *within the same session*: round-robin between current and baseline divides out OS scheduling, GC, and JIT jitter that vary across runs. Across sessions — especially when main has moved between two bench runs that each round-robin'd against their own tip-of-tree — absolute-ms comparisons mix real signal with main-side drift.
+
+The percent-delta tachometer emits (`differences[base.index].percentChange`, already extracted at run time by `loadAllMetrics` at `reporter.js:113-150`) is the within-session-tight number. It is comparable across iterations *if the baseline (tip-of-tree) is comparable*. By persisting the baseline SHA, the reporter can both compare correctly and flag confounded comparisons when the baseline SHAs differ enough to matter.
+
+History today has 8 v1 entries spanning 2026-04-18 to 2026-05-02 (`packages/**`-touching merges only; test/harness/docs merges skip the bench). Those entries were the data feed for the buggy peak attribution being replaced — preserving them adds no analytic value and would force dual-version reading complexity. PR 1 wipes the file to empty v2 in the same diff that turns on v2 writes; v1 input is rejected explicitly thereafter.
+
+## In Scope
+
+| # | Change | Track |
+|---|---|---|
+| A1 | Schema_v2 — persist `percent_delta_ci` + `baseline_sha` per metric. Reset `bench-history.json` to empty v2; drop v1 reading entirely. | Methodology |
+| A2 | `append-history` + `fetch-pr-history` extract both fields; workflow plumbs `--baseline-sha`. | Methodology |
+| A3 | Reporter peak attribution operates on `percent_delta_ci`. | Methodology |
+| A4 | `--scope pr` flag; comment job drops main-history overlay. | Methodology |
+| A5 | Drift flag rendering when peak vs current `baseline_sha` differ; quantify magnitude via chain-of-percent-deltas across main entries. | Methodology |
+| B1 | Drop `toggle-first-10` / `toggle-last-10`; keep `toggle-middle-10` (not position-aware). | Cleanup |
+| B2 | `timeout` 3→2 (conditional on validation against last ~10 main runs). | Cleanup |
+
+## Out of Scope (Deferred)
+
+| Item | Reason |
+|---|---|
+| Story-driven config rename (`tachometer-ci-rendering-throughput`, etc.) | Cosmetic. `discover.js` globs `tachometer-ci*.json` so renames are zero-code-change, but the reporter already groups metrics by source file path. Defer until the rename has a concrete consumer. |
+| New end-to-end micros: `wake-count-single-key`, `nested-mutation`, `hydrate-1000-card` | Sketched in [`../icebox/bench-suite-expansion.md`](../icebox/bench-suite-expansion.md). Lands when underlying reactivity / hydration work creates a gating need. |
+| Internal hot-path micros (`micro-expression-evaluator`, `micro-signal`, etc.) | Sketched in [`../icebox/bench-suite-expansion.md`](../icebox/bench-suite-expansion.md). Lands when an audit flags a hot-path regression the macro suite missed. |
+| `remove-{first,middle,last}-10` triplet collapse | Rejected, not deferred. Position-aware (head/middle/tail take different splice paths in flat-list reconcile). Keep all three. |
+| `branch_start_sha` schema field | No consumer asking. Cheap to add later. |
+| Main-drift dashboard | Separate concern from peak attribution. Build if/when long-running perf branches make it useful. |
+
+## Track A — Peak Attribution Methodology Fix
+
+### A1. Schema_v2 (and v1 wipe)
+
+`bench-history.json` and the in-memory `pr-history.json` schema_version: **2** (no v1 read path). Per-metric entries:
+
+```json
+{
+ "create-1k": {
+ "ci": [96.1, 97.6],
+ "mean_ms": 96.85,
+ "percent_delta_ci": [-2.5, -1.5],
+ "baseline_sha": "abc1234..."
+ }
+}
+```
+
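+The field is named `baseline_sha` rather than `tip_of_tree_sha` because the baseline is the base-branch tip only on PR runs; on push-to-main runs it is the benched commit's parent. The neutral name stays accurate for both run types, and push-to-main and PR-iteration entries carry the field uniformly.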
+
+The 8 existing v1 entries get wiped — committed as `{"schema_version": 2, "commits": []}` in this PR's diff. v1 reading is dropped from both `loadHistory` and `readOrSeedHistory`; v1 input rejected explicitly. Pre-1.0 honest cleanup beats backward-compat for data that fed the bug.
+
+### A2. Append + fetch capture both numbers
+
+- `append-history.js:loadMetrics` pairs `this-change` with `tip-of-tree` per metric, reads `differences[base.index].percentChange.{low,high}`, persists as `percent_delta_ci`. Same change in `fetch-pr-history.js:loadMetrics`.
+- New `--baseline-sha` CLI flag on `append-history.js`. Workflow resolves and passes:
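+  - PR run (`benchmarks.yml:138`): `git rev-parse FETCH_HEAD`, taken right after fetching the base branch and before its sources are checked out.
+  - Push-to-main (`benchmarks.yml:135`): `git rev-parse HEAD~1`, taken right after the depth-2 fetch and before the parent's sources are checked out.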
+- The bench workflow writes `baseline-sha.txt` alongside the tachometer JSON in the upload artifact. `fetch-pr-history.js` reads the sidecar from each downloaded prior run; iterations predating this PR's deployment had v1 entries that the wipe removed, so no historical compatibility surface.
+- `loadHistory` and `readOrSeedHistory` accept v2 only. v1 input throws on append (caller must reset) or returns null on read (graceful no-history path engages).
+
+### A3. Reporter switches peak attribution to percent-delta
+
+`computeHistoryStatus` (line 733) currently picks peak by lowest absolute `ci[1]`. Change to: peak = entry with most-negative `percent_delta_ci[1]` upper bound.
+
+WIN / REOPENED / TIED-PEAK comparison runs on percent-delta CIs instead of absolute CIs. Cross-session environmental variance is divided out at each end. `delta_from_peak_pct` becomes (current pct-delta midpoint) − (peak pct-delta midpoint) — a meaningful "you regressed N percentage points of improvement" number.
+
+Status taxonomy unchanged in name, more honest in computation.
+
+### A4. `--scope pr` flag
+
+`benchmarks-report.yml:58-61` overlays main's `bench-history.json` onto the PR checkout before invoking the reporter. Add `--scope pr` to `reporter.js`; comment job invokes it. Drop the "Fetch latest bench-history.json from main" step from the comment job.
+
+Behavioral effect:
+- Test-only / no-prior-bench PRs (#174-style): peak attribution sees only PR-iteration history, which is empty → the "Regressions from peak" section disappears. That is the intended outcome: those rows were the phantom regressions this plan exists to remove.
+- Iterative perf PR (#150-style): peak from PR iterations only → only surfaces "iteration N was better on metric X than current." This is the load-bearing autoresearch signal.
+- Push-to-main runs: untouched. History archival continues as today (now writing v2 entries to a freshly-emptied file).
+
+### A5. Drift flag
+
+When current and peak entries have different `baseline_sha`, quantify the cumulative main-side drift on the metric by walking `bench-history.json` between the two baseline SHAs and combining each main commit's `percent_delta_ci`. Absolute-ms comparison between two main entries would re-introduce the cross-session unreliability the rest of this plan exists to fix; the chain-of-percent-deltas is the only methodologically valid path.
+
+Combine exactly as `∏(1 + pct_i) − 1`, with each `pct_i` expressed as a fraction (2% → 0.02); for small values the plain sum `Σ pct_i` is a close approximation. Threshold: cumulative drift ≥ ~5pp triggers the flag. Below that, drift is in the runner-noise floor across the chain and would clutter every long-running PR.
+
+```markdown
+| metric | current | peak | vs peak | bisect candidates |
+| `create-1k` | -2% ⚠️ | -10% @ `abc1234` | regressed +8pp | `def5678`, `9abc012` |
+
+⚠️ main moved +6pp on this metric between baselines (`abc1234` → `def5678`,
+ chained across 4 main commits). Comparison may include main-side change.
+```
+
+Use the existing severity emoji slot for visual consistency with the Faster/Slower section style. One footnote per flagged row.
+
+**Chain-gap handling.** If main commits between the two baselines lack `percent_delta_ci` (or main-history is empty — the day-zero case after the wipe), the chain can't be fully computed. Render the flag without a magnitude:
+
+```markdown
+⚠️ main moved between baselines; drift magnitude unavailable
+ (0/N entries available in chain). Comparison may include main-side change.
+```
+
+Honest about the data gap; the disclosure still fires. Fixtures cover the chain-walking *logic*; real history grows the quantification *utility* over weeks as v2 entries accumulate (~3-4 weeks for full chain coverage on most PR windows).
+
+## Track B — Suite Cleanup (independently shippable)
+
+### B1. `toggle-{first,last}-10` collapse
+
+Per the original suite-rationalization rationale: `toggle` operations are not position-aware (same code path regardless of position in the list). Three metrics measure the same thing.
+
+- `tachometer-ci-todo-micro.json`: drop `toggle-first-10` and `toggle-last-10` measurement entries from both `this-change` and `tip-of-tree` benchmark blocks.
+- `bench-todo.js`: drop the corresponding `performance.measure` calls.
+
+`remove-{first,middle,last}-10` stays as three metrics — those ARE position-aware (head/middle/tail splice paths differ in flat-list reconcile).
+
+### B2. `timeout` 3→2 minutes (conditional)
+
+Validate first against the last ~10 push-to-main runs' wall-clock for the slowest matrix cell. Quick `gh` CLI check (`gh run list` exposes no `jobs` JSON field, so per-job timings come from `gh run view`):
+
+```bash
+gh run list --workflow=benchmarks.yml --branch main --limit 20 --json databaseId \
+  --jq '.[].databaseId' | xargs -I{} gh run view {} --json jobs --jq '...'
+```
+
+If 95th percentile of the slowest cell is comfortably under 2 minutes (with ~30s head-room for tachometer's auto-sample tail to converge metrics that need it), ship the knob. Otherwise keep at 3 — wall-clock is not the binding constraint today.
+
+## Sequencing — Two PRs
+
+| PR | Scope | Inline-validatable? |
+|---|---|---|
+| 1 | Track A entire (A1–A5): schema_v2 writes + reads v2 only, peak switch, `--scope pr`, drift flag, `bench-history.json` reset | Bench job validates inline (PR head's workflow runs the new writers and tests). The comment job runs via `workflow_run` which uses main's reporter copy — comment behavior validates only after merge. Mitigation in Risk below. |
+| 2 | Track B: B1 + B2 | ✓ — `pull_request` event uses PR head's workflow. |
+
+PR 2 lands whenever — independent of PR 1.
+
+## Files Touched
+
+| File | PR | Change |
+|---|---|---|
+| `tools/ci/bench/reporter/append-history.js` | 1 | Pair `this-change` + `tip-of-tree` to extract `percent_delta_ci`; accept `--baseline-sha`; `readOrSeedHistory` accepts v2 only. |
+| `tools/ci/bench/reporter/fetch-pr-history.js` | 1 | Same extraction; read `baseline-sha.txt` sidecar from downloaded artifacts. |
+| `tools/ci/bench/reporter/reporter.js` | 1 | `loadHistory` accepts v2 only; `computeHistoryStatus` operates on `percent_delta_ci`; `--scope pr` flag; drift-flag rendering with chain-of-percent-deltas. |
+| `tools/ci/bench/reporter/bench-history.json` | 1 | Wipe to `{"schema_version": 2, "commits": []}`. |
+| `tools/ci/bench/reporter/append-history.test.js` | 1 | Update to v2 expectations; add tests for `--baseline-sha` plumbing and v1 rejection. |
+| `tools/ci/bench/reporter/reporter.test.js` | 1 | Update `fixtures/history-sample.json` to v2; add tests for percent-delta peak, `--scope pr`, drift flag, chain-gap handling. |
+| `.github/workflows/benchmarks.yml` | 1 | Resolve baseline SHA from baseline checkout; write `baseline-sha.txt` to results dir; include in upload artifact path. |
+| `.github/workflows/benchmarks-report.yml` | 1 | Pass `--baseline-sha` to `append-history` (read from artifact sidecar); drop main-history overlay step on comment job; add `--scope pr` to reporter call. |
+| `packages/component/bench/tachometer/tachometer-ci-todo-micro.json` | 2 | Drop `toggle-first-10`, `toggle-last-10` entries from both benchmark blocks. |
+| `packages/component/bench/tachometer/bench-todo.js` | 2 | Drop the corresponding `performance.measure` calls. |
+| `packages/*/bench/tachometer/tachometer-ci-*.json` (5 files) | 2 (conditional) | `timeout` 3 → 2 if validation passes. |
+
+## Sessions (estimated)
+
+1. **PR 1 — methodology fix end-to-end** (~7-9h pair). Schema_v2 capability, peak attribution switch, `--scope pr`, drift flag with chain-of-percent-deltas, fixture + test rebuild, workflow plumbing. Bigger diff but no inter-PR timing dependency.
+2. **PR 2 — suite cleanup** (~2h pair). Triplet drop is mechanical; knob change is gh-api validation + JSON edits. Independent.
+
+## Risk
+
+Bench infrastructure, not user-facing framework code. Blast radius: bench bot comments and the JSON adjunct that agents consume.
+
+- **Comment regression on in-flight PRs**: PR 1 changes the comment shape on every active PR's next bench run after merge. Cosmetic, not blocking — reviewers see fewer phantom regressions, not more.
+- **`workflow_run` constraint on PR 1's reporter changes**: doesn't validate inline (`workflow_run` always uses main's workflow copy, not the PR head's). The bench job validates via inline fixtures and the test suite, but the live comment a reviewer sees on PR 1 itself is still rendered by main's pre-merge reporter. Mitigation:
+ - Offline fixture coverage. `tools/ci/bench/reporter/fixtures/` extends with v2 history fixtures plus handcrafted tachometer JSON exercising the drift flag, `--scope pr` against empty PR-iteration history, and the chain-gap rendering case.
+ - Shadow-mode validation. Run the new reporter offline against ~10 prior merged PRs' artifact sets via `gh run download`; compare to the posted comments and investigate every disagreement before merging.
+ - Land at a quiet window (no active perf PRs in flight). Immediately open a trivial follow-up PR touching `packages/**` so the first real bench run after merge exercises the new reporter against real data.
+ - Prepare the revert commit before merging; target revert latency is minutes, not hours.
+
+## Dependencies
+
+None. The two PRs are independently revertable.
+
+## Open Questions
+
+None.
diff --git a/ai/plans/bench-reporter-overhaul.md b/ai/plans/bench-reporter-overhaul.md
deleted file mode 100644
index 9c33d8e60..000000000
--- a/ai/plans/bench-reporter-overhaul.md
+++ /dev/null
@@ -1,227 +0,0 @@
-# Bench Reporter Overhaul — Correctness & Suite Rationalization
-
-## Goal
-
-Coordinate two outstanding bench-bot improvements that need to land together:
-
-- **Track A — Peak attribution correctness.** Fix phantom "regressions from peak" caused by cross-session absolute-ms comparisons. Store within-session percent-delta CIs, scope peak to PR iterations, flag tip-of-tree drift.
-- **Track B — Suite rationalization remainder.** Finish the still-outstanding piece of the original `tachometer-overhaul` design: collapse non-position-aware triplets, add the fine-grained-reactivity / nested-mutation micro-benches, reorganize configs around what's measured rather than benchmark origin.
-
-Both tracks touch `bench-history.json` and the configs that index into it. Coordinated landing avoids schema/metric-rename collisions and lets the suite reorg's new metrics start writing v2 entries from day one.
-
-This plan supersedes [`icebox/tachometer-overhaul.md`](icebox/tachometer-overhaul.md) for active planning. The icebox file stays as historical design context — its principles section, status taxonomy rationale, and PR A / PR C history are referenced rather than repeated here.
-
-## Background
-
-The original `tachometer-overhaul` design landed in three coordinated PRs:
-
-| PR | Scope | Status |
-|---|---|---|
-| **A** | CI parallelization — matrix-per-config, concurrency group, per-bench cap | **Shipped.** |
-| **C** | In-house Node reporter (`tools/ci/bench/reporter/`) replacing `tachometer-reporter-action@v2` | **Shipped.** |
-| **B** | Suite rationalization + knob tuning | **Partial.** |
-
-What shipped from PR B:
-- `autoSampleConditions: ["2%"]` across all configs.
-- `tachometer-ci-hydrate.json` with `hydrate-each-100` (a partial of the original `hydrate-1000-card` design).
-- Some triplet collapses (`filter-active`/`completed`/`all` → `filter-cycle-20`).
-
-Still outstanding from PR B:
-- Story-driven config reorg (configs are still origin-named: `krausest`, `todo`, `todo-micro`, `hydrate`).
-- Remaining triplet collapses (`toggle-first`/`middle`/`last` still all present).
-- New micro-benches: `wake-count-single-key`, `nested-mutation`. (`hydrate-1000-card` partially covered by `hydrate-each-100`; could amplify.)
-- `timeout` cap from 3 → 2 minutes per config.
-
-A separate methodology bug surfaced after PR C shipped, in PR #174 (test/templating, no perf changes): 23 phantom "regressions from peak" against an anomalous-fast main commit (#162). The shipped reporter at `tools/ci/bench/reporter/reporter.js:733` (`computeHistoryStatus`) merges main-commit history with PR-iteration history and picks peak as the lowest absolute CI upper bound across the merged set. Cross-session absolute-ms compare is what tachometer's design specifically warns against — only same-session round-robin produces tight cross-run CIs. The schema designed in PR C (`schema_version: 1`) stores only absolute `this-change` CI, discarding the percent-delta from `differences[]` that's the actually-comparable cross-iteration number.
-
-The two tracks interact at the bench-history layer: A bumps the schema; B renames metrics and adds new ones. A clean rollout lands A's schema-write capability first so B's reorganized configs accumulate v2-shape entries from their first push.
-
-## Track A — Peak Attribution Correctness
-
-### A1. Schema bump — store within-session-tight numbers
-
-`bench-history.json` and the in-memory `pr-history.json` schema_version → 2. Per-metric entries gain:
-
-```json
-{
- "create-1k": {
- "ci": [96.1, 97.6], // existing — absolute this-change CI
- "mean_ms": 96.85, // existing — derived
- "percent_delta_ci": [-2.5, -1.5], // NEW — same-session round-robin's % vs tip-of-tree
- "tip_of_tree_sha": "abc1234..." // NEW — SHA tip-of-tree pointed at when bench ran
- }
-}
-```
-
-`percent_delta_ci` is the within-session-tight number tachometer warrants. Comparable across iterations when tip-of-tree is pinned. The `tip_of_tree_sha` lets the reporter detect main movement between iterations and flag confounded comparisons.
-
-Existing `ci`/`mean_ms` (absolute `this-change`) stay for context and the cross-main-commit "did this commit improve over its parent" view (the original design's principle 3 is sound for that surface).
-
-### A2. Append-history extracts both numbers
-
-`append-history.js:64-83` (`loadMetrics`) currently filters to `this-change` and stores only its mean CI. Update to walk both `this-change` and `tip-of-tree` entries per metric, extract percent-delta from `differences[]` (the same array `reporter.js:137` already reads for current-vs-base), and record the tip-of-tree SHA passed in via new `--tip-of-tree-sha` flag.
-
-Tip-of-tree SHA is known at bench time:
-- **PR run** (`benchmarks.yml:138`): `git rev-parse FETCH_HEAD` after the baseline checkout.
-- **Push-to-main run** (`benchmarks.yml:135`): `git rev-parse HEAD~1`.
-
-`fetch-pr-history.js:91-116` does the same extraction for prior PR-iteration runs.
-
-### A3. Reporter peak attribution operates on percent-delta
-
-`computeHistoryStatus` (reporter.js:733) currently picks peak as the commit with the lowest absolute CI upper bound. Switch to: peak is the commit with the most-negative percent-delta upper bound on `metrics[name].percent_delta_ci`.
-
-Status taxonomy unchanged (WIN / TIED-PEAK / REOPENED), now operating on within-session-tight numbers at both ends. Cross-session environmental variance is divided out at each end. Methodologically clean to within tachometer's design contract.
-
-The JSON adjunct's `delta_from_peak_pct` becomes the difference between current's percent-delta midpoint and peak's percent-delta midpoint — a meaningful "you regressed N percentage points of improvement" number.
-
-### A4. Scope peak to PR iterations only on PR comments
-
-`benchmarks-report.yml:58-61` currently fetches main's `bench-history.json` into the reporter's working directory before invoking the reporter. This merges main-commit history with PR-iteration history at peak-attribution time.
-
-Add a `--scope pr` flag to reporter.js that bypasses main-history loading. The comment job invokes it. Drop the "Fetch latest bench-history.json from main" step.
-
-Behavioral effect:
-- Tests-only / no-prior-bench PRs → peak attribution empty → "Regressions from peak" section gone.
-- Iterative perf PR → peak from PR iterations only → surfaces "iteration N was better on metric X than current."
-
-### A5. Tip-of-tree drift flag
-
-When current and peak entries have different `tip_of_tree_sha`, render a flag on the row noting main moved during PR lifetime. Threshold: ~5% of metric magnitude in absolute-ms shift (below that, main movement is in the runner-noise floor anyway).
-
-```markdown
-| metric | current | peak | vs peak | bisect candidates |
-| `create-1k` | -2% (≠main¹) | -10% @ `abc1234` | regressed +8pp | `def5678`, `9abc012` |
-
-¹ tip-of-tree differs between current and peak — main moved by Δ ms during PR lifetime; comparison may include main-side change.
-```
-
-Lean: flag, don't drop. Reviewers can interpret a flagged row better than they can act on a missing one.
-
-## Track B — Suite Rationalization Remainder
-
-### B1. Story-driven config reorganization
-
-Replace origin-named configs with story-driven ones — the question reviewers ask, not which file the bench came from.
-
-| New config | Metrics (drawn from existing configs / bench files) |
-|---|---|
-| `tachometer-ci-rendering-throughput.json` | `create-1k`, `create-10k`, `append-1k`, `bulk-add-500`, `add-20`, `clear-10k`, `swap-rows-20` |
-| `tachometer-ci-reactivity.json` | `update-10th-10`, `toggle-middle-10` (collapsed from triplet — see B2), `toggle-all-20`, `toggle-10`, `edit-start-10`, `edit-cycle-5`, plus new `wake-count-single-key`, `nested-mutation` |
-| `tachometer-ci-structural-changes.json` | `remove-row-{front,middle,back}-N`, `remove-{5-front,10-middle,5-back}`, `remove-middle-10` (collapsed from triplet — see B2), `filter-cycle-20`, `clear-completed-250`, `select-40` |
-| `tachometer-ci-hydration.json` | `hydrate-each-100` (existing); future `hydrate-1000-card` if/when added |
-
-Old configs (`tachometer-ci-krausest`, `tachometer-ci-todo`, `tachometer-ci-todo-micro`, `tachometer-ci-hydrate`) are deleted. `discover.js` glob-discovers `tachometer-ci-*.json` so the matrix updates without workflow edits.
-
-The bench JS files (`bench-krausest.js`, `bench-todo.js`, `bench-hydrate.js`) keep their fixture identities — krausest still mirrors the external js-framework-benchmark contestant, todo is still TodoMVC. The reorg is at the *config* layer (which metrics get measured under which story heading), not at the bench-file layer.
-
-### B2. Triplet collapses
-
-Per the original design's "position-aware vs not" rationale:
-
-- **Position-aware → keep**:
- - `remove-row-{front,middle,back}-N` (different paths in keyed reconcile + array splice)
- - `remove-{5-front,10-middle,5-back}` (same)
-- **Not position-aware → collapse to one**:
- - `toggle-{first,middle,last}-10` → `toggle-middle-10`. Same code path regardless of position.
-- **Borderline — open question (see #8)**: `remove-{first,middle,last}-10` in `tachometer-ci-todo-micro`.
-
-### B3. New micro-benches
-
-- **`wake-count-single-key`**: mutate one key on one item in a 1000-item each. Asserts on wake count via `Reaction.setTracing()` counter, emitted as `performance.measure('wake-count-single-key', ...)` with the count encoded as ms (1ms × count).
-- **`nested-mutation`**: `items[i].nested.x = v` on a 1000-item list with nested objects. Measures the coarse-notify path; gates the freeze-default design choice.
-- **`hydrate-1000-card`** (optional): full SSR + hydrate end-to-end at 1000-card scale. Largely covered by amplifying `hydrate-each-100` to N=1000 — confirm whether the existing bench at higher scale satisfies the original intent or a separate fixture is needed.
-
-### B4. Knob tuning final pass
-
-- `autoSampleConditions: ["2%"]` already shipped across all configs. ✓
-- Outstanding: `timeout` 3 → 2 minutes. Validate first against last ~10 main runs' wall-clock to confirm the cap doesn't truncate convergence on the longest-running config. Quick `gh api` / `jq` script.
-
-## How the Tracks Interact
-
-**Schema migration must precede metric renames.** A1 (schema_v2 capability) ships first. Then B1's reorganized configs accumulate v2-shape entries from their first main push. Old metric names (e.g. `toggle-first-10`, `toggle-last-10`) become orphan v1 entries in history; A3's reporter ignores them (no current metric named that to compare against).
-
-**Peak attribution coverage on new metrics is delayed.** A new bench (e.g. `wake-count-single-key`) gets its first v2 entry on the main push that adds it, then accumulates one entry per main commit. Peak attribution kicks in once the PR-iteration history (or main history) has at least one entry for that metric. Same as today's add-a-bench behavior; no special handling needed.
-
-**`discover.js` matrix is glob-based.** Renaming configs (`tachometer-ci-krausest.json` → `tachometer-ci-rendering-throughput.json`) doesn't require workflow edits. The matrix output names update naturally; PR check titles change, which is desirable.
-
-**Test fixtures touched by both tracks.** `reporter.test.js` fixtures (`real-delta`, `zero-delta`) currently mirror the old origin-named configs (`renderer-tachometer-ci.json`, etc.). After B1 the fixture filenames update. A also updates `history-sample.json` to v2. Coordinate the fixture changes so each PR's tests run green.
-
-**Shared `tip_of_tree_sha` plumbing.** A's `--tip-of-tree-sha` workflow output is computed once and consumed by both append-history (Track A) and the reporter step. B's config rename has no effect on the plumbing — it's per-metric, not per-config.
-
-## Rollout — combined ordering under the `workflow_run` constraint
-
-Reporter changes only take effect once merged. Same constraint the original `tachometer-overhaul` plan called out for PR C. Rollout order matters for the schema → suite-reorg → behavior-change progression:
-
-| Stage | Track | Scope | Validates inline? |
-|---|---|---|---|
-| **PR 1** | A | Schema_v2 read+write capability. New main pushes write v2 entries; reporter reads v2 transparently but doesn't yet use it for peak attribution. No PR-comment behavior change. | Schema-write ✓ on push-to-main; comment unchanged. |
-| **PR 2** | B | Suite rationalization: config reorg, triplet collapses, knob `timeout` final pass. Existing reporter renders new configs unchanged. New metrics begin accumulating v2 entries from first push. | ✓ — `pull_request` event uses PR head's workflow. |
-| **PR 3** | A | Peak attribution switch to percent-delta. `--scope pr` flag. Workflow drops main-history fetch on comment job. Tip-of-tree drift flag rendered. | ✗ — `workflow_run` uses main's copy. Validate via offline fixtures + post-merge acceptance test (trivial follow-up PR). |
-| **PR 4 (optional)** | B | New micro-benches (`wake-count-single-key`, `nested-mutation`). Independent of A; lands when the underlying perf work needs them. | ✓ — `pull_request`. |
-
-PR 1 → PR 2 ordering: schema-write capability lands first so B's new configs write v2 entries from the start.
-PR 1 → PR 3 ordering: schema_v2 must be writing for ~10 main pushes before PR 3 has data to read.
-PR 2 ↔ PR 4 are independent of each other.
-
-Each PR is independently revertable.
-
-## Open Questions
-
-1. **v1→v2 entry migration on read.** Stay v1-shape or rewrite on read? Lean: stay v1; let v2 accumulate organically. (Track A.)
-2. **Schema_v1 graceful-degrade in reporter.** Fall back to absolute peak attribution, or surface no peak section? Lean: no peak section; absolute peak is what we're retiring. (Track A.)
-3. **Branch-start anchoring.** Add as a third schema field, or defer? The original `tachometer-overhaul` design tracked branch-start as a stable reference for "this branch's progress" (principle 4). Lean: defer; user's stated intent satisfied without it. (Track A.)
-4. **Tip-of-tree drift threshold.** What absolute-ms shift triggers the confound flag? Lean: ~5% of metric magnitude. (Track A.)
-5. **Main-drift on a separate dashboard.** Build, or leave untracked? Lean: defer; track separately if/when needed. Could become its own P-track plan. (Track A.)
-6. **Story-driven config naming.** `rendering-throughput`, `reactivity`, `structural-changes`, `hydration` are the original design's names. Confirm or revise. (Track B.)
-7. **`select-40` placement.** Original design called select "structural"; current bench treats it as part of krausest's keyed-table workflow. Reactivity vs structural-changes is borderline. Confirm. (Track B.)
-8. **Triplet collapse for `remove-{first,middle,last}-10` in `todo-micro`.** Original design said collapse all not-position-aware triplets; remove operations on a flat list ARE position-aware (head/tail vs middle take different splice paths). Keep all three or collapse to middle? Lean: keep — they're position-aware. (Track B.)
-9. **Wake-count instrumentation path.** Emit count as ms-encoded measurement via `performance.mark`, or extend tachometer with custom measurement type? Lean: ms-encoded (no upstream patch). (Track B.)
-10. **Knob `timeout` 3 → 2 minutes.** Validate against last ~10 main runs first. Quick gh-api script before committing. (Track B.)
-
-## Files Touched
-
-| File | PR | Change |
-|---|---|---|
-| `tools/ci/bench/reporter/append-history.js` | 1 | Extract `percent_delta_ci` + `tip_of_tree_sha`; write `schema_version: 2`. |
-| `tools/ci/bench/reporter/fetch-pr-history.js` | 1 | Same extraction for PR-iteration runs. |
-| `tools/ci/bench/reporter/reporter.js` | 1, 3 | PR 1: schema_v2 read support, no behavior change. PR 3: peak attribution on percent-delta, `--scope pr` flag, tip-of-tree drift flag rendering. |
-| `.github/workflows/benchmarks.yml` | 1 | Compute and emit tip-of-tree SHA from baseline checkout (workflow output). |
-| `.github/workflows/benchmarks-report.yml` | 1, 3 | PR 1: pass `--tip-of-tree-sha` to append-history. PR 3: drop "Fetch latest bench-history.json from main" step in comment job; add `--scope pr` to reporter call. |
-| `tools/ci/bench/reporter/reporter.test.js` | 1, 3 | Update `history-sample.json` to v2; add `history-sample-v1.json` for graceful-degrade test; add tests for drift flag and `--scope pr`. |
-| `tools/ci/bench/reporter/append-history.test.js` | 1 | Tests for v2 schema writing. |
-| `packages/component/bench/tachometer/tachometer-ci-{krausest,todo,todo-micro,hydrate}.json` | 2 | Delete. |
-| `packages/component/bench/tachometer/tachometer-ci-{rendering-throughput,reactivity,structural-changes,hydration}.json` | 2 | Create — story-driven configs. |
-| `packages/component/bench/tachometer/bench-{krausest,todo,hydrate}.js` | 2 | Triplet collapse: remove `toggle-first-10` / `toggle-last-10` measurements; keep `toggle-middle-10`. |
-| `packages/reactivity/bench/tachometer/bench-wake-count.js` (new) | 4 | `wake-count-single-key` micro. |
-| `packages/reactivity/bench/tachometer/bench-nested-mutation.js` (new) | 4 | `nested-mutation` micro. |
-| `tools/ci/bench/reporter/fixtures/real-delta/*.json` | 2 | Rename to match new config naming. |
-| `tools/ci/bench/reporter/bench-history.json` | — | Auto-updated as main pushes accumulate v2 entries. No manual touch. |
-
-## Dependencies
-
-None blocking. PRs are independently revertable. Either track can stall without blocking the other.
-
-## Risk
-
-Bench infrastructure, not user-facing framework code. Blast radius is the bench bot comments and the JSON adjunct that agents consume.
-
-- **Comment regression for in-flight PRs**: PR 3 changes the comment shape on every active PR's next bench run. Cosmetic, not blocking — reviewers see fewer phantom regressions. No data loss.
-- **Schema migration race**: PR 1 must merge before PR 3 lands. Otherwise PR 3's reporter looks for `percent_delta_ci` in a v1 history. Open Question 2's no-peak-section graceful-degrade covers this — worst case, the section is empty for the gap window.
-- **Metric-rename history orphans**: PR 2's triplet collapses retire `toggle-first-10` / `toggle-last-10`. Their existing v1 history entries become orphans (no current metric to compare). Reporter ignores them naturally — no current metric named that means no peak attribution lookup. No remediation needed.
-- **`workflow_run` constraint**: as with the original `tachometer-overhaul` PR C, PR 3 doesn't validate inline. Mitigation: thorough offline test coverage; merge during a quiet window; have revert ready.
-
-## Status
-
-`initial` — combines the original `tachometer-overhaul` PR B remainder and the newly-identified peak-attribution correctness work. Ten open questions are real design calls; ~45-min pair to resolve them upgrades to `scoped`. Implementation surface is concrete (~10 source files across the four PRs, modest LOC each).
-
-Total estimate post-scoping: 16-24h pair across 4 PRs (PR 4 optional and independent).
-
-Supersedes [`icebox/tachometer-overhaul.md`](icebox/tachometer-overhaul.md) for active planning.
-
-## Sessions (estimated, post-scoping)
-
-1. **PR 1** (Track A schema_v2 capability): append-history + fetch-pr-history extract percent-delta + tip-of-tree SHA; workflow plumbing; reporter reads v2 transparently; fixture + tests. ~4-5h pair.
-2. **PR 2** (Track B suite reorg): four story-driven configs replace origin-named ones; `toggle-{first,last}-10` collapse to middle; `timeout` 3→2 (after validation); fixtures rename. ~4-6h pair.
-3. **PR 3** (Track A peak switch): `computeHistoryStatus` operates on percent-delta CIs; `--scope pr` flag; workflow drops main-history fetch on comment job; tip-of-tree drift flag rendering. ~3-4h pair.
-4. **PR 4** (Track B new micros, optional and independent): `wake-count-single-key`, `nested-mutation`. Lands when the underlying reactivity work needs them. ~3-5h pair.
diff --git a/ai/plans/icebox/bench-suite-expansion.md b/ai/plans/icebox/bench-suite-expansion.md
new file mode 100644
index 000000000..f8668f048
--- /dev/null
+++ b/ai/plans/icebox/bench-suite-expansion.md
@@ -0,0 +1,58 @@
+# Bench Suite Expansion
+
+## Status
+
+`initial` — drafted, not on the active roadmap. Lands when underlying perf work creates a gating need, or when an audit identifies a regression the current suite missed. The current suite is comprehensive at the macro level; these are surgical adds.
+
+## Goal
+
+Expand bench coverage where the current suite has known gaps. Two tracks: file-scoped micro-benches for hot paths the macro suite can't isolate signal on, and end-to-end benches for reactivity and hydration patterns the current suite doesn't exercise.
+
+## Track 1 — Internal hot-path micros (file-scoped coverage)
+
+Macro suites tell the *product* story (user-observable latency); micros tell the *implementation* story (per-op cost). A 20% regression in `expression-evaluator.js` shifts `update-10th` by maybe 2-3% — below the noise floor. A PR touching only `expression-evaluator.js` gets no meaningful signal from the end-to-end suite.
+
+Candidates — one config per hot-path file:
+
+| Config | Covers | Operations per sample |
+|---|---|---|
+| `micro-expression-evaluator` | `packages/renderer/src/expression-evaluator.js` | Simple identifier, dotted path, Lisp helper, JS eval, mixed |
+| `micro-signal` | `packages/reactivity/src/signal.js` | `set(same)` fast path, `set(changed)`, `notify` with N subscribers, sub/unsub churn |
+| `micro-reaction-scheduler` | `packages/reactivity/src/reaction.js` | `flushTask`, microtask coalescing, dependency-set diffing, nested-reaction teardown |
+| `micro-template-compiler` | `packages/templating/src/*` | Parse (cold), parse (cached), AST walk, snippet args extraction |
+| `micro-build-html-string` | `packages/renderer/src/build-html-string.js` | Fragment serialization, attribute binding scan, DSD marker emission |
+| `micro-dom-walker` | `packages/renderer/src/engines/native/renderer.js` (`bindMarkers` walker) | Single-pass SHOW_ELEMENT / SHOW_COMMENT over 1000-node tree, `blockDepth` skip, per-item marker adoption |
+
+Weight bench design by production distribution — 79% of production template expressions are property lookup (simple identifier + dotted path), 19% Lisp helpers, 2% JS eval. A 10% improvement on simple identifiers has more real-world impact than a 2× improvement on complex Lisp.
+
+## Track 2 — Reactivity / hydration adds
+
+Three benches that fill gaps in the current end-to-end suite, each tied to a specific design direction:
+
+- **`wake-count-single-key`** — mutate one key on one item in a 1000-item each. Asserts on wake count via `Reaction.setTracing()` counter. Directly exposes the fine-grained-reactivity win when that work lands.
+- **`nested-mutation`** — `items[i].nested.x = v` on a 1000-item list with nested objects. Measures coarse-notify path; gates the freeze-default design choice.
+- **`hydrate-1000-card`** — full SSR + hydrate end-to-end at 1000-card scale. Likely subsumed by amplifying `hydrate-each-100` to N=1000; confirm before building separately.
+
+## Methodology constraint
+
+Only compare percent-deltas across runs, never absolute ms across sessions. Each new config follows the `this-change` / `tip-of-tree` round-robin pattern; cross-iteration comparison uses `differences[].percentChange`. Same rule the rest of the bench infrastructure operates under.
+
+## Open Questions
+
+1. **Wake-count instrumentation path.** Emit count as ms-encoded measurement via `performance.measure` (no upstream patch), or extend tachometer with a custom measurement type. Lean: ms-encoded.
+2. **Nested-mutation setup contract.** Reuse nested objects across mutations (measures `isEqual` path) or freshly spread each time (measures allocation path). Probably two benches under a common umbrella.
+3. **Hydration bench scaling.** Amplify `hydrate-each-100` to N=1000, or build a separate fixture? Amplify first; build a separate fixture only if format-change bench instability becomes a problem.
+4. **Triggering for micros.** Always-run on every PR, or path-based filtering? Lean: always-run — cross-file regressions are real (a `signal.js` change can move expression-evaluator's observed cost). If CI cost becomes a concern, use a `[skip-micro]` commit message tag rather than path-based routing.
+
+## When this lands
+
+Each track triggers independently:
+
+- **Track 1** lands when an audit flags a hot-path regression the macro suite missed, OR when starting a perf pass on a file the macro suite can't isolate signal for.
+- **Track 2** lands when the underlying reactivity / hydration work creates a gating need (fine-grained reactivity for `wake-count-single-key`; freeze-by-default for `nested-mutation`; hydration scaling for `hydrate-1000-card`).
+
+No speculative builds — the suite is already comprehensive at the macro level.
+
+## Dependencies
+
+None. Each bench in either track is independently shippable.
diff --git a/ai/plans/icebox/tachometer-overhaul.md b/ai/plans/icebox/tachometer-overhaul.md
deleted file mode 100644
index 90b8cd154..000000000
--- a/ai/plans/icebox/tachometer-overhaul.md
+++ /dev/null
@@ -1,816 +0,0 @@
-# Tachometer Overhaul
-
-## Status
-
-**Superseded for active planning by [`../bench-reporter-overhaul.md`](../bench-reporter-overhaul.md)** (ROADMAP P15).
-
-PR A (CI parallelization) and PR C (in-house Node reporter) shipped from the original design. PR B (suite rationalization + knob tuning) was partially absorbed (`autoSampleConditions: ["2%"]`, partial triplet collapses, `tachometer-ci-hydrate.json`) and partially carried forward into the active plan as **Track B** of the overhaul.
-
-A separate methodology bug surfaced after PR C shipped — peak attribution operating on cross-session absolute ms produces phantom "regressions from peak" on PRs without perf changes (PR #174 surfaced 23 of these). The fix lives in the active plan as **Track A** (schema_v2 with within-session percent-delta + tip-of-tree SHA, `--scope pr` flag, tip-of-tree drift flag).
-
-This file remains as historical design context — the principles, status taxonomy rationale, JSON schema design, and PR A / PR C execution playbooks are referenced by the active plan rather than repeated. Read this for the *why* behind decisions in the active plan; read the active plan for what's getting built next.
-
-The full plan below was the original three-PR design; sections describing PR A and PR C are historical context for what shipped.
-
----
-
-## Original framing
-
-Coordinated overhaul of **what** we measure, **how** the CI runs it, and
-**how the results are reported**. Three changes land together because they
-only pay off in combination: a cleaner suite rendered by the old reporter
-gains little; a new reporter over the current noisy suite gains little;
-and neither is usable without parallel CI to keep wall-clock under 10 min.
-
-Audience is both the PR reviewer and the next autoresearch agent session.
-The guiding rule is **no row without an action pointer** — every line in
-the artifact should either move a decision forward or get cut.
-
-## Framing principles
-
-1. **Agent handoff, not status report.** The artifact must answer "what
- should I try next?" — not "where are we?". Wins and losses without a
- next-action pointer are wasted signal.
-
-2. **Honest CIs only.** Tachometer resolves 95% CIs before emitting a
- number, or marks the result `unsure`. Overlapping CIs = statistical
- tie, not "one run was lucky". Peak = the commit whose CI dominates
- all others on a metric, or the cluster of commits whose CIs overlap
- at the bottom.
-
-3. **Absolute `this-change`, not PR-vs-main delta, for cross-commit
- comparisons.** Main moves under a long-lived perf PR. Deltas across
- commits mix two moving parts; absolute `this-change` CIs for each run
- give the branch's own trajectory cleanly. Delta-vs-main stays as a
- secondary column for context.
-
-4. **Branch-start baseline, not current main, for progress claims.** "How
- far we've come on this branch" needs a stable reference. First run on
- the branch is that reference.
-
-5. **Stories over rows.** Three macro suites organized by *what is being
- measured*, not by benchmark origin. Micros cover internal hot paths
- that end-to-end benches can't isolate.
-
-6. **Parallel, fast feedback.** One CI job per config, ≤10 min wall-clock,
- per-check status, edit-in-place comment. Slow feedback kills iteration.
-
-## Suite rationalization
-
-### Problem with the current suite
-
-27 benchmarks across three configs (`tachometer-ci.json`,
-`tachometer-ci-todo.json`, `tachometer-ci-todo-micro.json`). Watching it
-move across 17 commits on `perf/native` revealed two issues:
-
-1. **Redundancy without positional relevance.** Triplets like
- `filter-all/active/completed` and `toggle-first/last/middle` move
- within ±1-5pp of each other on nearly every commit. They triangulate
- noise but add nothing that one representative wouldn't catch —
- at 3× the noise budget and 3× the comment surface area.
-
-2. **Coverage gaps on framework-relevant patterns.** Fine-grained
- reactivity (wake count on a single-key mutation), nested mutation
- (`items[i].nested.x = v`), and SSR hydration end-to-end are all
- invisible to current PR CI.
-
-### Rationale for which positional triplets stay
-
-Keep positional triplets *only* when the algorithm under test is
-position-aware — i.e. when front/middle/back exercise different code
-paths or data layouts.
-
-- **Remove operations are position-aware.** Head removal vs splice vs
- tail pop take different paths in both the reconcile loop (marker
- bookkeeping, DOM removal order) and the underlying state array.
- **Keep** `remove-first/middle/last` (micro) and
- `remove-5-front/middle/back` (macro).
-- **Toggle-* operations are NOT position-aware.** Same code path
-  regardless of the item's position. **Collapse** to `toggle-middle` alone.
-- **Filter-* operations are NOT position-aware.** Filter iterates every
- item regardless of result set size. **Collapse** to `filter-completed`
- (most sensitive through the rendering path).
-
-### Cuts
-
-| Drop | Rationale |
-|---|---|
-| `ci/create-10k` | 10× N of create-1k, same op. 1k catches constant-factor regressions; 10k is dominated by allocation overhead that rarely moves independently. |
-| `todo/bulk-add-200` | 4× N of bulk-add-50, same op. Moves in lockstep. |
-| `todo-micro/toggle-first` | Collapsing to `toggle-middle`. |
-| `todo-micro/toggle-last` | Collapsing to `toggle-middle`. |
-| `todo-micro/filter-all` | Collapsing to `filter-completed`. |
-| `todo-micro/filter-active` | Collapsing to `filter-completed`. |
-
-Net: 27 → 21 macro benchmarks before adds.
-
-### Keeps (unchanged — pulling their weight)
-
-- **Krausest-style (6):** create-1k, append-1k, update-10th, select, swap-rows, clear
-- **TodoMVC macro (8):** bulk-add-50, add-20, toggle-10, toggle-all, remove-5-front, remove-5-middle, remove-5-back, clear-completed
-- **TodoMVC micro (7):** toggle-middle, remove-first, remove-middle, remove-last, filter-completed, edit-start, edit-save
-
-### Adds
-
-Reclaim the budget from cuts to cover current gaps:
-
-1. **`reactivity-micro/wake-count-single-key`** — mutate one key on one
- item in a 1000-item each. Assert on wake count, not timing. Ideally
- a `Reaction.setTracing()`-backed counter emitted via
- `performance.mark()` and read out in tachometer via a measurement
- expression. Directly exposes the fine-grained-reactivity win when it
- lands.
-
-2. **`reactivity-micro/nested-mutation`** — `items[i].nested.x = v` on a
- 1000-item list with nested objects. Measures the current coarse-notify
- path vs any future fine-grained scheme. Also the gate for the
- freeze-default design choice — with freeze-on-set this either works
- or throws; either outcome is measurable.
-
-3. **`hydration-macro/hydrate-1000-card`** — full hydration path for
- `/perf/hydrated`: `renderToString()` output into DOM, time to
- `hydrate()` complete, time to first interactive update. Today's
- biggest perf story (the hydration pass on `perf/native` was a ~425ms
- regression that took four plans to close) has no PR-gate signal.
-
-### Story-driven reorganization
-
-Reorganize configs around *what is being measured* rather than *origin
-of benchmark*. Four story-driven suites:
-
-- **`rendering-throughput`** — mount/append/swap/teardown under load.
- create-1k, append-1k, swap-rows, clear, bulk-add-50.
-- **`reactivity`** — update propagation efficiency.
- update-10th, toggle-middle, toggle-all, toggle-10, edit-start,
- edit-save, add-20, wake-count-single-key, nested-mutation.
-- **`structural-changes`** — reordering / removal / filter.
- remove-first/middle/last (micro), remove-5-front/middle/back (macro),
- filter-completed, clear-completed, select.
-- **`hydration`** — SSR + hydrate.
- hydrate-1000-card.
-
-Each suite's PR comment becomes interpretable at a glance: "reactivity
-got 30% faster, structural-changes held, hydration regressed 5%" tells
-a reviewer *where to look*. Today they cross-reference three tables of
-27 mixed-axis rows.
-
-## Internal hot-path micro-benches (file-scoped coverage)
-
-End-to-end benches mask internal-hot-path regressions. A 20% regression
-in the expression evaluator shifts `update-10th` by maybe 2-3%, below
-the noise floor. A PR that only touches `expression-evaluator.js` gets
-no meaningful signal from the end-to-end suite.
-
-### Candidates (one config per hot-path file)
-
-| Config | Covers | Ops per sample |
-|---|---|---|
-| `micro-expression-evaluator` | `expression-evaluator.js` | Lookup (`a.b.c`), JS (`a + b`), helper call, ternary, mixed Lisp/JS |
-| `micro-signal` | `packages/reactivity/src/signal.js` | `set(same)` (equality fast path), `set(changed)`, `notify` with N subscribers, subscribe/unsubscribe churn |
-| `micro-reaction-scheduler` | `packages/reactivity/src/reaction.js` | flushTask, microtask coalescing, dependency-set diffing, nested-reaction teardown |
-| `micro-template-compiler` | `packages/templating/src/*` | Parse (cold), parse (cached), AST walk, snippet args extraction |
-| `micro-build-html-string` | `packages/renderer/src/build-html-string.js` | Fragment serialization, attribute binding scan, DSD marker emission |
-| `micro-dom-walker` | `packages/renderer/src/engines/native/renderer.js` (bindMarkers walker) | Single-pass SHOW_ELEMENT\|SHOW_COMMENT over 1000-node tree, blockDepth skip, per-item marker adoption |
-
-### Triggering strategy
-
-**Run all micro-benches on every PR.** They are cheap (<30s total).
-Arguments for conditional per-file triggering exist (less CI load), but:
-
-- Cross-file regressions are real — a change to `signal.js` can move the
- expression evaluator's observed cost. Always-run catches these.
-- Conditional logic adds workflow complexity for small savings.
-
-If CI cost becomes a concern, use a `[skip-micro]` commit message tag
-rather than path-based routing. Keep the opt-out simple.
-
-### Why micros complement the macro suite
-
-Macro suites tell the *product* story (user-observable latency). Micros
-tell the *implementation* story (per-op cost). Both are needed:
-
-- A 3× wake-count reduction at the Signal layer may show as only 5% in
- `update-10th` because DOM work dominates. Macro undersells the win.
-- An allocation regression in the expression evaluator may not move any
- macro bench measurably but will cause GC pauses on heavy pages.
- Macro misses the regression entirely.
-
-A comment showing "macro suite held, `micro-expression-evaluator`
-regressed 40%" is diagnostically gold: regression is isolated to one
-unit, fix is local.
-
-## CI orchestration
-
-### Time budget — where the 32 min goes today
-
-Runs of the current 27-bench suite take 31-33 min consistently. Three
-coordinated levers close that gap. The suite cut above is lever 1 of 3;
-all three ship together.
-
-Per config today, tachometer does:
-
-1. **Mandatory floor**: `sampleSize` (50) × 2 URLs × N metrics, round-robin
- at ~300-500ms per sample. For a 7-metric config: ~700 samples ≈ 5 min
- just to reach the sample-size floor.
-2. **Auto-sample tail**: up to `timeout` (5 min, micro 3 min) chasing
- every metric's CI against `autoSampleConditions`.
-
-Current configs ask for `autoSampleConditions: ["0%", "10%"]` — both
-"is there any difference?" *and* "is the difference ≥10%?". The `0%`
-condition cannot converge when the true delta is truly zero
-(tachometer's own docs flag this: "if the actual difference is very
-close to a condition, the condition will never be met and the timeout
-will expire"). Every `unsure 🔍 -0% - +0%` verdict is 3-5 min of
-compute producing a non-actionable answer.
-
-Relaxing to `["10%"]` would be the wrong fix: autoresearch *depends* on
-sub-10% signal because small wins stack. A 3% improvement on update-10th
-plus 2% on swap-rows plus 1.5% on clear is a real perf story; with a
-10% floor they're all rounded to "within 10%" and the feedback loop is
-dead.
-
-### Knob tuning (ships with the config rationalization)
-
-| change | current | proposed | effect |
-|---|---|---|---|
-| `autoSampleConditions` | `["0%", "10%"]` | `["2%"]` (start), tighten to `["1%"]` if data supports | kills the zero-convergence tail; floor set by runner noise, not preference |
-| `timeout` (per-config cap) | 5 / 5 / 3 min | 2 min uniform | caps worst case; aligns with the 3-min per-bench cap below |
-| `sampleSize` | 50 | **keep 50** | don't erode the floor when the other knobs pay back enough |
-
-The resolution floor is a **runner-noise question, not a preference
-choice.** On a quiet workstation tachometer narrows well under ±0.5%;
-on shared GHA runners at `sampleSize: 50`, small-time metrics
-(`toggle-*` ~2ms, `edit-save` ~20ms) routinely carry CI widths of
-1-2% relative. Below that floor we can't converge regardless of how
-long we sample.
-
-**Validation step before committing the value:** pull CI widths from
-the last ~10 runs on `perf/native`, look at the distribution per
-metric. If ≥90% of metrics routinely converged to CIs narrower than
-±1%, use `["1%"]`. If the floor is closer to ±2%, use `["2%"]`. Start
-at `["2%"]` as the safe default — a metric resolved at ±2% still
-preserves the stackable autoresearch signal; a metric that won't
-converge produces `unsure` and wastes compute.
-
-Why `["2%"]` keeps autoresearch signal:
-
-- **True delta 5%**: CI narrows to ~[4%, 6%], entirely outside ±2% →
- converges with actual magnitude preserved.
-- **True delta 0.3%**: CI ~[-0.5%, +1.1%], within ±2% → converges as
- "within 2% (noise floor)". Fast, not `unsure`.
-- **True delta 0%**: CI ~[-0.8%, +0.8%], within ±2% → converges
- quickly. No more timeout on zero deltas.
-- **Boundary case**: only metrics with true delta right at ±2% can
- fail — narrow sliver.
-
-A 3% + 2% + 1.5% stack still surfaces: each individual metric
-resolves to its actual magnitude (the CI of a 3% delta is outside ±2%,
-so the output is the real `[2.5%, 3.5%]`, not "within 2%"). The floor
-is only what we *round to* when the true effect is smaller than we can
-distinguish.
-
-For debugging a specific borderline metric, run locally with
-`autoSampleConditions: ["0.5%"]` and a longer timeout — never in CI.
-
-### Combined impact of the three levers
-
-| lever | serial time |
-|---|---|
-| Current (27 benches, `["0%", "10%"]`, 3 serial configs) | ~32 min |
-| + suite cut (27 → 21 benches, 24 with adds, still 3 configs) | ~28 min |
-| + knob tuning (`["2%"]`, 2-min cap) | ~12-15 min |
-| + parallel matrix (one job per story-driven config) | **~5-7 min wall-clock** |
-
-The suite cut alone doesn't hit the target; the knob tuning alone
-doesn't hit the target; parallelization alone doesn't hit the target.
-All three together do.
-
-### Parallel config jobs (matrix strategy)
-
-One GitHub Actions job per tachometer config, running concurrently.
-Wall-clock = duration of the slowest config, not the sum.
-
-- Current slowest configs run ~10-12 min each on the full 27-bench suite.
-- After cut + knob tuning + split: each config runs 3-8 benches with a
- 2-min auto-sample cap. Slowest individual config targets ~5-7 min;
- micros run in ~1-3 min.
-- **Target: under 10 min wall-clock**, driven by the slowest macro
- config (likely `structural-changes` with the remove-* triplets).
-
-### Per-check PR statuses
-
-Each parallel job exposes as a separate check in the PR checks panel.
-A single red X next to `micro-signal` tells a reviewer exactly what
-regressed without opening the comment. This is the piecemeal feedback
-that agents can surface via `gh pr checks` and humans can scan in
-the GitHub UI.
-
-**Check conclusion taxonomy.** Red X means "we measured a regression."
-UNSURE metrics use `neutral` conclusion, not `failure` — collapsing
-"couldn't tell" into the same signal as "confirmed regression"
-misleads both humans and agents. Per-suite rule: any REGRESSED metric
-→ failure; otherwise any UNSURE metric → neutral; otherwise success.
-
-### Budget cap per bench
-
-Hard per-bench timeout (3 min each). If a bench times out, it reports
-`unsure — insufficient samples` and the suite moves on. 9/10 confident
-results in 10 min beats 10/10 in 40 min where some are unrigorous anyway.
-
-### Concurrency control
-
-Add `concurrency: { group: bench-${{ github.ref }}, cancel-in-progress: true }`
-to `benchmarks.yml`. Rapid pushes currently stack runs; an older run
-completing last can overwrite the comment from a newer run. Already
-identified in the earlier CI review; fix lands with this overhaul.
-
-### Tiering (fallback, not recommended yet)
-
-If per-PR time stays painful even after parallelization:
-
-- **Always-run:** micros + high-signal macros (update-10th, toggle-all,
- swap-rows, clear, bulk-add-50).
-- **Nightly (cron on main):** positional remove-* triplets, create-10k,
- full TodoMVC macro.
-
-Downside: regressions in nightly-tier benches surface days later, past
-bisection-cheap. Fallback only.
-
-## Reporter redesign
-
-Replace `andrewiggins/tachometer-reporter-action@v2` with an in-house
-Node script, **capped at ≤300 LOC with a minimal test surface.** If
-it grows beyond that in initial implementation, it's accruing
-maintenance debt that wasn't priced into the replace-vs-depend
-decision — stop, reconsider, possibly find a narrower scope. Reasons
-to own it at all:
-
-- The action's HTML output requires regex parsing. Every agent doing
-  autoresearch runs the same `<table>…</table>` extraction. Native
-  markdown table removes that step entirely.
-- The action buries the commit SHA in HTML `data-*` attributes. Humans
- reading the comment can't tell which commit's numbers they're looking
- at without cross-referencing Checks.
-- The "⏳ results are out of date" banner over stale data misleads both
- humans and agents. Stale data with a banner is worse than no data.
-- We own the format → we can evolve it for agent autoresearch (attempts
- graveyard, REOPENED taxonomy, JSON adjunct) without patching an
- upstream action.
-
-### Comment shape
-
-```
-## Bench — `7efaff9` · Perf: Gate each phase-3 notify on shallow prop diff
-Branch start: `56554b4` · 25 commits · run #42 @ 2026-04-14 18:36Z · [full run ↗](url)
-
-**Wins 14** · Tied-peak 8 · **Regressed 5** · Unexplored 0 · Unsure 0
-
-### Regressed from peak (sorted by severity)
-| metric | current CI | peak CI | peak commit | bisect candidates |
-|-------------------|-----------------|----------------|------------------------------------------|---------------------|
-| update-10th | [13.8, 14.3]ms | [9.5, 10.1]ms | `782d01b` Bug: stringify item keys | `7924af5`, `deb712c`|
-| toggle-last | [2.4, 2.7]ms | [1.7, 1.9]ms | `00f8141` Feat: Add deepFreeze | `7924af5` |
-
-Bisect-candidates column caps at 2-3 (nearest-to-peak + most-recent).
-Full list per metric lives in the JSON adjunct.
-
-### Wins vs branch-start (collapsed)
-<details>
-[full table: metric | current CI | branch-start CI | Δ]
-</details>
-
-### Abandoned attempts
-- `11adcca` Perf: Specialize defineBlock reaction callback — reverted in `0873084`
- - observed: update-10th [x,y] → [a,b], toggle-last [x,y] → [a,b]
-
-### Top commits by net perf impact (wins − regressions caused)
-- `782d01b` Bug: stringify item keys +6 net
-- `9071884` Perf: per-item markers + DOM-reusing first mutation +4 net
-- `7924af5` Perf: Gate each phase-3 notify on shallow prop diff −2 net
-- `deb712c` Perf: Skip itemSignal.notify on freshly-created −1 net
-
-### Machine-readable
-[bench-report.json ↗](artifact-url) — for agent consumption
-```
-
-### Status taxonomy
-
-Every metric is classified into exactly one of:
-
-- **WIN** — HEAD's CI dominates every prior commit's CI (non-overlapping
- below), and dominates branch-start.
-- **TIED-PEAK** — HEAD's CI overlaps the top cluster. No single winner;
- HEAD is in the winning set.
-- **REOPENED** — some earlier commit's CI is below HEAD's CI with
- non-overlapping margin. Metric was improved, then lost. Highest-value
- actionable signal — cherry-pick candidate.
-- **UNEXPLORED** — HEAD's CI overlaps branch-start; no commit on the
- branch beat branch-start on this metric. No progress made.
-- **UNSURE** — tachometer couldn't resolve within timeout (metric
- within `autoSampleConditions` threshold of the comparator). Reported
- separately; not counted as win or loss.
-
-### Structured JSON adjunct
-
-Alongside the markdown comment, attach `bench-report.json` as a workflow
-artifact. Agents run `gh run download` → parse JSON → operate. Schema:
-
-```json
-{
- "head": { "sha": "7efaff9", "msg": "...", "intent_class": "Perf" },
- "branch_start": { "sha": "56554b4", "msg": "..." },
- "run_id": 42,
- "workflow_run_url": "...",
- "suites": {
- "reactivity": {
- "status": "complete | running | failed",
- "metrics": {
- "update-10th": {
- "status": "REOPENED",
- "current_ci": [13.8, 14.3],
- "branch_start_ci": [10.8, 11.2],
- "peak": {
- "sha": "782d01b",
- "msg": "Bug: stringify item keys so adoption actually matches",
- "intent_class": "Bug",
- "ci": [9.5, 10.1]
- },
- "tied_peak_cluster": ["782d01b"],
- "intervening_commits": ["deb712c", "7924af5"],
- "bisect_candidates": ["7924af5"],
- "vs_main_delta": "+30%"
- }
- }
- }
- },
- "abandoned_attempts": [
- { "sha": "11adcca", "msg": "...", "reverted_in": "0873084",
- "observed_impact": {"update-10th": [[9.9, 10.3], [13.8, 14.3]]} }
- ],
- "commit_impact": [
- { "sha": "782d01b", "msg": "...", "wins": 5, "regressions_caused": 0, "net": 5 },
- { "sha": "7924af5", "msg": "...", "wins": 1, "regressions_caused": 3, "net": -2 }
- ]
-}
-```
-
-The markdown comment is a rendered view of this JSON; the JSON is the
-source of truth.
-
-### Persistence: commit `bench-history.json` on merge
-
-Workflow artifacts expire (90 days default). For cross-session agent
-autoresearch spanning multi-week or multi-PR perf branches, the
-history *is* the asset — losing it 90 days after a run kills the
-feedback loop for any long-lived perf initiative.
-
-Solution: on merge of any PR with benchmark artifacts, append the
-final `bench-report.json` to `bench/history/bench-history.json` on
-main. Cheap (a few KB per merge), permanent, and the agent reading
-this file has the full cross-PR history available by default.
-Implement as a post-merge GitHub Action that runs once per merged PR
-that produced benchmark data.
-
-### In-progress state
-
-Replace the "⏳ results are out of date, stale numbers below" pattern
-with an explicit state line, no data rendered:
-
-```
-## Bench — `7efaff9` · Perf: ...
-Status: running (4/10 suites complete) · Last complete run: `11adcca` at 14:23Z
-[partial results available once more suites complete]
-```
-
-Stale data under a "currently running" banner trains readers to
-distrust the comment. No-data is better than misleading-data.
-
-### Piecemeal editing
-
-The collector job from the parallel matrix posts the initial comment
-and edits in place as each suite finishes. Readers see:
-
-1. **t=0**: "Status: running (0/10) · building baseline"
-2. **t=2m**: micros land → table row for each micro suite filled in
-3. **t=5m**: fast macros land → rows fill in
-4. **t=10m**: slowest macro lands → `Status: complete`, full verdict
- line + abandoned-attempts + intent-class summary rendered
-
-Each edit is a full-comment rewrite from the latest JSON state.
-
-## Measurement hygiene (applies to all views)
-
-- **Peak = non-overlapping CI dominance.** If HEAD's CI lower bound
- exceeds candidate-peak's CI upper bound → HEAD regressed. If they
- overlap → tied, no regression claim.
-- **Tied-peak clusters are treated as a set.** Cherry-pick arbitrarily
- within; don't rank within a tie.
-- **Unsure metrics are a third state.** Not wins, not losses. Reported
- in their own bucket in both the markdown and JSON.
-- **Branch-start baseline is the first run's `this-change` CI.** Stable
- reference for the "this branch" story. `vs-main` lives as a secondary
- column for context only.
-
-## Where this lands
-
-### Constraint: `workflow_run` uses main's workflow copy
-
-`benchmarks-report.yml` triggers on `workflow_run`. GitHub always uses
-**main's copy** of a workflow triggered by `workflow_run`, not the PR
-head's. Reporter changes only take effect once merged to main; they
-cannot be validated inline on the PR that introduces them.
-
-`benchmarks.yml` triggers on `pull_request` and uses the PR head's
-copy, so suite and matrix changes do validate inline.
-
-### Staged landing — three PRs, not one
-
-Each PR has independent correctness value and ships separately. The
-10-min wall-clock target is achieved only after all three land, but
-each is validatable on its own and a bug in a later PR doesn't force
-rolling back earlier ones.
-
-**PR A — CI parallelization (unblocks wall-clock immediately).**
-Matrix-per-config + per-check PR status + concurrency group + 3-min
-per-bench cap. Same old suite, same old reporter. The existing
-`tachometer-reporter-action@v2` already handles multi-artifact
-downloads (`path: results/**/*.json`), so this works without reporter
-changes. Validatable inline on the introducing PR — `pull_request`
-event uses the PR head's workflow copy. **Payoff: wall-clock drops to
-the slowest single config (~10-12 min) from 32 min, concurrency bug
-fixed.**
-
-**PR B — Suite rationalization + knob tuning (on top of A).** New
-four-story config layout, 27 → ~22 metrics, `autoSampleConditions:
-["2%"]`, 2-min per-config cap. Still uses the existing reporter
-(it renders any tachometer JSON). Validatable inline. Before
-landing, pull CI widths from ~10 past `perf/native` runs to confirm
-`["2%"]` converges; tighten to `["1%"]` if data supports. **Payoff:
-suite tells a story per config, serial time drops, resolution floor
-honest to the hardware.**
-
-**PR C — Reporter replacement (on top of B).** New Node script, JSON
-adjunct schema, status taxonomy, commit-impact ranking, history
-persistence on merge. Delete `tachometer-reporter-action@v2` usage.
-Cannot be validated inline — `workflow_run` uses main's workflow
-copy, so the new reporter only runs once merged. Acceptance test: the
-landing PR touches a trivial `packages/**` file; the first run after
-merge exercises the new reporter and we read the resulting comment.
-
-**Prep work before PR A: merge `perf/native` to main** — unblocks
-everything downstream and gives us a clean baseline.
-
-**Prep work before PR C: build the reporter locally first.** Pull run
-artifacts from a real PR with `gh run download` as a fixture; iterate
-the Node script + JSON schema + markdown renderer against real data
-until the rendered comment reads correctly. Land only once the
-offline rendering is right.
-
-### Execution walkthrough — per-PR procedure
-
-Short version of the whole loop: **A and B are "PR then merge" with
-inline verification. C is "build rigorously offline, then land, then
-verify on the next real PR."** Rigor is front-loaded for C to replace
-the verification step we can't do normally.
-
-#### PR A — CI parallelization
-
-**Dev**
-
-1. Branch off main. Modify `.github/workflows/benchmarks.yml` —
- matrix-per-config, add `concurrency` block, 3-min per-bench cap,
- separate check name per matrix cell.
-2. Push, open PR. The `pull_request` trigger uses the PR head's
- workflow, so the new matrix runs immediately on this PR.
-
-**Confirm before merge** (all visible in the PR itself)
-
-- Actions tab shows N parallel jobs (`bench-reactivity`,
- `bench-structural-changes`, etc.) instead of one serial job.
-- Checks panel lists each as its own entry — you can see red X on
- just one suite.
-- Wall-clock on the slowest job ≈ what you'd expect (~10-12 min
- today, since suite is unchanged).
-- Push a second commit within a minute of the first — old run should
- cancel (concurrency group).
-- Comment from old `tachometer-reporter-action@v2` still renders
- correctly across multiple artifacts.
-
-**Merge.** Clean rollback available (revert the workflow file).
-
-#### PR B — Suite rationalization + knob tuning
-
-**Pre-work (before branching)**
-
-Pull CI widths from the last ~10 `perf/native` runs; eyeball what
-tachometer actually resolves at on GHA. If ≥90% converge below ±1%,
-ship `["1%"]`. Otherwise ship `["2%"]`. This is a ~15-min gh-api
-script, no commits.
-
-**Dev**
-
-1. Branch off main (which now has PR A).
-2. Write the four new story-driven configs
- (`rendering-throughput.json`, `reactivity.json`,
- `structural-changes.json`, `hydration.json`), matching HTML
- fixtures, delete the three old `tachometer-ci*.json` files.
-3. Add the three new benches (`wake-count-single-key`,
- `nested-mutation`, `hydrate-1000-card`). Also add the six
- micro-bench configs.
-4. Push, open PR.
-
-**Confirm before merge**
-
-- Matrix now spawns one job per new config + one per micro — check
- the list matches the plan.
-- Each job completes within the 2-min auto-sample cap.
-- Wall-clock target (~5-7 min on slowest) hit.
-- No metric hits the old `unsure 🔍 -0% - +0%` pattern. If any do,
- the noise floor you chose is too tight — bump to `["2%"]`.
-- Old reporter renders the new configs — sanity-check the comment
- reads correctly even though it's the last time we'll see it.
-
-**Merge.** Revert is clean (restore old configs, keep matrix).
-
-#### PR C — Reporter replacement
-
-This is the one that can't be validated inline. Build confidence
-*before* opening the PR, not during review.
-
-**Pre-work (substantial, no PR open yet)**
-
-1. `gh run download` artifacts from ~20 past runs across different
- branches → check into `reporter/fixtures/`.
-2. Write the reporter as a ≤300-LOC Node script. Unit tests snapshot
- its markdown + JSON output per fixture. Iterate locally until
- every fixture renders right.
-3. **Shadow-mode check**: render last ~10 merged PRs with the new
- reporter, diff against the old posted comments. Investigate every
- disagreement — new logic wrong, or old logic was misleading. Both
- are findings.
-4. (Optional, high confidence) Push reporter to your fork's main,
- open a throwaway PR on the fork, watch the full `workflow_run`
- cycle end-to-end on infrastructure you don't care about.
-5. Rehearse the revert locally on a staging branch. Confirm
-   `git revert <sha>` restores the old reporter cleanly.
-
-**Dev**
-
-1. Branch off main. Add the reporter script, fixtures, tests.
- Replace `benchmarks-report.yml` to call the script instead of the
- action. Add the post-merge `bench-history.json` append workflow.
-2. Push, open PR. **The PR's own benchmark comment will still be
- rendered by the OLD reporter** (because `workflow_run` uses main's
- copy) — so the PR comment is not a test of anything.
-
-**Confirm before merge** (not in the PR's comment — elsewhere)
-
-- CI runs the Node unit test suite (add it to `.github/workflows/ci.yml`
- if not already) — all fixture snapshots pass.
-- Shadow-mode output reviewed.
-- Fork dry-run (if you did it) was clean.
-- Revert commit drafted and ready to push.
-
-**Merge** — at a quiet window, no active perf PRs in flight.
-
-**Post-merge acceptance test** (the actual live validation)
-
-1. Immediately open a trivial follow-up PR — one-character change to
- any `packages/**` file — to trigger `benchmarks.yml`.
-2. The first real run exercises the new reporter (main now has it).
-3. Read the rendered comment critically. If wrong → push the
- prepared revert within minutes.
-4. If right → monitor the next ~5 real perf PRs for subtle issues
- that only show on real data.
-
-### Testing strategy
-
-PR A and PR B are `pull_request`-triggered and validate inline — push
-the branch, watch the run, inspect the output. Standard iterative
-development; nothing special to call out.
-
-PR C is the hard case. `workflow_run` uses main's workflow copy, so
-the new reporter cannot execute on its own PR. Strategy for landing
-PR C with confidence despite this constraint:
-
-**1. Offline fixtures + unit tests (mandatory, ships in PR C).**
-Pull `results/*.json` from ~20 past runs across multiple branches
-with `gh run download`, check them into `reporter/fixtures/`. Write
-a Node test suite that feeds each fixture to the reporter and snapshots
-the markdown + JSON output. Cover the edge cases explicitly:
-
-- all-green run (no regressions, no unsure)
-- mixed run (regressions + unsure + wins)
-- partial failure (one suite's artifact is missing)
-- empty run (tachometer timed out, zero results)
-- malformed JSON (one suite's output is corrupt)
-- first run on a branch (no peak history yet)
-- force-pushed branch (SHA not in the current linear history)
-
-Snapshot tests catch the majority of logic bugs offline — status
-classification (WIN / TIED-PEAK / REOPENED / UNEXPLORED / UNSURE),
-CI-overlap math, commit-impact ranking, bisect-candidate selection.
-
-**2. Shadow-mode validation against historical runs.** Before landing
-PR C, run the new reporter offline against the last ~10 merged PRs'
-artifact sets. For each, read the rendered output and compare to the
-old reporter's posted comment. Confirm the new rendering surfaces the
-same wins/regressions the old one flagged, plus the new dimensions
-(peak attribution, REOPENED classification, commit-impact ranking).
-If shadow-mode disagrees with the old reporter on a known-outcome PR,
-the disagreement itself is diagnostic — either the new logic is wrong
-or the old reporter was misleading. Either way, investigate before
-merging.
-
-**3. Optional: fork dry-run for true end-to-end validation.** Push the
-new reporter to a personal fork's main, open a throwaway PR against
-that fork that touches `packages/**` to trigger `benchmarks.yml`.
-Because the fork's main has the new reporter, `workflow_run` there
-uses it. This is the only way to exercise the full
-`pull_request → workflow_run → comment` cycle without landing on
-upstream main. Worth the ~15 min of setup if confidence from #1 and
-#2 isn't quite enough.
-
-**4. Land on main with caution.** Steps at landing time:
-
-- Merge at a quiet window (no active perf PRs queued).
-- Immediately open the acceptance-test PR (trivial `packages/**`
- change) — the first real run is the live validation.
-- Watch the full pipeline: matrix spawns, suites complete, collector
- edits the comment, JSON adjunct uploads, post-merge history commit
- fires on the next merge.
-- Keep the revert commit prepared and ready to push. Target revert
- latency is minutes, not hours.
-
-**5. Post-landing monitor.** For the first ~5 PRs after landing PR C,
-spot-check each comment against expectations. Agents doing
-autoresearch in that window should explicitly flag any reading that
-seems wrong — "this metric shows WIN but the number is higher than
-branch-start" kind of discrepancies. Early noise catches late bugs.
-
-### Rollback plan
-
-Staged landing means granular rollback:
-
-- PR A bad → revert A, keep old serial workflow. Easy.
-- PR B bad → revert B, keep new matrix. Old suite runs in parallel.
-- PR C bad → revert C, keep new suite + matrix. Old reporter renders
- new configs (it's format-agnostic on the tachometer JSON side).
-- Structured JSON is the contract inside PR C — markdown rendering
- bugs don't compromise the underlying data; fix-forward is usually
- preferable to revert for cosmetic issues.
-
-**Revert rehearsal.** Before merging PR C, dry-run the revert
-locally: `git revert ` on a staging branch, confirm the diff
-restores the old action config cleanly, close the staging branch.
-That way the production revert (if needed) is muscle memory, not
-discovery.
-
-## Open questions
-
-- **Wake-count instrumentation path.** Tachometer measures timing;
- asserting on a count requires either (a) emitting the count as a
- timing delta via `setTimeout(0, count * 1ms)` (hack, measurable but
- ugly), or (b) extending tachometer with a custom `measurement` type.
- (b) is correct but is work.
-- **Nested-mutation setup contract.** Reuse nested objects across
- mutations (measures `isEqual` path) or freshly spread each time
- (measures allocation path)? Both interesting; probably two benches
- under a common umbrella.
-- **Hydration bench baseline.** `/perf/hydrated` currently runs against
- whatever DOM the server produced. Stable comparison needs a
- snapshotted HTML input that doesn't change when renderer output
- format changes. Probably a fixture-HTML-per-commit approach rebuilt
- on each run.
-- **Bisect candidate heuristic.** "Commits after peak that touched
- likely files" requires a file→metric mapping we don't have. Start
- naive ("all commits between peak and HEAD"); refine if the signal
- is too noisy to act on. Even the naive version is more than the
- current reporter offers.
-- **Attempts-graveyard detection.** Reverts are mechanical
- (`git log --grep '^Revert'`). Other abandonment patterns (commit X
- landed, commit Y silently supersedes) aren't. Start with reverts;
- add an explicit `[abandoned]` message tag if supersede cases matter.
-- **Branch-start detection.** "First commit on this branch" is
- `merge-base(HEAD, main)` + 1, but PRs with force-pushes or rebases
- scramble that. Default: anchor to the first `Benchmarks` run's SHA
- (whatever commit produced the first `results/*.json` artifact
- visible on this PR's runs). Fallback if that breaks down: an
- explicit `.bench-baseline` dotfile at the repo root with a
- `BASELINE_SHA=...` line, committed once at branch start. Ship with
- the run-based default, add the dotfile mechanism only if agents
- start fighting the detection.
-- **Hosting the JSON adjunct for humans.** Workflow artifacts are
- authenticated behind GitHub login. Agents with `gh` auth can fetch;
- anonymous readers of a public PR cannot. Acceptable for now (public
- repo, few anonymous readers of perf JSON); flag if ever a blocker.
-
-## Not in scope
-
-- Memory profiling (separate tool — Chrome heap snapshots, not tachometer).
-- Framework-vs-framework comparison (Krausest comparison suite; out of
- scope for internal regression tracking).
-- SSR server-side render timing alone (covered by server-side benchmarks
- in a separate runner).
-- Historical backfill of pre-existing PR comments into the new format.
- Old PRs keep their old comments; new PRs use new reporter.
diff --git a/ai/skills/contributing/author-pull-requests.md b/ai/skills/contributing/author-pull-requests.md
index 9ac16bad7..74b26104b 100644
--- a/ai/skills/contributing/author-pull-requests.md
+++ b/ai/skills/contributing/author-pull-requests.md
@@ -20,9 +20,11 @@ type: skill
## Why this matters
-AI-written prose has a tell: it's trying to convince the reader the work was thorough and correct. Humans spot it instantly — and stop trusting the writer.
+PR descriptions on this repo are public. They're read by community users, downstream consumers, and contributors browsing GitHub. Write them as a public changelog entry for an open source project, in the tone of a colleague leaving a quick note.
-Good PR descriptions read like a colleague leaving a quick note. Matter-of-fact. State what's now true; don't argue for it. The diff is the evidence — your text gives orientation.
+Public changelog tone is matter-of-fact. State what changed and why a consumer should care. Don't argue. Don't perform thoroughness. Don't assume the reader has been following along.
+
+AI-written prose gives itself away immediately because it's trying to convince the reader the work was thorough and correct. Humans spot it. The diff is the evidence. Your text gives orientation.
If you catch yourself writing to demonstrate thoroughness or pre-empt skepticism, stop. Cut the offending text and go back to plain description.
@@ -30,11 +32,11 @@ If you catch yourself writing to demonstrate thoroughness or pre-empt skepticism
## What the reader actually wants
-The reviewer opens GitHub, glances at the title, skims the body, then reads the diff. Your text gives them just enough to:
+The reader opens GitHub, glances at the title, skims the body, then reads the diff if they're a reviewer. Your text gives them just enough to:
1. Know what kind of change this is — **the title**
-2. Know why it exists as a unit — **one framing sentence**
-3. Know what's now true after merge — **3–5 outcome bullets**
+2. Know why it exists as a unit — **the framing**
+3. Know what's now true after merge — **outcome bullets**
That's the entire job. Anything beyond is noise that erodes trust.
@@ -101,12 +103,13 @@ That's the whole body. No section headers beyond `## Changes`. No risk score. No
### Medium tier
```markdown
-[One sentence — why this PR exists.]
+[Framing. One or more sentences. Methodology fixes often need a paragraph
+or two to explain what was wrong before what's fixed. State the problem
+in plain language a downstream consumer can act on.]
## Changes
- [Outcome bullet]
- [Outcome bullet]
-- [Outcome bullet]
## Risk
N/10 — [one-line reason].
@@ -116,13 +119,17 @@ Failure modes: [bulleted list — only when score ≥ 5 or blast radius is non-o
- [Deviations from standard only. Skip "rerun tests" / "CI passes" — those are assumed.]
```
-**Word target.** Medium-tier bodies usually land 80–150 words. Past 200 is a sign you're restating the diff. After drafting a body that feels complete, expect to cut roughly half — the AI default is about twice the length humans actually write.
+**Framing length.** As many sentences as the change genuinely needs to explain itself. Methodology bugs typically take 2–3 paragraphs (what was wrong, why it was wrong, what's fixed). Refactors take one. The test is whether removing a sentence loses information a downstream consumer needs. If not, cut.
+
+**Bullet count.** As many bullets as there are distinct outcomes. Two outcomes means two bullets. Don't pad to reach a target. Don't compress past clarity.
+
+**Word target.** Most bodies land 80–200 words. Past 300 is usually restating the diff. After drafting, expect to cut roughly a third. The AI default is twice the length humans actually write.
### Large tier
Same as Medium, plus:
-- If plan-driven, lead the framing sentence with `Implements [plan name](permalink-at-PR-creation-SHA)`. Get the SHA via `git log -1 --format=%H ai/plans/foo.md` and form `https://github.com/Semantic-Org/Semantic-Next/blob//ai/plans/foo.md`.
+- If plan-driven, open with `Implements [plan name](permalink-at-PR-creation-SHA).` on its own line, then a paragraph break, then the framing. Don't fold the plan link into the same sentence as the framing. Separation makes both surfaces easier to scan. Get the SHA via `git log -1 --format=%H ai/plans/foo.md` and form `https://github.com/Semantic-Org/Semantic-Next/blob/<sha>/ai/plans/foo.md`.
- `## Risk` failure-modes list is mandatory.
- Body may be longer, but bullets still describe outcomes, not mechanisms.
@@ -183,14 +190,24 @@ The labels match the natural axes a reviewer scans by — what surface area they
### Bullet shape
-Bullets should be noun phrases or short verb phrases. Most under 10 words. Full sentences in bullets is an AI tell.
+Bullets describe state in plain language. Either short noun phrases or full sentences in present tense work. The AI tell isn't full sentences per se. It's *corporate-prose* sentences ("This update enhances...", "This change provides...", "We have added..."). The smell test: would you text this to the reviewer?
```
+❌ "This change enables the system to remove the throwaway profiling artifacts."
❌ "Remove the throwaway profiling and screenshot artifacts from the repo root."
✅ "Remove root profiling/screenshot artifacts"
+✅ "Test runner now shows drift if main has changed significantly between iterations"
```
-**Pick one voice across all bullets.** Don't mix imperative ("Remove X"), declarative state ("X removed"), and active past with for-clause ("Dropped X for Y"). Mixed tenses read AI-shaped — no human drifts mid-list. Active past with "for X" is punchy and intent-bearing for bug fixes and refactors.
+**Pick one voice across all bullets.** Don't mix imperative ("Remove X"), declarative state ("X removed"), and active past with for-clause ("Dropped X for Y"). Mixed tenses read AI-shaped. No human drifts mid-list.
+
+### Italics for contrast pairs
+
+Use `*X* vs *Y*` when the body needs the reader's eye to land on a comparison. Sparing. Once or twice per body, not a stylistic mannerism.
+
+```
+✅ "Helps determine whether regressions represent *changes in the PR* versus *changes in main*"
+```
---
@@ -461,13 +478,13 @@ In order:
7. **Cut scaffolding bullets.** If a bullet states the obvious consequence of bullets above it ("update path references to match"), drop it.
8. **Trim framing sentence tails.** "so that…", "plus the X that follows", "in order to…" — usually padding.
9. **Search for AI tells (words).** Look for: *verified, ensured, considered, note that, important to flag, in summary, this PR introduces, all tests pass, fully tested*.
-10. **Search for AI tells (punctuation).** Three patterns. Single instances are fine. Clusters or stylized uses are tells.
- - **Paired em-dashes as parentheses** (`text — like this — text`). Single em-dashes are fine. Pairs used as brackets are AI-shaped — use real parens or split into two sentences.
+10. **Search for AI tells (punctuation).** Default to periods.
+ - **Semicolons.** Any semicolon in a body is an AI tell. Most rewrite cleanly to a period + new sentence.
+ - **Paired em-dashes as parentheses** (`text — like this — text`). AI-shaped. Use real parens or split into two sentences. Single em-dashes are fine but most rewrite to a period + new sentence.
- **Explanatory colons** (`X was the canonical repro: helpers reading...`). Almost always splits cleanly into two sentences. The colon makes prose read like a writeup.
- - **Semicolon clusters.** Three or more in one body is a tell. Most rewrite to periods.
11. **Check tier appropriateness.** Did you reach for Medium/Large machinery on a Small PR? If yes, drop them.
12. **Voice check — read each bullet aloud.** Imagine you're texting it to the reviewer. Does it sound like a developer in a hurry, or like a press release? If the latter, rewrite. Specific tells: bullets that start with `Let X...`/`Stop Y...`/`Wire Z...` (verb-first mechanism), bullets that mention line numbers or internal field names, bullets longer than the corresponding commit message subject.
-13. **Honest question.** If a colleague wrote this PR and pinged you, would the body sound like them, or like a corporate document? If the latter, you're still in AI-prose mode.
+13. **Public changelog gut-check.** This body lives on GitHub as a public record. Read it imagining a community user encountering it cold via the project's release notes. Does it stand alone? Or does it read as internal back-and-forth assuming the reader has been following along? If the latter, rewrite for the external audience.
---
diff --git a/ai/skills/contributing/code-review.md b/ai/skills/contributing/code-review.md
index 29dbb0a27..54a389b57 100644
--- a/ai/skills/contributing/code-review.md
+++ b/ai/skills/contributing/code-review.md
@@ -37,6 +37,8 @@ Each agent runs `gh pr diff {number}` and examines the changes through its lens.
- **What's wrong** — one sentence
- **Why it matters** — impact, what breaks, or which standard it violates
+**Group instances of the same pattern.** When you find N>1 occurrences of the same root issue (same root cause, same citation, same fix shape), return them as a *single grouped finding* listing every file/line, not N separate findings. Example: 22 comments across the diff that all narrate a migration → one grouped "migration-narration" finding listing all 22 sites. The scoring stage applies the rubric per-finding; grouping caps scorer count at unique-pattern-count rather than instance-count, while preserving fresh-cold-read tamper-safety on each pattern. Don't group across distinct citations or distinct fix shapes — those are different judgments.
+
Lens agents **do not score their own findings.** Scoring is a separate stage with separate agents — see Handling Results.
If an agent finds no issues, it returns "No issues found" with a brief summary of what it verified was clean.
@@ -93,9 +95,11 @@ Three lenses on the diff under a high-confidence bar. Comments are out of scope
### Agent 6 — Performance Review
Anchor: `ai/skills/workflows/contributing/improve-performance.md`. Tachometer is the committed source of truth for performance in this repo; CI posts a bench reporter comment on the PR with per-metric verdicts (faster / slower / no change / unsure) and an `Expected Noise` column.
+**Step 0 — Applicability check.** The benchmarks workflow runs only when a PR touches paths in its `paths:` filter (currently `packages/**` or `.github/workflows/benchmarks.yml`). Run `gh pr diff {number} --name-only` first. If none of the changed files match those paths, abort immediately with: *"PR scope doesn't trigger benchmarks (no `packages/**` or `benchmarks.yml` changes). Performance review N/A."* Don't fetch comments, don't speculate — the workflow won't have run.
+
**Step 1 — Find the tachometer comment.** Use `gh pr view {number} --comments` or `gh api repos/{owner}/{repo}/issues/{number}/comments`. The bench reporter's comment is recognizable by the per-metric verdict table.
-**If absent — abort.** Return: *"Tachometer hasn't run yet on this PR. Rerun Agent 6 in a separate pass once CI catches up."* Don't speculate about performance without data.
+**If absent (despite applicable scope) — abort.** Return: *"Tachometer hasn't run yet on this PR. Rerun Agent 6 in a separate pass once CI catches up."* Don't speculate about performance without data.
**If present — investigate each regression honestly.** For every metric flagged "slower," judge:
@@ -116,12 +120,14 @@ The 6 lens agents return findings *without* confidence scores. Scoring is a sepa
Before spawning scoring agents, state in the conversation:
-> "Launching N parallel Opus scoring agents — one per finding."
+> "Launching N parallel Opus scoring agents — one per finding (grouped findings count as one)."
**This is not optional, and not a checkbox to game.** Orchestrators on round 3+ of an iteration loop tend toward fatigue and skip the scoring stage — scoring findings themselves to save effort, hoping the user won't notice. The announcement is the artifact that makes skipping observable. If you're tempted to skip on round 3 (or 4, or 5), that's exactly the moment when the rigor matters most. Stop, announce, then launch.
### Scoring agents (Opus, one per finding, parallel)
+Lens agents already consolidate same-pattern instances into grouped findings (see Lens agent output format). Each grouped finding gets one scorer. A lens that returns 22 same-pattern instances as one grouped finding gets one scorer; a lens that returns 5 distinct findings gets five.
+
For each finding from any lens agent, launch a fresh **Opus** agent (always Opus — Haiku is too fast to verify standards citations) that receives:
- The full PR diff
diff --git a/ai/skills/contributing/manage-roadmap.md b/ai/skills/contributing/manage-roadmap.md
index 8dbc95546..4f7515425 100644
--- a/ai/skills/contributing/manage-roadmap.md
+++ b/ai/skills/contributing/manage-roadmap.md
@@ -210,7 +210,7 @@ Work happens on a feature branch, committed incrementally, merged via PR.
```
If the PR closes without merging, reverse all of the above. If the PR merges and the plan is complete, follow the archive flow below.
-8. **Self-review the PR** using the `contributing/code-review` skill. Run 5 parallel agents, fix findings, rerun until clean. See the skill for the full process — it covers agent lenses, scoring rubric, iterative loop, and what counts as a false positive.
+8. **Self-review the PR** using the `contributing/code-review` skill — it owns the full process (lens agents, scoring, iterative loop, false-positive rules). Fix findings, rerun until clean.
9. **Post-merge verification** (when applicable). Only relevant for work that affects live infrastructure — CI pipelines, CDN endpoints, MCP deploys, etc. After the user merges and CI runs, verify the live endpoints behave correctly. Not needed for pure source changes.
### When to branch vs. commit to main
diff --git a/tools/ci/bench/reporter/append-history.js b/tools/ci/bench/reporter/append-history.js
index 5f0bb216a..c1b47a8a9 100644
--- a/tools/ci/bench/reporter/append-history.js
+++ b/tools/ci/bench/reporter/append-history.js
@@ -3,9 +3,9 @@
Append a commit's bench measurements to bench-history.json.
Invoked from benchmarks-report.yml when a bench run was triggered by a
- push to main (not a pull_request). Reads per-metric absolute CIs from
+ push to main (not a pull_request). Reads per-metric CIs from the
tachometer JSON output and appends a new entry to the history file,
- which is then committed back to main by the workflow.
+ which the workflow then commits back to main.
Usage:
node append-history.js \
@@ -13,29 +13,33 @@
--sha commit SHA being archived
--msg commit subject line
--parent-sha parent commit SHA
+ --baseline-sha SHA the bench was compared against. Pinned per
+ metric onto entries that have a percent_delta_ci.
--timestamp commit timestamp (ISO 8601); defaults to now
--history bench-history.json path (default: ./bench-history.json — relative to CWD)
+ Schema: v2. The reader rejects anything else with a reset instruction.
+
Idempotency: if the same SHA already exists in the file, the entry is
replaced. Lets a re-run of the workflow (e.g. after a flaky first pass)
overwrite rather than duplicate.
*/
import fs from 'node:fs';
-import path from 'node:path';
+import { loadHistoryMetrics } from './extract-metrics.js';
const args = parseArgs(process.argv.slice(2));
const resultsDir = required(args, 'results');
const sha = required(args, 'sha');
const msg = args.msg ?? '';
const parentSha = args['parent-sha'] ?? '';
+const baselineSha = args['baseline-sha'] ?? '';
const timestamp = args.timestamp ?? new Date().toISOString();
const historyPath = args.history ?? './bench-history.json';
-const metrics = loadMetrics(resultsDir);
-// Squash-merge commit titles end with ` (#N)` — the PR the commit came from.
-// Capturing it here lets the reporter link peak SHAs straight to the PR page
-// (where the bench comment lives) instead of just to the commit view.
+const metrics = loadHistoryMetrics(resultsDir, baselineSha);
+// Squash-merge commit titles end with ` (#N)` — capture so peak SHAs link
+// straight to the PR conversation page rather than the commit view.
const prMatch = /\(#(\d+)\)\s*$/.exec(msg);
const pr = prMatch ? Number(prMatch[1]) : null;
const entry = { sha, msg, parent_sha: parentSha, timestamp, pr, metrics };
@@ -55,60 +59,24 @@ fs.writeFileSync(historyPath, JSON.stringify(history, null, 2) + '\n');
console.log(`${history.commits.length} total commit${history.commits.length === 1 ? '' : 's'} in history`);
console.log(`Metrics recorded: ${Object.keys(metrics).length}`);
-/**
- * Walk the results directory and extract one { ci, mean_ms } entry per
- * metric. Uses the `this-change` absolute CI — that's the value that
- * indexes by SHA cleanly across time (the delta-vs-base comparison used
- * in PR comments is not meaningful across commits).
- */
-function loadMetrics(dir) {
- const out = {};
- for (const entry of walk(dir)) {
- if (!entry.endsWith('.json')) { continue; }
- const data = JSON.parse(fs.readFileSync(entry, 'utf8'));
- if (!Array.isArray(data.benchmarks)) { continue; }
-
- for (const bm of data.benchmarks) {
- const source = (bm.name ?? '').split(' [')[0];
- if (source !== 'this-change') { continue; }
- const metricName = bm.measurement?.name ?? bm.name;
- if (!bm.mean) { continue; }
- out[metricName] = {
- ci: [round4(bm.mean.low), round4(bm.mean.high)],
- mean_ms: round4((bm.mean.low + bm.mean.high) / 2),
- };
- }
- }
- return out;
-}
-
function readOrSeedHistory(filePath) {
if (!fs.existsSync(filePath)) {
- return { schema_version: 1, commits: [] };
+ return { schema_version: 2, commits: [] };
}
const raw = fs.readFileSync(filePath, 'utf8');
const parsed = JSON.parse(raw);
if (!Array.isArray(parsed.commits)) {
throw new Error(`Invalid history file: missing or non-array 'commits' field`);
}
- if (parsed.schema_version !== 1) {
- throw new Error(`Unsupported schema_version ${parsed.schema_version}; expected 1`);
+ if (parsed.schema_version !== 2) {
+ throw new Error(
+ `Unsupported schema_version ${parsed.schema_version}; expected 2. `
+ + `Reset the file with {"schema_version": 2, "commits": []} to migrate.`,
+ );
}
return parsed;
}
-function round4(n) {
- return Number(n.toFixed(4));
-}
-
-function* walk(dir) {
- for (const ent of fs.readdirSync(dir, { withFileTypes: true })) {
- const full = path.join(dir, ent.name);
- if (ent.isDirectory()) { yield* walk(full); }
- else { yield full; }
- }
-}
-
function parseArgs(argv) {
const out = {};
for (let i = 0; i < argv.length; i++) {
diff --git a/tools/ci/bench/reporter/append-history.test.js b/tools/ci/bench/reporter/append-history.test.js
index 3f5dcc431..f7778aff0 100644
--- a/tools/ci/bench/reporter/append-history.test.js
+++ b/tools/ci/bench/reporter/append-history.test.js
@@ -14,7 +14,15 @@ const __dirname = path.dirname(fileURLToPath(import.meta.url));
const SCRIPT = path.join(__dirname, 'append-history.js');
const FIXTURE_DIR = path.join(__dirname, 'fixtures', 'real-delta');
-function runAppend({ sha, msg = 'test', parentSha = '', timestamp, historyPath, resultsDir = FIXTURE_DIR }) {
+function runAppend({
+ sha,
+ msg = 'test',
+ parentSha = '',
+ baselineSha = '',
+ timestamp,
+ historyPath,
+ resultsDir = FIXTURE_DIR,
+}) {
const argv = [
SCRIPT,
'--results',
@@ -28,34 +36,39 @@ function runAppend({ sha, msg = 'test', parentSha = '', timestamp, historyPath,
'--history',
historyPath,
];
+ if (baselineSha) { argv.push('--baseline-sha', baselineSha); }
if (timestamp) { argv.push('--timestamp', timestamp); }
// Capture stderr so throw-cases can assert on the inner error message.
execFileSync('node', argv, { stdio: ['ignore', 'pipe', 'pipe'], encoding: 'utf8' });
return JSON.parse(fs.readFileSync(historyPath, 'utf8'));
}
-test('seeds a new history file when one does not exist', () => {
+function seedV2(historyPath, commits = []) {
+ fs.writeFileSync(historyPath, JSON.stringify({ schema_version: 2, commits }));
+}
+
+test('seeds a new history file when one does not exist (schema_version 2)', () => {
const tmp = fs.mkdtempSync('/tmp/bench-hist-test-');
const historyPath = path.join(tmp, 'bench-history.json');
const result = runAppend({ sha: 'abc123', historyPath, timestamp: '2026-04-15T00:00:00Z' });
- assert.equal(result.schema_version, 1);
+ assert.equal(result.schema_version, 2, 'writes v2 schema');
assert.equal(result.commits.length, 1);
assert.equal(result.commits[0].sha, 'abc123');
assert.equal(result.commits[0].timestamp, '2026-04-15T00:00:00Z');
assert.ok(Object.keys(result.commits[0].metrics).length > 0, 'metrics extracted');
});
-test('appends to an existing history', () => {
+test('appends to an existing v2 history', () => {
const tmp = fs.mkdtempSync('/tmp/bench-hist-test-');
const historyPath = path.join(tmp, 'bench-history.json');
- fs.writeFileSync(
- historyPath,
- JSON.stringify({
- schema_version: 1,
- commits: [{ sha: 'existing-commit', msg: 'old', parent_sha: '', timestamp: '2026-01-01T00:00:00Z', metrics: {} }],
- }),
- );
+ seedV2(historyPath, [{
+ sha: 'existing-commit',
+ msg: 'old',
+ parent_sha: '',
+ timestamp: '2026-01-01T00:00:00Z',
+ metrics: {},
+ }]);
const result = runAppend({ sha: 'new-commit', historyPath });
assert.equal(result.commits.length, 2);
@@ -67,19 +80,13 @@ test('replaces an existing entry with matching SHA (idempotent re-run)', () => {
const tmp = fs.mkdtempSync('/tmp/bench-hist-test-');
const historyPath = path.join(tmp, 'bench-history.json');
// Seed with an older entry for the same SHA we're about to write
- fs.writeFileSync(
- historyPath,
- JSON.stringify({
- schema_version: 1,
- commits: [{
- sha: 'target-sha',
- msg: 'stale',
- parent_sha: '',
- timestamp: '2000-01-01T00:00:00Z',
- metrics: { old: { ci: [1, 2], mean_ms: 1.5 } },
- }],
- }),
- );
+ seedV2(historyPath, [{
+ sha: 'target-sha',
+ msg: 'stale',
+ parent_sha: '',
+ timestamp: '2000-01-01T00:00:00Z',
+ metrics: { old: { ci: [1, 2], mean_ms: 1.5 } },
+ }]);
const result = runAppend({ sha: 'target-sha', msg: 'fresh', historyPath, timestamp: '2026-04-15T00:00:00Z' });
assert.equal(result.commits.length, 1, 'no duplicate entries');
@@ -88,22 +95,42 @@ test('replaces an existing entry with matching SHA (idempotent re-run)', () => {
assert.ok(!('old' in result.commits[0].metrics), 'stale metrics replaced');
});
-test('extracts this-change absolute CIs only (not tip-of-tree)', () => {
+test('extracts both absolute CI and percent-delta from differences[]', () => {
const tmp = fs.mkdtempSync('/tmp/bench-hist-test-');
const historyPath = path.join(tmp, 'bench-history.json');
const result = runAppend({ sha: 'abc', historyPath });
const metrics = result.commits[0].metrics;
- // The real-delta fixture has known this-change means for these metrics
- // (update-10th, toggle-middle are the biggest movers in that run).
+ // The real-delta fixture pairs this-change + tip-of-tree benchmarks for
+ // each metric, so percent_delta_ci is extractable.
assert.ok('update-10th' in metrics);
assert.ok('toggle-middle' in metrics);
- assert.ok(Array.isArray(metrics['update-10th'].ci));
+ assert.ok(Array.isArray(metrics['update-10th'].ci), 'absolute CI persisted');
assert.equal(metrics['update-10th'].ci.length, 2);
assert.equal(typeof metrics['update-10th'].mean_ms, 'number');
- // Mean is the midpoint of the CI
+ // mean is the midpoint of the absolute CI
const { ci, mean_ms } = metrics['update-10th'];
assert.ok(Math.abs(mean_ms - (ci[0] + ci[1]) / 2) < 0.01, 'mean is CI midpoint');
+ assert.ok(Array.isArray(metrics['update-10th'].percent_delta_ci), 'percent_delta_ci persisted');
+ assert.equal(metrics['update-10th'].percent_delta_ci.length, 2);
+});
+
+test('applies --baseline-sha to entries with percent-delta', () => {
+ const tmp = fs.mkdtempSync('/tmp/bench-hist-test-');
+ const historyPath = path.join(tmp, 'bench-history.json');
+ const result = runAppend({ sha: 'abc', historyPath, baselineSha: 'parent-tip-1234567890' });
+
+ const updateTenth = result.commits[0].metrics['update-10th'];
+ assert.equal(updateTenth.baseline_sha, 'parent-tip-1234567890', 'baseline_sha attached');
+});
+
+test('omits baseline_sha when --baseline-sha not passed', () => {
+ const tmp = fs.mkdtempSync('/tmp/bench-hist-test-');
+ const historyPath = path.join(tmp, 'bench-history.json');
+ const result = runAppend({ sha: 'abc', historyPath });
+
+ const updateTenth = result.commits[0].metrics['update-10th'];
+ assert.ok(!('baseline_sha' in updateTenth), 'no baseline_sha when flag absent');
});
test('records parent_sha when provided', () => {
@@ -136,6 +163,21 @@ test('pr is null when commit message has no PR reference', () => {
assert.equal(result.commits[0].pr, null);
});
+test('rejects v1 schema with reset instruction', () => {
+ const tmp = fs.mkdtempSync('/tmp/bench-hist-test-');
+ const historyPath = path.join(tmp, 'bench-history.json');
+ fs.writeFileSync(historyPath, JSON.stringify({ schema_version: 1, commits: [] }));
+ try {
+ runAppend({ sha: 'x', historyPath });
+ assert.fail('expected script to exit non-zero');
+ }
+ catch (e) {
+ const combined = (e.stderr ?? '') + (e.message ?? '');
+ assert.match(combined, /Unsupported schema_version 1/);
+ assert.match(combined, /Reset the file/, 'error explains how to recover');
+ }
+});
+
test('rejects an unsupported schema version', () => {
const tmp = fs.mkdtempSync('/tmp/bench-hist-test-');
const historyPath = path.join(tmp, 'bench-history.json');
diff --git a/tools/ci/bench/reporter/bench-history.json b/tools/ci/bench/reporter/bench-history.json
index fe3a9ff4a..519c3e110 100644
--- a/tools/ci/bench/reporter/bench-history.json
+++ b/tools/ci/bench/reporter/bench-history.json
@@ -1,2452 +1,4 @@
{
- "schema_version": 1,
- "commits": [
- {
- "sha": "9a9db17f7df9607795b07320a1d326d284458a61",
- "msg": "Bench: Harden against PR-authored gaming and amplify short benches (#…",
- "parent_sha": "5073f4ef5706ac96d3c2d2dbf9fd4755d0825b17",
- "timestamp": "2026-04-18T04:27:10Z",
- "pr": null,
- "metrics": {
- "create-1k": {
- "ci": [
- 132.0894,
- 133.6734
- ],
- "mean_ms": 132.8814
- },
- "create-10k": {
- "ci": [
- 1129.326,
- 1133.994
- ],
- "mean_ms": 1131.66
- },
- "replace-1k": {
- "ci": [
- 105.0825,
- 106.3689
- ],
- "mean_ms": 105.7257
- },
- "append-1k": {
- "ci": [
- 117.123,
- 120.5541
- ],
- "mean_ms": 118.8386
- },
- "update-10th-10": {
- "ci": [
- 175.9073,
- 179.6984
- ],
- "mean_ms": 177.8029
- },
- "select-40": {
- "ci": [
- 720.8028,
- 730.7715
- ],
- "mean_ms": 725.7871
- },
- "swap-rows-20": {
- "ci": [
- 973.9431,
- 981.1683
- ],
- "mean_ms": 977.5557
- },
- "remove-row-front-20": {
- "ci": [
- 666.3791,
- 673.9952
- ],
- "mean_ms": 670.1871
- },
- "remove-row-middle-20": {
- "ci": [
- 366.2978,
- 372.8794
- ],
- "mean_ms": 369.5886
- },
- "remove-row-back-10": {
- "ci": [
- 155.6664,
- 156.885
- ],
- "mean_ms": 156.2757
- },
- "clear-10k": {
- "ci": [
- 154.6596,
- 160.689
- ],
- "mean_ms": 157.6743
- },
- "signal-reactive-fanout-500x1200": {
- "ci": [
- 71.1405,
- 72.4395
- ],
- "mean_ms": 71.79
- },
- "signal-computed-chain-10x60k": {
- "ci": [
- 160.073,
- 161.6287
- ],
- "mean_ms": 160.8508
- },
- "signal-reactive-multi-read-5x160k": {
- "ci": [
- 161.6434,
- 163.4849
- ],
- "mean_ms": 162.5642
- },
- "signal-reactive-list-replace-1000x1000": {
- "ci": [
- 247.1402,
- 249.1565
- ],
- "mean_ms": 248.1483
- },
- "signal-reactive-list-filter-1000x300": {
- "ci": [
- 87.2396,
- 89.0854
- ],
- "mean_ms": 88.1625
- },
- "signal-reactive-push-2000x20": {
- "ci": [
- 170.415,
- 173.7433
- ],
- "mean_ms": 172.0792
- },
- "signal-reactive-set-index-300": {
- "ci": [
- 79.9497,
- 80.772
- ],
- "mean_ms": 80.3608
- },
- "signal-reactive-set-property-by-id-200": {
- "ci": [
- 158.7703,
- 160.2547
- ],
- "mean_ms": 159.5125
- },
- "bulk-add-500": {
- "ci": [
- 230.805,
- 232.355
- ],
- "mean_ms": 231.58
- },
- "add-20": {
- "ci": [
- 332.93,
- 333.1775
- ],
- "mean_ms": 333.0537
- },
- "toggle-10": {
- "ci": [
- 158.9467,
- 159.5583
- ],
- "mean_ms": 159.2525
- },
- "toggle-all-20": {
- "ci": [
- 327.7712,
- 328.3438
- ],
- "mean_ms": 328.0575
- },
- "remove-5-front": {
- "ci": [
- 84.0102,
- 87.2673
- ],
- "mean_ms": 85.6388
- },
- "remove-10-middle": {
- "ci": [
- 157.333,
- 158.722
- ],
- "mean_ms": 158.0275
- },
- "remove-5-back": {
- "ci": [
- 71.6982,
- 74.5968
- ],
- "mean_ms": 73.1475
- },
- "clear-completed-250": {
- "ci": [
- 56.0405,
- 56.757
- ],
- "mean_ms": 56.3988
- },
- "toggle-first-10": {
- "ci": [
- 160.151,
- 160.774
- ],
- "mean_ms": 160.4625
- },
- "toggle-middle-10": {
- "ci": [
- 160.1048,
- 160.8652
- ],
- "mean_ms": 160.485
- },
- "toggle-last-10": {
- "ci": [
- 158.0281,
- 158.6769
- ],
- "mean_ms": 158.3525
- },
- "remove-first-10": {
- "ci": [
- 180.5551,
- 183.3924
- ],
- "mean_ms": 181.9737
- },
- "remove-middle-10": {
- "ci": [
- 160.9466,
- 162.3759
- ],
- "mean_ms": 161.6612
- },
- "remove-last-10": {
- "ci": [
- 161.1027,
- 161.6223
- ],
- "mean_ms": 161.3625
- },
- "filter-cycle-20": {
- "ci": [
- 464.8641,
- 472.4334
- ],
- "mean_ms": 468.6487
- },
- "edit-start-10": {
- "ci": [
- 163.2351,
- 166.6574
- ],
- "mean_ms": 164.9463
- },
- "edit-cycle-5": {
- "ci": [
- 163.3843,
- 164.7382
- ],
- "mean_ms": 164.0612
- }
- }
- },
- {
- "sha": "74c6bd0c362bc7e98dec90485e3400dc13599d36",
- "msg": "Chore: Refresh All Deps to Bleeding Edge (#162)",
- "parent_sha": "74c6bd0c362bc7e98dec90485e3400dc13599d36^",
- "timestamp": "2026-04-28T20:14:45Z",
- "pr": 162,
- "metrics": {
- "create-1k": {
- "ci": [
- 96.0559,
- 97.5784
- ],
- "mean_ms": 96.8171
- },
- "create-10k": {
- "ci": [
- 863.8939,
- 869.0261
- ],
- "mean_ms": 866.46
- },
- "replace-1k": {
- "ci": [
- 77.9862,
- 79.5224
- ],
- "mean_ms": 78.7543
- },
- "append-1k": {
- "ci": [
- 87.6799,
- 91.2973
- ],
- "mean_ms": 89.4886
- },
- "update-10th-10": {
- "ci": [
- 162.1222,
- 166.5892
- ],
- "mean_ms": 164.3557
- },
- "select-40": {
- "ci": [
- 680.4813,
- 687.8359
- ],
- "mean_ms": 684.1586
- },
- "swap-rows-20": {
- "ci": [
- 840.2324,
- 852.0619
- ],
- "mean_ms": 846.1471
- },
- "remove-row-front-20": {
- "ci": [
- 539.9729,
- 549.9757
- ],
- "mean_ms": 544.9743
- },
- "remove-row-middle-20": {
- "ci": [
- 330.4854,
- 334.9831
- ],
- "mean_ms": 332.7343
- },
- "remove-row-back-10": {
- "ci": [
- 159.3841,
- 162.3759
- ],
- "mean_ms": 160.88
- },
- "clear-10k": {
- "ci": [
- 144.9236,
- 153.6964
- ],
- "mean_ms": 149.31
- },
- "signal-reactive-fanout-500x1200": {
- "ci": [
- 92.8349,
- 94.1071
- ],
- "mean_ms": 93.471
- },
- "signal-computed-chain-10x60k": {
- "ci": [
- 225.4863,
- 227.8817
- ],
- "mean_ms": 226.684
- },
- "signal-reactive-multi-read-5x160k": {
- "ci": [
- 210.2843,
- 211.8777
- ],
- "mean_ms": 211.081
- },
- "signal-reactive-list-replace-1000x1000": {
- "ci": [
- 316.2378,
- 324.7882
- ],
- "mean_ms": 320.513
- },
- "signal-reactive-list-filter-1000x300": {
- "ci": [
- 123.3182,
- 125.7198
- ],
- "mean_ms": 124.519
- },
- "signal-reactive-push-2000x20": {
- "ci": [
- 230.4132,
- 234.4168
- ],
- "mean_ms": 232.415
- },
- "signal-reactive-set-index-300": {
- "ci": [
- 115.7946,
- 116.7654
- ],
- "mean_ms": 116.28
- },
- "signal-reactive-set-property-by-id-200": {
- "ci": [
- 222.7847,
- 224.7153
- ],
- "mean_ms": 223.75
- },
- "bulk-add-500": {
- "ci": [
- 223.1906,
- 226.2944
- ],
- "mean_ms": 224.7425
- },
- "add-20": {
- "ci": [
- 332.8107,
- 333.1243
- ],
- "mean_ms": 332.9675
- },
- "toggle-10": {
- "ci": [
- 161.3392,
- 162.1383
- ],
- "mean_ms": 161.7387
- },
- "toggle-all-20": {
- "ci": [
- 330.152,
- 331.288
- ],
- "mean_ms": 330.72
- },
- "remove-5-front": {
- "ci": [
- 83.8823,
- 84.4602
- ],
- "mean_ms": 84.1713
- },
- "remove-10-middle": {
- "ci": [
- 161.0796,
- 162.0779
- ],
- "mean_ms": 161.5788
- },
- "remove-5-back": {
- "ci": [
- 68.5324,
- 71.2576
- ],
- "mean_ms": 69.895
- },
- "clear-completed-250": {
- "ci": [
- 51.5728,
- 52.2297
- ],
- "mean_ms": 51.9013
- },
- "toggle-first-10": {
- "ci": [
- 161.3677,
- 161.6273
- ],
- "mean_ms": 161.4975
- },
- "toggle-middle-10": {
- "ci": [
- 161.0843,
- 161.8332
- ],
- "mean_ms": 161.4587
- },
- "toggle-last-10": {
- "ci": [
- 159.0385,
- 159.654
- ],
- "mean_ms": 159.3463
- },
- "remove-first-10": {
- "ci": [
- 175.4489,
- 177.3536
- ],
- "mean_ms": 176.4012
- },
- "remove-middle-10": {
- "ci": [
- 163.6918,
- 164.1332
- ],
- "mean_ms": 163.9125
- },
- "remove-last-10": {
- "ci": [
- 162.5071,
- 162.7179
- ],
- "mean_ms": 162.6125
- },
- "filter-cycle-20": {
- "ci": [
- 452.2739,
- 455.7861
- ],
- "mean_ms": 454.03
- },
- "edit-start-10": {
- "ci": [
- 160.9437,
- 163.6788
- ],
- "mean_ms": 162.3113
- },
- "edit-cycle-5": {
- "ci": [
- 163.6608,
- 163.9142
- ],
- "mean_ms": 163.7875
- }
- }
- },
- {
- "sha": "6f9ea44d835d921bffe19a9541740e765c4e8735",
- "msg": "Chore: Roadmap Consolidation Pass (#167)",
- "parent_sha": "788a3ff80b1778b58dd2c8f4e718996cd6934b06",
- "timestamp": "2026-04-30T14:23:17Z",
- "pr": 167,
- "metrics": {
- "create-1k": {
- "ci": [
- 121.2436,
- 122.8964
- ],
- "mean_ms": 122.07
- },
- "create-10k": {
- "ci": [
- 1083.7555,
- 1096.1674
- ],
- "mean_ms": 1089.9614
- },
- "replace-1k": {
- "ci": [
- 97.5465,
- 100.0706
- ],
- "mean_ms": 98.8086
- },
- "append-1k": {
- "ci": [
- 103.1163,
- 106.578
- ],
- "mean_ms": 104.8471
- },
- "update-10th-10": {
- "ci": [
- 179.5629,
- 187.1056
- ],
- "mean_ms": 183.3343
- },
- "select-40": {
- "ci": [
- 708.4661,
- 720.5196
- ],
- "mean_ms": 714.4929
- },
- "swap-rows-20": {
- "ci": [
- 1037.3811,
- 1054.4646
- ],
- "mean_ms": 1045.9229
- },
- "remove-row-front-20": {
- "ci": [
- 640.6487,
- 654.1828
- ],
- "mean_ms": 647.4157
- },
- "remove-row-middle-20": {
- "ci": [
- 349.7874,
- 362.7697
- ],
- "mean_ms": 356.2786
- },
- "remove-row-back-10": {
- "ci": [
- 160.798,
- 163.3506
- ],
- "mean_ms": 162.0743
- },
- "clear-10k": {
- "ci": [
- 178.0877,
- 185.0752
- ],
- "mean_ms": 181.5814
- },
- "signal-reactive-fanout-500x1200": {
- "ci": [
- 89.3624,
- 91.3812
- ],
- "mean_ms": 90.3718
- },
- "signal-computed-chain-10x60k": {
- "ci": [
- 219.0336,
- 222.6155
- ],
- "mean_ms": 220.8245
- },
- "signal-reactive-multi-read-5x160k": {
- "ci": [
- 204.3207,
- 206.8938
- ],
- "mean_ms": 205.6073
- },
- "signal-reactive-list-replace-1000x1000": {
- "ci": [
- 318.4392,
- 321.339
- ],
- "mean_ms": 319.8891
- },
- "signal-reactive-list-filter-1000x300": {
- "ci": [
- 118.8881,
- 121.0664
- ],
- "mean_ms": 119.9773
- },
- "signal-reactive-push-2000x20": {
- "ci": [
- 223.9049,
- 228.446
- ],
- "mean_ms": 226.1755
- },
- "signal-reactive-set-index-300": {
- "ci": [
- 110.2156,
- 111.9699
- ],
- "mean_ms": 111.0927
- },
- "signal-reactive-set-property-by-id-200": {
- "ci": [
- 214.342,
- 217.9744
- ],
- "mean_ms": 216.1582
- },
- "bulk-add-500": {
- "ci": [
- 238.7413,
- 241.0587
- ],
- "mean_ms": 239.9
- },
- "add-20": {
- "ci": [
- 332.5884,
- 332.9891
- ],
- "mean_ms": 332.7888
- },
- "toggle-10": {
- "ci": [
- 158.0541,
- 158.8659
- ],
- "mean_ms": 158.46
- },
- "toggle-all-20": {
- "ci": [
- 327.0134,
- 327.5966
- ],
- "mean_ms": 327.305
- },
- "remove-5-front": {
- "ci": [
- 90.4416,
- 92.1184
- ],
- "mean_ms": 91.28
- },
- "remove-10-middle": {
- "ci": [
- 156.8117,
- 159.1208
- ],
- "mean_ms": 157.9662
- },
- "remove-5-back": {
- "ci": [
- 67.8777,
- 71.2723
- ],
- "mean_ms": 69.575
- },
- "clear-completed-250": {
- "ci": [
- 57.7515,
- 58.666
- ],
- "mean_ms": 58.2088
- },
- "toggle-first-10": {
- "ci": [
- 155.0691,
- 155.4884
- ],
- "mean_ms": 155.2788
- },
- "toggle-middle-10": {
- "ci": [
- 155.0401,
- 155.8224
- ],
- "mean_ms": 155.4313
- },
- "toggle-last-10": {
- "ci": [
- 153.9249,
- 154.6926
- ],
- "mean_ms": 154.3087
- },
- "remove-first-10": {
- "ci": [
- 158.0552,
- 161.4673
- ],
- "mean_ms": 159.7612
- },
- "remove-middle-10": {
- "ci": [
- 157.1569,
- 157.6381
- ],
- "mean_ms": 157.3975
- },
- "remove-last-10": {
- "ci": [
- 156.2037,
- 156.6063
- ],
- "mean_ms": 156.405
- },
- "filter-cycle-20": {
- "ci": [
- 349.9872,
- 356.0653
- ],
- "mean_ms": 353.0262
- },
- "edit-start-10": {
- "ci": [
- 156.8101,
- 158.0599
- ],
- "mean_ms": 157.435
- },
- "edit-cycle-5": {
- "ci": [
- 157.0218,
- 157.5282
- ],
- "mean_ms": 157.275
- }
- }
- },
- {
- "sha": "9d3535941f9fa7213ae2bdeeab2837dc126b9fd2",
- "msg": "Chore: Polish Renderer Comments (#168)",
- "parent_sha": "fa6219745eb500d1e0d2e05ceb76a3203917e67f",
- "timestamp": "2026-04-30T19:26:16Z",
- "pr": 168,
- "metrics": {
- "create-1k": {
- "ci": [
- 130.4457,
- 131.8143
- ],
- "mean_ms": 131.13
- },
- "create-10k": {
- "ci": [
- 1129.9299,
- 1136.6929
- ],
- "mean_ms": 1133.3114
- },
- "replace-1k": {
- "ci": [
- 102.815,
- 103.8907
- ],
- "mean_ms": 103.3529
- },
- "append-1k": {
- "ci": [
- 116.3304,
- 120.2467
- ],
- "mean_ms": 118.2886
- },
- "update-10th-10": {
- "ci": [
- 167.5619,
- 172.3609
- ],
- "mean_ms": 169.9614
- },
- "select-40": {
- "ci": [
- 713.5973,
- 724.577
- ],
- "mean_ms": 719.0871
- },
- "swap-rows-20": {
- "ci": [
- 950.7696,
- 956.9133
- ],
- "mean_ms": 953.8414
- },
- "remove-row-front-20": {
- "ci": [
- 638.5405,
- 644.8109
- ],
- "mean_ms": 641.6757
- },
- "remove-row-middle-20": {
- "ci": [
- 343.2676,
- 348.3981
- ],
- "mean_ms": 345.8329
- },
- "remove-row-back-10": {
- "ci": [
- 156.7397,
- 158.0174
- ],
- "mean_ms": 157.3786
- },
- "clear-10k": {
- "ci": [
- 158.4797,
- 165.7231
- ],
- "mean_ms": 162.1014
- },
- "signal-reactive-fanout-500x1200": {
- "ci": [
- 88.3107,
- 90.3657
- ],
- "mean_ms": 89.3382
- },
- "signal-computed-chain-10x60k": {
- "ci": [
- 216.4715,
- 220.034
- ],
- "mean_ms": 218.2527
- },
- "signal-reactive-multi-read-5x160k": {
- "ci": [
- 199.1985,
- 201.7742
- ],
- "mean_ms": 200.4864
- },
- "signal-reactive-list-replace-1000x1000": {
- "ci": [
- 316.1935,
- 318.6774
- ],
- "mean_ms": 317.4355
- },
- "signal-reactive-list-filter-1000x300": {
- "ci": [
- 117.6693,
- 119.8726
- ],
- "mean_ms": 118.7709
- },
- "signal-reactive-push-2000x20": {
- "ci": [
- 223.3342,
- 226.5513
- ],
- "mean_ms": 224.9427
- },
- "signal-reactive-set-index-300": {
- "ci": [
- 110.6177,
- 111.8823
- ],
- "mean_ms": 111.25
- },
- "signal-reactive-set-property-by-id-200": {
- "ci": [
- 213.5613,
- 215.6642
- ],
- "mean_ms": 214.6127
- },
- "bulk-add-500": {
- "ci": [
- 221.428,
- 223.697
- ],
- "mean_ms": 222.5625
- },
- "add-20": {
- "ci": [
- 333.0254,
- 333.2321
- ],
- "mean_ms": 333.1288
- },
- "toggle-10": {
- "ci": [
- 162.0781,
- 162.4069
- ],
- "mean_ms": 162.2425
- },
- "toggle-all-20": {
- "ci": [
- 330.5567,
- 331.4233
- ],
- "mean_ms": 330.99
- },
- "remove-5-front": {
- "ci": [
- 82.4207,
- 83.7693
- ],
- "mean_ms": 83.095
- },
- "remove-10-middle": {
- "ci": [
- 160.7824,
- 163.1801
- ],
- "mean_ms": 161.9812
- },
- "remove-5-back": {
- "ci": [
- 68.1248,
- 70.4477
- ],
- "mean_ms": 69.2863
- },
- "clear-completed-250": {
- "ci": [
- 50.4678,
- 51.0447
- ],
- "mean_ms": 50.7563
- },
- "toggle-first-10": {
- "ci": [
- 161.6737,
- 164.4688
- ],
- "mean_ms": 163.0712
- },
- "toggle-middle-10": {
- "ci": [
- 158.288,
- 161.302
- ],
- "mean_ms": 159.795
- },
- "toggle-last-10": {
- "ci": [
- 163.7451,
- 164.5849
- ],
- "mean_ms": 164.165
- },
- "remove-first-10": {
- "ci": [
- 166.0585,
- 167.309
- ],
- "mean_ms": 166.6837
- },
- "remove-middle-10": {
- "ci": [
- 152.0601,
- 153.4224
- ],
- "mean_ms": 152.7413
- },
- "remove-last-10": {
- "ci": [
- 153.4311,
- 156.4714
- ],
- "mean_ms": 154.9512
- },
- "filter-cycle-20": {
- "ci": [
- 421.839,
- 426.601
- ],
- "mean_ms": 424.22
- },
- "edit-start-10": {
- "ci": [
- 160.513,
- 162.7445
- ],
- "mean_ms": 161.6288
- },
- "edit-cycle-5": {
- "ci": [
- 152.0439,
- 152.8111
- ],
- "mean_ms": 152.4275
- }
- }
- },
- {
- "sha": "4abfcac3ef9d8af7f2893647d975bb4e95b1ae96",
- "msg": "BREAKING: Prefer id over _id in Signal.getIDs",
- "parent_sha": "4abfcac3ef9d8af7f2893647d975bb4e95b1ae96^",
- "timestamp": "2026-04-30T20:28:28Z",
- "pr": null,
- "metrics": {
- "hydrate-each-100": {
- "ci": [
- 15.3755,
- 15.6199
- ],
- "mean_ms": 15.4977
- },
- "create-1k": {
- "ci": [
- 128.9837,
- 131.3935
- ],
- "mean_ms": 130.1886
- },
- "create-10k": {
- "ci": [
- 1123.7055,
- 1129.5345
- ],
- "mean_ms": 1126.62
- },
- "replace-1k": {
- "ci": [
- 102.4579,
- 103.3821
- ],
- "mean_ms": 102.92
- },
- "append-1k": {
- "ci": [
- 112.4906,
- 115.9752
- ],
- "mean_ms": 114.2329
- },
- "update-10th-10": {
- "ci": [
- 167.1009,
- 176.262
- ],
- "mean_ms": 171.6814
- },
- "select-40": {
- "ci": [
- 705.0498,
- 717.2102
- ],
- "mean_ms": 711.13
- },
- "swap-rows-20": {
- "ci": [
- 954.9402,
- 961.3483
- ],
- "mean_ms": 958.1443
- },
- "remove-row-front-20": {
- "ci": [
- 641.4635,
- 648.7451
- ],
- "mean_ms": 645.1043
- },
- "remove-row-middle-20": {
- "ci": [
- 345.0391,
- 352.2609
- ],
- "mean_ms": 348.65
- },
- "remove-row-back-10": {
- "ci": [
- 156.6568,
- 157.7432
- ],
- "mean_ms": 157.2
- },
- "clear-10k": {
- "ci": [
- 155.1025,
- 162.669
- ],
- "mean_ms": 158.8857
- },
- "signal-reactive-fanout-500x1200": {
- "ci": [
- 88.6721,
- 90.2024
- ],
- "mean_ms": 89.4373
- },
- "signal-computed-chain-10x60k": {
- "ci": [
- 199.1389,
- 200.6247
- ],
- "mean_ms": 199.8818
- },
- "signal-reactive-multi-read-5x160k": {
- "ci": [
- 197.0489,
- 198.5729
- ],
- "mean_ms": 197.8109
- },
- "signal-reactive-list-replace-1000x1000": {
- "ci": [
- 314.4587,
- 316.2104
- ],
- "mean_ms": 315.3345
- },
- "signal-reactive-list-filter-1000x300": {
- "ci": [
- 110.7993,
- 112.6534
- ],
- "mean_ms": 111.7264
- },
- "signal-reactive-push-2000x20": {
- "ci": [
- 209.8147,
- 213.2198
- ],
- "mean_ms": 211.5173
- },
- "signal-reactive-set-index-300": {
- "ci": [
- 101.7476,
- 103.5233
- ],
- "mean_ms": 102.6355
- },
- "signal-reactive-set-property-by-id-200": {
- "ci": [
- 202.4732,
- 204.5813
- ],
- "mean_ms": 203.5273
- },
- "bulk-add-500": {
- "ci": [
- 221.2835,
- 222.859
- ],
- "mean_ms": 222.0712
- },
- "add-20": {
- "ci": [
- 332.518,
- 332.912
- ],
- "mean_ms": 332.715
- },
- "toggle-10": {
- "ci": [
- 161.9251,
- 162.2774
- ],
- "mean_ms": 162.1013
- },
- "toggle-all-20": {
- "ci": [
- 330.4103,
- 331.2647
- ],
- "mean_ms": 330.8375
- },
- "remove-5-front": {
- "ci": [
- 81.332,
- 82.478
- ],
- "mean_ms": 81.905
- },
- "remove-10-middle": {
- "ci": [
- 160.7672,
- 162.2978
- ],
- "mean_ms": 161.5325
- },
- "remove-5-back": {
- "ci": [
- 70.1158,
- 72.6717
- ],
- "mean_ms": 71.3937
- },
- "clear-completed-250": {
- "ci": [
- 49.8955,
- 50.467
- ],
- "mean_ms": 50.1812
- },
- "toggle-first-10": {
- "ci": [
- 160.9207,
- 161.3793
- ],
- "mean_ms": 161.15
- },
- "toggle-middle-10": {
- "ci": [
- 160.5238,
- 161.2462
- ],
- "mean_ms": 160.885
- },
- "toggle-last-10": {
- "ci": [
- 158.7703,
- 159.5347
- ],
- "mean_ms": 159.1525
- },
- "remove-first-10": {
- "ci": [
- 181.0862,
- 183.4988
- ],
- "mean_ms": 182.2925
- },
- "remove-middle-10": {
- "ci": [
- 161.0195,
- 162.4255
- ],
- "mean_ms": 161.7225
- },
- "remove-last-10": {
- "ci": [
- 161.7053,
- 162.0797
- ],
- "mean_ms": 161.8925
- },
- "filter-cycle-20": {
- "ci": [
- 463.463,
- 468.2795
- ],
- "mean_ms": 465.8713
- },
- "edit-start-10": {
- "ci": [
- 163.7252,
- 166.6523
- ],
- "mean_ms": 165.1887
- },
- "edit-cycle-5": {
- "ci": [
- 163.8138,
- 164.7137
- ],
- "mean_ms": 164.2638
- }
- }
- },
- {
- "sha": "5075cb68218a1ba5abea47a3e446510b773dfec0",
- "msg": "Test: Add Template Coverage (#174)",
- "parent_sha": "4b5597d214c53331b4161b2b27e21daaadb066d9",
- "timestamp": "2026-05-02T20:10:53Z",
- "pr": 174,
- "metrics": {
- "hydrate-each-100": {
- "ci": [
- 16.852,
- 17.2227
- ],
- "mean_ms": 17.0373
- },
- "create-1k": {
- "ci": [
- 125.862,
- 128.678
- ],
- "mean_ms": 127.27
- },
- "create-10k": {
- "ci": [
- 1080.796,
- 1093.2583
- ],
- "mean_ms": 1087.0271
- },
- "replace-1k": {
- "ci": [
- 100.8569,
- 102.6603
- ],
- "mean_ms": 101.7586
- },
- "append-1k": {
- "ci": [
- 107.6165,
- 111.6178
- ],
- "mean_ms": 109.6171
- },
- "update-10th-10": {
- "ci": [
- 204.0219,
- 209.9381
- ],
- "mean_ms": 206.98
- },
- "select-40": {
- "ci": [
- 718.5825,
- 728.2861
- ],
- "mean_ms": 723.4343
- },
- "swap-rows-20": {
- "ci": [
- 1088.0658,
- 1106.1742
- ],
- "mean_ms": 1097.12
- },
- "remove-row-front-20": {
- "ci": [
- 676.2411,
- 685.3818
- ],
- "mean_ms": 680.8114
- },
- "remove-row-middle-20": {
- "ci": [
- 391.4408,
- 400.6934
- ],
- "mean_ms": 396.0671
- },
- "remove-row-back-10": {
- "ci": [
- 161.6732,
- 163.2011
- ],
- "mean_ms": 162.4371
- },
- "clear-10k": {
- "ci": [
- 172.1075,
- 178.9325
- ],
- "mean_ms": 175.52
- },
- "signal-reactive-fanout-500x1200": {
- "ci": [
- 89.2659,
- 90.6705
- ],
- "mean_ms": 89.9682
- },
- "signal-computed-chain-10x60k": {
- "ci": [
- 217.8812,
- 220.8806
- ],
- "mean_ms": 219.3809
- },
- "signal-reactive-multi-read-5x160k": {
- "ci": [
- 201.6045,
- 204.4228
- ],
- "mean_ms": 203.0136
- },
- "signal-reactive-list-replace-1000x1000": {
- "ci": [
- 317.3246,
- 319.9699
- ],
- "mean_ms": 318.6473
- },
- "signal-reactive-list-filter-1000x300": {
- "ci": [
- 118.9975,
- 121.5025
- ],
- "mean_ms": 120.25
- },
- "signal-reactive-push-2000x20": {
- "ci": [
- 223.5224,
- 226.5867
- ],
- "mean_ms": 225.0545
- },
- "signal-reactive-set-index-300": {
- "ci": [
- 110.2215,
- 111.4657
- ],
- "mean_ms": 110.8436
- },
- "signal-reactive-set-property-by-id-200": {
- "ci": [
- 212.8455,
- 214.829
- ],
- "mean_ms": 213.8373
- },
- "bulk-add-500": {
- "ci": [
- 220.1906,
- 222.2919
- ],
- "mean_ms": 221.2413
- },
- "add-20": {
- "ci": [
- 332.7105,
- 333.092
- ],
- "mean_ms": 332.9012
- },
- "toggle-10": {
- "ci": [
- 161.6934,
- 162.8491
- ],
- "mean_ms": 162.2712
- },
- "toggle-all-20": {
- "ci": [
- 326.1738,
- 329.1037
- ],
- "mean_ms": 327.6387
- },
- "remove-5-front": {
- "ci": [
- 81.5771,
- 82.9479
- ],
- "mean_ms": 82.2625
- },
- "remove-10-middle": {
- "ci": [
- 160.9032,
- 162.3468
- ],
- "mean_ms": 161.625
- },
- "remove-5-back": {
- "ci": [
- 73.5491,
- 76.7034
- ],
- "mean_ms": 75.1263
- },
- "clear-completed-250": {
- "ci": [
- 50.1381,
- 50.7219
- ],
- "mean_ms": 50.43
- },
- "toggle-first-10": {
- "ci": [
- 159.5625,
- 159.945
- ],
- "mean_ms": 159.7538
- },
- "toggle-middle-10": {
- "ci": [
- 158.9,
- 159.7675
- ],
- "mean_ms": 159.3338
- },
- "toggle-last-10": {
- "ci": [
- 155.9102,
- 156.8023
- ],
- "mean_ms": 156.3562
- },
- "remove-first-10": {
- "ci": [
- 187.8916,
- 191.1309
- ],
- "mean_ms": 189.5112
- },
- "remove-middle-10": {
- "ci": [
- 157.7199,
- 159.2976
- ],
- "mean_ms": 158.5087
- },
- "remove-last-10": {
- "ci": [
- 160.0248,
- 160.4052
- ],
- "mean_ms": 160.215
- },
- "filter-cycle-20": {
- "ci": [
- 480.4583,
- 485.1567
- ],
- "mean_ms": 482.8075
- },
- "edit-start-10": {
- "ci": [
- 163.9342,
- 166.6383
- ],
- "mean_ms": 165.2863
- },
- "edit-cycle-5": {
- "ci": [
- 163.8863,
- 165.8062
- ],
- "mean_ms": 164.8462
- }
- }
- },
- {
- "sha": "271be03f0c6f573ef135423e37eb5bd0fea1cefe",
- "msg": "Test: Make Hydration Tests Actually Hydrate (#176)",
- "parent_sha": "cdc23508efb17771e66aba5a358b57439bd07c81",
- "timestamp": "2026-05-02T20:54:03Z",
- "pr": 176,
- "metrics": {
- "hydrate-each-100-mount": {
- "ci": [
- 13.5399,
- 13.8072
- ],
- "mean_ms": 13.6736
- },
- "hydrate-each-100": {
- "ci": [
- 12.3881,
- 12.8055
- ],
- "mean_ms": 12.5968
- },
- "hydrate-helper-100-mount": {
- "ci": [
- 7.2045,
- 7.6054
- ],
- "mean_ms": 7.4049
- },
- "hydrate-helper-100-state-change": {
- "ci": [
- 16.1098,
- 16.3465
- ],
- "mean_ms": 16.2281
- },
- "create-1k": {
- "ci": [
- 138.771,
- 140.3918
- ],
- "mean_ms": 139.5814
- },
- "create-10k": {
- "ci": [
- 1171.5337,
- 1177.4349
- ],
- "mean_ms": 1174.4843
- },
- "replace-1k": {
- "ci": [
- 107.4437,
- 108.682
- ],
- "mean_ms": 108.0629
- },
- "append-1k": {
- "ci": [
- 119.0467,
- 122.6705
- ],
- "mean_ms": 120.8586
- },
- "update-10th-10": {
- "ci": [
- 194.114,
- 198.4803
- ],
- "mean_ms": 196.2971
- },
- "select-40": {
- "ci": [
- 744.7183,
- 756.7103
- ],
- "mean_ms": 750.7143
- },
- "swap-rows-20": {
- "ci": [
- 1026.3983,
- 1043.0074
- ],
- "mean_ms": 1034.7029
- },
- "remove-row-front-20": {
- "ci": [
- 704.7967,
- 716.5147
- ],
- "mean_ms": 710.6557
- },
- "remove-row-middle-20": {
- "ci": [
- 399.771,
- 412.0604
- ],
- "mean_ms": 405.9157
- },
- "remove-row-back-10": {
- "ci": [
- 155.3815,
- 157.9413
- ],
- "mean_ms": 156.6614
- },
- "clear-10k": {
- "ci": [
- 157.056,
- 164.5726
- ],
- "mean_ms": 160.8143
- },
- "signal-reactive-fanout-500x1200": {
- "ci": [
- 87.9659,
- 89.3704
- ],
- "mean_ms": 88.6682
- },
- "signal-computed-chain-10x60k": {
- "ci": [
- 215.7779,
- 218.553
- ],
- "mean_ms": 217.1655
- },
- "signal-reactive-multi-read-5x160k": {
- "ci": [
- 198.8306,
- 202.3457
- ],
- "mean_ms": 200.5882
- },
- "signal-reactive-list-replace-1000x1000": {
- "ci": [
- 315.6392,
- 318.468
- ],
- "mean_ms": 317.0536
- },
- "signal-reactive-list-filter-1000x300": {
- "ci": [
- 117.3281,
- 119.3919
- ],
- "mean_ms": 118.36
- },
- "signal-reactive-push-2000x20": {
- "ci": [
- 221.5828,
- 224.9717
- ],
- "mean_ms": 223.2773
- },
- "signal-reactive-set-index-300": {
- "ci": [
- 110.0775,
- 111.5207
- ],
- "mean_ms": 110.7991
- },
- "signal-reactive-set-property-by-id-200": {
- "ci": [
- 213.0625,
- 215.052
- ],
- "mean_ms": 214.0573
- },
- "bulk-add-500": {
- "ci": [
- 220.2518,
- 221.8232
- ],
- "mean_ms": 221.0375
- },
- "add-20": {
- "ci": [
- 332.8475,
- 333.1625
- ],
- "mean_ms": 333.005
- },
- "toggle-10": {
- "ci": [
- 161.8,
- 162.565
- ],
- "mean_ms": 162.1825
- },
- "toggle-all-20": {
- "ci": [
- 327.1657,
- 329.8468
- ],
- "mean_ms": 328.5062
- },
- "remove-5-front": {
- "ci": [
- 81.4485,
- 82.734
- ],
- "mean_ms": 82.0913
- },
- "remove-10-middle": {
- "ci": [
- 160.3834,
- 161.5891
- ],
- "mean_ms": 160.9863
- },
- "remove-5-back": {
- "ci": [
- 73.1662,
- 76.2713
- ],
- "mean_ms": 74.7188
- },
- "clear-completed-250": {
- "ci": [
- 50.4723,
- 51.4627
- ],
- "mean_ms": 50.9675
- },
- "toggle-first-10": {
- "ci": [
- 162.0593,
- 162.5257
- ],
- "mean_ms": 162.2925
- },
- "toggle-middle-10": {
- "ci": [
- 161.8677,
- 162.6023
- ],
- "mean_ms": 162.235
- },
- "toggle-last-10": {
- "ci": [
- 158.5647,
- 159.3078
- ],
- "mean_ms": 158.9363
- },
- "remove-first-10": {
- "ci": [
- 169.3907,
- 173.5118
- ],
- "mean_ms": 171.4512
- },
- "remove-middle-10": {
- "ci": [
- 163.0775,
- 164.425
- ],
- "mean_ms": 163.7512
- },
- "remove-last-10": {
- "ci": [
- 162.8876,
- 163.3574
- ],
- "mean_ms": 163.1225
- },
- "filter-cycle-20": {
- "ci": [
- 444.1092,
- 447.8108
- ],
- "mean_ms": 445.96
- },
- "edit-start-10": {
- "ci": [
- 158.9864,
- 161.2311
- ],
- "mean_ms": 160.1088
- },
- "edit-cycle-5": {
- "ci": [
- 163.2897,
- 167.0103
- ],
- "mean_ms": 165.15
- }
- }
- },
- {
- "sha": "eba1804155615787f332ffda9b9aa30617581fc7",
- "msg": "Test: Bump hydrate bench to 1000 items (#177)",
- "parent_sha": "09a1aef0cb0f9ae120656303d0f7f66aaa9738d7",
- "timestamp": "2026-05-02T22:32:10Z",
- "pr": 177,
- "metrics": {
- "hydrate-each-100-mount": {
- "ci": [
- 69.783,
- 70.497
- ],
- "mean_ms": 70.14
- },
- "hydrate-each-100": {
- "ci": [
- 77.4681,
- 78.127
- ],
- "mean_ms": 77.7976
- },
- "hydrate-helper-100-mount": {
- "ci": [
- 39.3764,
- 39.7933
- ],
- "mean_ms": 39.5848
- },
- "hydrate-helper-100-state-change": {
- "ci": [
- 4.9923,
- 5.7794
- ],
- "mean_ms": 5.3859
- },
- "create-1k": {
- "ci": [
- 98.0622,
- 99.9807
- ],
- "mean_ms": 99.0214
- },
- "create-10k": {
- "ci": [
- 862.7343,
- 869.8229
- ],
- "mean_ms": 866.2786
- },
- "replace-1k": {
- "ci": [
- 79.2167,
- 82.029
- ],
- "mean_ms": 80.6229
- },
- "append-1k": {
- "ci": [
- 86.4345,
- 89.9713
- ],
- "mean_ms": 88.2029
- },
- "update-10th-10": {
- "ci": [
- 164.6956,
- 170.9901
- ],
- "mean_ms": 167.8429
- },
- "select-40": {
- "ci": [
- 680.2427,
- 687.0316
- ],
- "mean_ms": 683.6371
- },
- "swap-rows-20": {
- "ci": [
- 844.897,
- 873.3144
- ],
- "mean_ms": 859.1057
- },
- "remove-row-front-20": {
- "ci": [
- 529.0465,
- 543.2164
- ],
- "mean_ms": 536.1314
- },
- "remove-row-middle-20": {
- "ci": [
- 330.5977,
- 334.4651
- ],
- "mean_ms": 332.5314
- },
- "remove-row-back-10": {
- "ci": [
- 160.3505,
- 162.8924
- ],
- "mean_ms": 161.6214
- },
- "clear-10k": {
- "ci": [
- 148.7672,
- 157.0357
- ],
- "mean_ms": 152.9014
- },
- "signal-reactive-fanout-500x1200": {
- "ci": [
- 90.1705,
- 91.4822
- ],
- "mean_ms": 90.8264
- },
- "signal-computed-chain-10x60k": {
- "ci": [
- 215.5262,
- 218.6374
- ],
- "mean_ms": 217.0818
- },
- "signal-reactive-multi-read-5x160k": {
- "ci": [
- 203.9613,
- 205.9387
- ],
- "mean_ms": 204.95
- },
- "signal-reactive-list-replace-1000x1000": {
- "ci": [
- 320.7561,
- 323.4966
- ],
- "mean_ms": 322.1264
- },
- "signal-reactive-list-filter-1000x300": {
- "ci": [
- 123.1347,
- 125.2143
- ],
- "mean_ms": 124.1745
- },
- "signal-reactive-push-2000x20": {
- "ci": [
- 229.6417,
- 233.2147
- ],
- "mean_ms": 231.4282
- },
- "signal-reactive-set-index-300": {
- "ci": [
- 115.026,
- 116.3976
- ],
- "mean_ms": 115.7118
- },
- "signal-reactive-set-property-by-id-200": {
- "ci": [
- 222.3474,
- 224.9526
- ],
- "mean_ms": 223.65
- },
- "bulk-add-500": {
- "ci": [
- 179.2871,
- 181.4179
- ],
- "mean_ms": 180.3525
- },
- "add-20": {
- "ci": [
- 332.721,
- 333.0265
- ],
- "mean_ms": 332.8737
- },
- "toggle-10": {
- "ci": [
- 153.5517,
- 154.8508
- ],
- "mean_ms": 154.2012
- },
- "toggle-all-20": {
- "ci": [
- 320.9405,
- 321.6345
- ],
- "mean_ms": 321.2875
- },
- "remove-5-front": {
- "ci": [
- 72.2576,
- 73.6249
- ],
- "mean_ms": 72.9413
- },
- "remove-10-middle": {
- "ci": [
- 153.0789,
- 154.2836
- ],
- "mean_ms": 153.6812
- },
- "remove-5-back": {
- "ci": [
- 70.7239,
- 72.0161
- ],
- "mean_ms": 71.37
- },
- "clear-completed-250": {
- "ci": [
- 43.776,
- 44.359
- ],
- "mean_ms": 44.0675
- },
- "toggle-first-10": {
- "ci": [
- 161.1752,
- 161.4173
- ],
- "mean_ms": 161.2962
- },
- "toggle-middle-10": {
- "ci": [
- 161.2664,
- 161.8536
- ],
- "mean_ms": 161.56
- },
- "toggle-last-10": {
- "ci": [
- 157.418,
- 158.212
- ],
- "mean_ms": 157.815
- },
- "remove-first-10": {
- "ci": [
- 177.1577,
- 180.1973
- ],
- "mean_ms": 178.6775
- },
- "remove-middle-10": {
- "ci": [
- 162.8027,
- 163.7948
- ],
- "mean_ms": 163.2988
- },
- "remove-last-10": {
- "ci": [
- 162.1562,
- 162.3863
- ],
- "mean_ms": 162.2712
- },
- "filter-cycle-20": {
- "ci": [
- 455.3097,
- 459.8978
- ],
- "mean_ms": 457.6038
- },
- "edit-start-10": {
- "ci": [
- 158.832,
- 161.3355
- ],
- "mean_ms": 160.0837
- },
- "edit-cycle-5": {
- "ci": [
- 163.4294,
- 163.6831
- ],
- "mean_ms": 163.5563
- }
- }
- },
- {
- "sha": "23766ddb1a72582d7a104e7142ff8f248168ffe8",
- "msg": "Bug: Each Bindings Lose External Reactivity (#175)",
- "parent_sha": "23766ddb1a72582d7a104e7142ff8f248168ffe8^",
- "timestamp": "2026-05-04T15:13:07Z",
- "pr": 175,
- "metrics": {
- "hydrate-each-100-mount": {
- "ci": [
- 76.3732,
- 77.1499
- ],
- "mean_ms": 76.7615
- },
- "hydrate-each-100": {
- "ci": [
- 90.8867,
- 91.6795
- ],
- "mean_ms": 91.2831
- },
- "hydrate-helper-100-mount": {
- "ci": [
- 45.7945,
- 46.1785
- ],
- "mean_ms": 45.9865
- },
- "hydrate-helper-100-state-change": {
- "ci": [
- 5.2293,
- 6.1261
- ],
- "mean_ms": 5.6777
- },
- "create-1k": {
- "ci": [
- 121.0991,
- 122.7123
- ],
- "mean_ms": 121.9057
- },
- "create-10k": {
- "ci": [
- 1067.0064,
- 1075.2393
- ],
- "mean_ms": 1071.1229
- },
- "replace-1k": {
- "ci": [
- 96.5312,
- 98.6545
- ],
- "mean_ms": 97.5929
- },
- "append-1k": {
- "ci": [
- 101.7585,
- 105.87
- ],
- "mean_ms": 103.8143
- },
- "update-10th-10": {
- "ci": [
- 177.4312,
- 183.4917
- ],
- "mean_ms": 180.4614
- },
- "select-40": {
- "ci": [
- 707.7106,
- 716.2494
- ],
- "mean_ms": 711.98
- },
- "swap-rows-20": {
- "ci": [
- 1020.4131,
- 1032.8612
- ],
- "mean_ms": 1026.6371
- },
- "remove-row-front-20": {
- "ci": [
- 638.3569,
- 648.2345
- ],
- "mean_ms": 643.2957
- },
- "remove-row-middle-20": {
- "ci": [
- 349.0153,
- 359.8133
- ],
- "mean_ms": 354.4143
- },
- "remove-row-back-10": {
- "ci": [
- 158.4658,
- 161.6513
- ],
- "mean_ms": 160.0586
- },
- "clear-10k": {
- "ci": [
- 171.5691,
- 179.6223
- ],
- "mean_ms": 175.5957
- },
- "signal-reactive-fanout-500x1200": {
- "ci": [
- 89.5667,
- 90.6533
- ],
- "mean_ms": 90.11
- },
- "signal-computed-chain-10x60k": {
- "ci": [
- 200.0892,
- 201.8071
- ],
- "mean_ms": 200.9482
- },
- "signal-reactive-multi-read-5x160k": {
- "ci": [
- 199.6741,
- 203.8787
- ],
- "mean_ms": 201.7764
- },
- "signal-reactive-list-replace-1000x1000": {
- "ci": [
- 316.0785,
- 318.627
- ],
- "mean_ms": 317.3527
- },
- "signal-reactive-list-filter-1000x300": {
- "ci": [
- 111.2999,
- 113.9019
- ],
- "mean_ms": 112.6009
- },
- "signal-reactive-push-2000x20": {
- "ci": [
- 213.3122,
- 217.0551
- ],
- "mean_ms": 215.1836
- },
- "signal-reactive-set-index-300": {
- "ci": [
- 102.6044,
- 104.5574
- ],
- "mean_ms": 103.5809
- },
- "signal-reactive-set-property-by-id-200": {
- "ci": [
- 203.464,
- 206.1105
- ],
- "mean_ms": 204.7873
- },
- "bulk-add-500": {
- "ci": [
- 204.6832,
- 206.3218
- ],
- "mean_ms": 205.5025
- },
- "add-20": {
- "ci": [
- 332.8735,
- 333.129
- ],
- "mean_ms": 333.0012
- },
- "toggle-10": {
- "ci": [
- 161.0933,
- 163.5192
- ],
- "mean_ms": 162.3063
- },
- "toggle-all-20": {
- "ci": [
- 321.896,
- 325.194
- ],
- "mean_ms": 323.545
- },
- "remove-5-front": {
- "ci": [
- 78.8375,
- 80.8725
- ],
- "mean_ms": 79.855
- },
- "remove-10-middle": {
- "ci": [
- 158.1358,
- 160.6017
- ],
- "mean_ms": 159.3688
- },
- "remove-5-back": {
- "ci": [
- 73.3237,
- 76.5213
- ],
- "mean_ms": 74.9225
- },
- "clear-completed-250": {
- "ci": [
- 52.1178,
- 52.5747
- ],
- "mean_ms": 52.3462
- },
- "toggle-first-10": {
- "ci": [
- 161.4877,
- 162.0248
- ],
- "mean_ms": 161.7562
- },
- "toggle-middle-10": {
- "ci": [
- 161.1851,
- 161.9349
- ],
- "mean_ms": 161.56
- },
- "toggle-last-10": {
- "ci": [
- 157.0273,
- 157.7852
- ],
- "mean_ms": 157.4063
- },
- "remove-first-10": {
- "ci": [
- 177.1202,
- 178.9898
- ],
- "mean_ms": 178.055
- },
- "remove-middle-10": {
- "ci": [
- 163.4644,
- 163.9856
- ],
- "mean_ms": 163.725
- },
- "remove-last-10": {
- "ci": [
- 161.8664,
- 162.2811
- ],
- "mean_ms": 162.0737
- },
- "filter-cycle-20": {
- "ci": [
- 454.6342,
- 457.4858
- ],
- "mean_ms": 456.06
- },
- "edit-start-10": {
- "ci": [
- 158.7521,
- 161.0504
- ],
- "mean_ms": 159.9013
- },
- "edit-cycle-5": {
- "ci": [
- 163.4524,
- 163.9051
- ],
- "mean_ms": 163.6788
- }
- }
- }
- ]
+ "schema_version": 2,
+ "commits": []
}
diff --git a/tools/ci/bench/reporter/extract-metrics.js b/tools/ci/bench/reporter/extract-metrics.js
new file mode 100644
index 000000000..417335487
--- /dev/null
+++ b/tools/ci/bench/reporter/extract-metrics.js
@@ -0,0 +1,109 @@
+/*
+ Shared helpers for reading per-metric data out of a tachometer results
+ directory. Three callers live in this folder:
+ - reporter.js (renders PR comment + JSON adjunct)
+ - append-history.js (archives main-push runs to bench-history.json)
+ - fetch-pr-history.js (rebuilds PR-iteration history from prior run artifacts)
+
+ Each pairs `this-change` with `tip-of-tree` benchmarks per metric so the
+ within-session percent-delta from `differences[]` is reachable. Absolute
+ ms across two sessions cannot be combined safely, so any cross-iteration
+ consumer must reach for the percent-delta — not the mean.
+*/
+
+import fs from 'node:fs';
+import path from 'node:path';
+
+/**
+ * Yield one record per metric across all `*.json` files under `dir`.
+ *
+ * Each record carries the matched `this-change` benchmark, its
+ * `tip-of-tree` counterpart (when present), and the precomputed
+ * `differences[]` entry that pairs them. Caller projects into whatever
+ * shape it needs.
+ *
+ * Lets `JSON.parse` errors propagate so a corrupt artifact fails CI
+ * loudly with a stack-traced path rather than silently shrinking the
+ * report.
+ */
+export function* iterMetricPairs(dir) {
+ for (const entry of walk(dir)) {
+ if (!entry.endsWith('.json')) { continue; }
+ const data = JSON.parse(fs.readFileSync(entry, 'utf8'));
+ // JSON files without a `benchmarks` array are skipped, not treated as errors.
+ if (!Array.isArray(data.benchmarks)) { continue; }
+
+ // Group this file's benchmarks by metric name (measurement name, falling
+ // back to the benchmark name). Within a group each benchmark is keyed by
+ // the text before the first ' [' in its `name`; the lookups below expect
+ // the keys 'this-change' and 'tip-of-tree'.
+ const byName = new Map();
+ data.benchmarks.forEach((bm, i) => {
+ const mName = bm.measurement?.name ?? bm.name;
+ const source = (bm.name ?? '').split(' [')[0];
+ if (!byName.has(mName)) { byName.set(mName, {}); }
+ // Keep the position in `benchmarks[]`: it is used below to index
+ // `differences[]`.
+ byName.get(mName)[source] = { index: i, bm };
+ });
+
+ for (const [name, pair] of byName) {
+ const cur = pair['this-change'];
+ const base = pair['tip-of-tree'];
+ // A metric is only yielded when a this-change benchmark with a `mean` exists.
+ if (!cur?.bm.mean) { continue; }
+ // `diff` is null when there is no tip-of-tree entry, and undefined when
+ // `differences` (or the slot at the base's index) is absent.
+ const diff = base ? cur.bm.differences?.[base.index] : null;
+ yield { name, current: cur, base, diff };
+ }
+ }
+}
+
+/**
+ * Build the per-metric map stored in a bench-history entry:
+ *   { name → { ci, mean_ms, percent_delta_ci?, baseline_sha? } }
+ *
+ * `ci` is the this-change mean interval and `mean_ms` its midpoint, both
+ * rounded to four decimals. `percent_delta_ci` is added only when the
+ * pairing produced a `percentChange`; `baseline_sha` rides along with it,
+ * and only when a non-empty `baselineSha` was passed — a delta with no
+ * comparison reference has nothing to pin to. When two result files
+ * report the same metric name, the later one overwrites the earlier.
+ */
+export function loadHistoryMetrics(dir, baselineSha = '') {
+  const byMetric = {};
+  for (const pair of iterMetricPairs(dir)) {
+    const { low, high } = pair.current.bm.mean;
+    const record = {
+      ci: [round4(low), round4(high)],
+      mean_ms: round4((low + high) / 2),
+    };
+    const pct = pair.diff?.percentChange;
+    if (pct) {
+      record.percent_delta_ci = [round4(pct.low), round4(pct.high)];
+      if (baselineSha) { record.baseline_sha = baselineSha; }
+    }
+    byMetric[pair.name] = record;
+  }
+  return byMetric;
+}
+
+/**
+ * Read the `baseline-sha.txt` sidecar uploaded next to the tachometer
+ * JSONs and return its trimmed contents.
+ *
+ * Only a file named exactly `baseline-sha.txt` counts; a suffix match
+ * would also accept names such as `old-baseline-sha.txt`. The first match
+ * produced by the directory walk wins.
+ *
+ * Returns '' when no sidecar exists or the sidecar cannot be read; caller
+ * decides what "no baseline" means in its context. Errors raised while
+ * walking `dir` itself (e.g. the directory is missing) still propagate.
+ */
+export function readBaselineSha(dir) {
+  for (const entry of walk(dir)) {
+    if (path.basename(entry) !== 'baseline-sha.txt') { continue; }
+    try {
+      return fs.readFileSync(entry, 'utf8').trim();
+    }
+    catch {
+      return '';
+    }
+  }
+  return '';
+}
+
+/** Round `n` to at most four decimal places, returned as a number. */
+function round4(n) {
+  const fixed = n.toFixed(4);
+  return Number(fixed);
+}
+
+/** Recursively yield the path of every non-directory entry under `dir`. */
+function* walk(dir) {
+  const children = fs.readdirSync(dir, { withFileTypes: true });
+  for (const child of children) {
+    const childPath = path.join(dir, child.name);
+    if (!child.isDirectory()) {
+      yield childPath;
+      continue;
+    }
+    yield* walk(childPath);
+  }
+}
diff --git a/tools/ci/bench/reporter/fetch-pr-history.js b/tools/ci/bench/reporter/fetch-pr-history.js
index 2e49b76d0..0e7bb6900 100644
--- a/tools/ci/bench/reporter/fetch-pr-history.js
+++ b/tools/ci/bench/reporter/fetch-pr-history.js
@@ -2,13 +2,13 @@
/*
Fetch prior bench results from this PR's branch to build a per-iteration
history. Walks completed Benchmarks workflow runs, downloads their
- results-* artifacts, extracts per-metric absolute CIs, and outputs a
+ results-* artifacts, extracts per-metric CIs, and outputs a
pr-history.json in the same schema as bench-history.json.
The reporter merges this PR-iteration history with bench-history.json
- (main-commit history) to compute cross-run peak attribution. An agent
- iterating on a perf branch sees: "iteration 3 was the best on
- update-10th; your current iteration regressed from that."
+ (main-commit history) for cross-run peak attribution. An agent iterating
+ on a perf branch sees: "iteration 3 was the best on update-10th; your
+ current iteration regressed from that."
Usage:
node fetch-pr-history.js \
@@ -23,7 +23,7 @@
import { execSync } from 'node:child_process';
import fs from 'node:fs';
-import path from 'node:path';
+import { loadHistoryMetrics, readBaselineSha } from './extract-metrics.js';
const args = parseArgs(process.argv.slice(2));
const branch = required(args, 'branch');
@@ -59,7 +59,8 @@ for (const run of prRuns) {
continue;
}
- const metrics = loadMetrics(dir);
+ const baselineSha = readBaselineSha(dir);
+ const metrics = loadHistoryMetrics(dir, baselineSha);
if (Object.keys(metrics).length === 0) {
console.log(` Skip ${run.databaseId} (no metrics)`);
continue;
@@ -73,60 +74,19 @@ for (const run of prRuns) {
pr: null,
metrics,
});
- console.log(` ${run.headSha.slice(0, 7)} — ${Object.keys(metrics).length} metrics`);
+ console.log(
+ ` ${run.headSha.slice(0, 7)} — ${Object.keys(metrics).length} metrics`
+ + (baselineSha ? ` @ baseline ${baselineSha.slice(0, 7)}` : ''),
+ );
}
-// Chronological order (oldest first) so peak-index → bisect-candidates
-// after peak produces a causal timeline.
+// Sort by timestamp so peak → bisect-candidates after peak produces a
+// causal timeline.
commits.sort((a, b) => a.timestamp.localeCompare(b.timestamp));
-fs.writeFileSync(outPath, JSON.stringify({ schema_version: 1, commits }, null, 2) + '\n');
+fs.writeFileSync(outPath, JSON.stringify({ schema_version: 2, commits }, null, 2) + '\n');
console.log(`Wrote ${commits.length} entries to ${outPath}`);
-/**
- * Walk a results directory and extract one { ci, mean_ms } entry per
- * metric. Uses the `this-change` absolute CI — same extraction logic
- * as append-history.js.
- */
-function loadMetrics(dir) {
- const out = {};
- for (const entry of walk(dir)) {
- if (!entry.endsWith('.json')) { continue; }
- let data;
- try {
- data = JSON.parse(fs.readFileSync(entry, 'utf8'));
- }
- catch {
- continue;
- }
- if (!Array.isArray(data.benchmarks)) { continue; }
-
- for (const bm of data.benchmarks) {
- const source = (bm.name ?? '').split(' [')[0];
- if (source !== 'this-change') { continue; }
- const metricName = bm.measurement?.name ?? bm.name;
- if (!bm.mean) { continue; }
- out[metricName] = {
- ci: [round4(bm.mean.low), round4(bm.mean.high)],
- mean_ms: round4((bm.mean.low + bm.mean.high) / 2),
- };
- }
- }
- return out;
-}
-
-function round4(n) {
- return Number(n.toFixed(4));
-}
-
-function* walk(dir) {
- for (const ent of fs.readdirSync(dir, { withFileTypes: true })) {
- const full = path.join(dir, ent.name);
- if (ent.isDirectory()) { yield* walk(full); }
- else { yield full; }
- }
-}
-
function exec(cmd) {
return execSync(cmd, { encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'] });
}
diff --git a/tools/ci/bench/reporter/fixtures/history-sample.json b/tools/ci/bench/reporter/fixtures/history-sample.json
index a9a1f1419..b95a186e3 100644
--- a/tools/ci/bench/reporter/fixtures/history-sample.json
+++ b/tools/ci/bench/reporter/fixtures/history-sample.json
@@ -1,5 +1,5 @@
{
- "schema_version": 1,
+ "schema_version": 2,
"commits": [
{
"sha": "aaaa111111111111",
@@ -8,9 +8,24 @@
"timestamp": "2026-04-10T00:00:00Z",
"pr": 101,
"metrics": {
- "update-10th": { "ci": [10.0, 11.0], "mean_ms": 10.5 },
- "select": { "ci": [20.0, 21.0], "mean_ms": 20.5 },
- "toggle-middle": { "ci": [8.0, 9.0], "mean_ms": 8.5 }
+ "update-10th": {
+ "ci": [10.0, 11.0],
+ "mean_ms": 10.5,
+ "percent_delta_ci": [-5.0, -3.0],
+ "baseline_sha": "0000000000000000"
+ },
+ "select": {
+ "ci": [20.0, 21.0],
+ "mean_ms": 20.5,
+ "percent_delta_ci": [-2.0, -1.0],
+ "baseline_sha": "0000000000000000"
+ },
+ "toggle-middle": {
+ "ci": [8.0, 9.0],
+ "mean_ms": 8.5,
+ "percent_delta_ci": [-3.0, -1.0],
+ "baseline_sha": "0000000000000000"
+ }
}
},
{
@@ -20,9 +35,24 @@
"timestamp": "2026-04-11T00:00:00Z",
"pr": 102,
"metrics": {
- "update-10th": { "ci": [6.0, 7.0], "mean_ms": 6.5 },
- "select": { "ci": [18.0, 19.0], "mean_ms": 18.5 },
- "toggle-middle": { "ci": [7.0, 8.0], "mean_ms": 7.5 }
+ "update-10th": {
+ "ci": [6.0, 7.0],
+ "mean_ms": 6.5,
+ "percent_delta_ci": [-40.0, -35.0],
+ "baseline_sha": "aaaa111111111111"
+ },
+ "select": {
+ "ci": [18.0, 19.0],
+ "mean_ms": 18.5,
+ "percent_delta_ci": [-12.0, -8.0],
+ "baseline_sha": "aaaa111111111111"
+ },
+ "toggle-middle": {
+ "ci": [7.0, 8.0],
+ "mean_ms": 7.5,
+ "percent_delta_ci": [-15.0, -10.0],
+ "baseline_sha": "aaaa111111111111"
+ }
}
},
{
@@ -32,9 +62,24 @@
"timestamp": "2026-04-12T00:00:00Z",
"pr": 103,
"metrics": {
- "update-10th": { "ci": [7.0, 8.0], "mean_ms": 7.5 },
- "select": { "ci": [18.2, 19.1], "mean_ms": 18.65 },
- "toggle-middle": { "ci": [7.1, 8.1], "mean_ms": 7.6 }
+ "update-10th": {
+ "ci": [7.0, 8.0],
+ "mean_ms": 7.5,
+ "percent_delta_ci": [12.0, 18.0],
+ "baseline_sha": "bbbb222222222222"
+ },
+ "select": {
+ "ci": [18.2, 19.1],
+ "mean_ms": 18.65,
+ "percent_delta_ci": [-1.0, 2.0],
+ "baseline_sha": "bbbb222222222222"
+ },
+ "toggle-middle": {
+ "ci": [7.1, 8.1],
+ "mean_ms": 7.6,
+ "percent_delta_ci": [-1.0, 3.0],
+ "baseline_sha": "bbbb222222222222"
+ }
}
},
{
@@ -44,9 +89,24 @@
"timestamp": "2026-04-13T00:00:00Z",
"pr": 104,
"metrics": {
- "update-10th": { "ci": [9.0, 10.0], "mean_ms": 9.5 },
- "select": { "ci": [18.1, 19.0], "mean_ms": 18.55 },
- "toggle-middle": { "ci": [7.2, 8.2], "mean_ms": 7.7 }
+ "update-10th": {
+ "ci": [9.0, 10.0],
+ "mean_ms": 9.5,
+ "percent_delta_ci": [22.0, 28.0],
+ "baseline_sha": "cccc333333333333"
+ },
+ "select": {
+ "ci": [18.1, 19.0],
+ "mean_ms": 18.55,
+ "percent_delta_ci": [-0.8, 0.2],
+ "baseline_sha": "cccc333333333333"
+ },
+ "toggle-middle": {
+ "ci": [7.2, 8.2],
+ "mean_ms": 7.7,
+ "percent_delta_ci": [0.5, 2.5],
+ "baseline_sha": "cccc333333333333"
+ }
}
}
]
diff --git a/tools/ci/bench/reporter/reporter.js b/tools/ci/bench/reporter/reporter.js
index 414e6743b..522cba179 100644
--- a/tools/ci/bench/reporter/reporter.js
+++ b/tools/ci/bench/reporter/reporter.js
@@ -17,17 +17,27 @@
--repo-root filesystem root for resolving bench sources (default: cwd)
--wall-clock total bench run duration — footer metadata
--history bench-history.json path (default: /tools/ci/bench/reporter/bench-history.json)
- --pr-history PR-iteration history from fetch-pr-history.js (merged with --history)
+ --pr-history PR-iteration history from fetch-pr-history.js
+ --scope peak attribution scope. 'pr' restricts comparison
+ to PR-iteration history only (drops main-history
+ overlay; eliminates phantom REOPENEDs on test-only
+ PRs). 'all' merges main + PR (default).
--out output directory (default: ./bench-report)
- Cross-run taxonomy (WIN / TIED-PEAK / REOPENED) engages automatically once
- bench-history.json has entries. Empty or missing history file → reporter
- falls back to current-vs-baseline only (no peak attribution rendered).
+ Cross-run taxonomy (WIN / TIED-PEAK / REOPENED) operates on within-session
+ percent-delta CIs — the only number tachometer warrants for cross-iteration
+ comparison. Drift detection walks main-history's chain of percent-deltas
+ between baselines and renders ⚠️ when main moved enough on the metric to
+ plausibly confound peak attribution within the PR.
+
+ Schema: bench-history.json is schema_version 2.
*/
import fs from 'node:fs';
import path from 'node:path';
+import { iterMetricPairs, readBaselineSha } from './extract-metrics.js';
+
const args = parseArgs(process.argv.slice(2));
const resultsDir = required(args, 'results');
const sha = required(args, 'sha');
@@ -45,6 +55,15 @@ const wallClockSec = args['wall-clock'] ? Number(args['wall-clock']) : null;
const historyPath = args.history ?? path.join(repoRoot, 'tools/ci/bench/reporter/bench-history.json');
const prHistoryPath = args['pr-history'] ?? '';
const outDir = args.out ?? './bench-report';
+// 'pr' restricts peak attribution to PR-iteration history only — main-history
+// commits round-robin'd against their own tip-of-trees, so cross-session
+// comparison would mix iteration drift with environmental variance. 'all'
+// merges both for push-to-main runs that want the full timeline.
+const scope = args.scope ?? 'all';
+if (scope !== 'all' && scope !== 'pr') {
+ console.error(`Invalid --scope: ${scope}; expected 'all' or 'pr'`);
+ process.exit(1);
+}
const NOISE_FLOOR = 2; // percent — matches autoSampleConditions
@@ -78,6 +97,12 @@ const TEASER_ROWS = 5;
// present in the JSON adjunct for agent use.
const BISECT_MARKDOWN_MAX = 3;
+// Cumulative main-side drift (in percentage points) above which a row
+// renders the baseline-drift flag. Below this, drift is in the runner
+// noise floor across the chain anyway; flagging would clutter every
+// long-running PR.
+const DRIFT_THRESHOLD_PP = 5;
+
/**
* Expected percent-change CI width for an unresolved CI given the bench's
* absolute duration. Derived from the standard-error-of-the-difference of
@@ -92,7 +117,11 @@ function expectedNoisePp(meanMs) {
const benchDirs = findBenchDirs(repoRoot);
const mainHistory = loadHistory(historyPath);
const prHistory = loadHistory(prHistoryPath);
-const history = mergeHistories(mainHistory, prHistory);
+const peakHistory = scope === 'pr' ? prHistory : mergeHistories(mainHistory, prHistory);
+// Drift always walks main-history regardless of peak scope — the chain of
+// per-commit percent-deltas is what quantifies how the baseline itself moved.
+const driftHistory = mainHistory;
+const currentBaselineSha = readBaselineSha(resultsDir);
const metrics = loadAllMetrics(resultsDir);
const report = buildReport(metrics);
const markdown = renderMarkdown(report);
@@ -109,42 +138,23 @@ console.log(
);
/**
- * Walk `dir` for tachometer JSON files and extract per-metric this-change
- * vs tip-of-tree data. Each tachometer file can contain multiple metrics;
- * each metric has one "this-change [X]" and one "tip-of-tree [X]" entry.
+ * Project paired tachometer benchmarks into the shape buildReport consumes.
+ * Skips metrics without a tip-of-tree counterpart or differences[] entry —
+ * the comment renderer needs both sides plus the precomputed diff.
*/
function loadAllMetrics(dir) {
const out = [];
- for (const entry of walk(dir)) {
- if (!entry.endsWith('.json')) { continue; }
- const data = JSON.parse(fs.readFileSync(entry, 'utf8'));
- if (!Array.isArray(data.benchmarks)) { continue; }
-
- // Group benchmarks by measurement name, index by source (this-change | tip-of-tree)
- const byName = new Map();
- data.benchmarks.forEach((bm, i) => {
- const mName = bm.measurement?.name ?? bm.name;
- const source = (bm.name ?? '').split(' [')[0];
- if (!byName.has(mName)) { byName.set(mName, {}); }
- byName.get(mName)[source] = { index: i, bm };
+ for (const { name, current, base, diff } of iterMetricPairs(dir)) {
+ if (!base || !diff) { continue; }
+ out.push({
+ name,
+ thisChangeMs: [current.bm.mean.low, current.bm.mean.high],
+ tipOfTreeMs: [base.bm.mean.low, base.bm.mean.high],
+ absoluteMsDelta: [diff.absolute.low, diff.absolute.high],
+ percentDelta: [diff.percentChange.low, diff.percentChange.high],
+ baselineSha: currentBaselineSha || null,
});
-
- for (const [name, pair] of byName) {
- const cur = pair['this-change'];
- const base = pair['tip-of-tree'];
- if (!cur || !base) { continue; }
- const diff = cur.bm.differences?.[base.index];
- if (!diff) { continue; }
- out.push({
- name,
- thisChangeMs: [cur.bm.mean.low, cur.bm.mean.high],
- tipOfTreeMs: [base.bm.mean.low, base.bm.mean.high],
- absoluteMsDelta: [diff.absolute.low, diff.absolute.high],
- percentDelta: [diff.percentChange.low, diff.percentChange.high],
- });
- }
}
- // Deterministic order for snapshot stability
out.sort((a, b) => a.name.localeCompare(b.name));
return out;
}
@@ -170,7 +180,7 @@ function buildReport(metrics) {
const expectedPp = expectedNoisePp(meanMs);
const ratio = expectedPp > 0 ? widthPp / expectedPp : Infinity;
const source = resolveMetricSource(m.name, benchDirs, repoRoot);
- const historyStatus = computeHistoryStatus(m, history);
+ const historyStatus = computeHistoryStatus(m, peakHistory, driftHistory);
return {
...m,
meanMs,
@@ -196,7 +206,8 @@ function buildReport(metrics) {
noise_ratio_tolerance: NOISE_RATIO_TOLERANCE,
summary,
history_summary: historySummary,
- history_available: history !== null && history.commits.length > 0,
+ history_available: peakHistory !== null && peakHistory.commits.length > 0,
+ scope,
metrics: classified.map(toJsonMetric),
};
}
@@ -236,21 +247,22 @@ function toJsonMetric(m) {
expected_noise_pp: Number(m.expectedPp.toFixed(2)),
observed_noise_ratio: Number(m.ratio.toFixed(2)),
source: m.source,
+ baseline_sha: m.baselineSha ?? null,
};
- // Cross-run fields only populated when history has data for this metric.
- // Agents can detect absence vs "no peak" via missing key, not null.
if (m.historyStatus) {
out.history_status = m.historyStatus.status;
out.peak = m.historyStatus.peak;
out.delta_from_peak_pct = m.historyStatus.delta_from_peak_pct;
out.bisect_candidates = m.historyStatus.bisect_candidates;
+ if (m.historyStatus.drift?.detected) {
+ out.drift = m.historyStatus.drift;
+ }
}
return out;
}
/**
- * Markdown renderer — implements the rubric at ai/workspace/tmp/bench-reporter-rubric.md.
- * Layout:
+ * Markdown renderer. Layout:
* 1. Top header: h3 with state emoji + commit link + "on Benchmark Suite 📊"
* 2. Metadata line: Base · Action · Raw (bench-report.json link)
* 3. GitHub alert block with verdict copy
@@ -312,7 +324,6 @@ function renderMarkdown(report) {
`🔍 ${unsureTotal} unsure`,
`⚪ ${noChange.length} no change`,
];
- // Surface REOPENED count in the headline when history has flagged any.
const reopenedCount = report.history_summary?.REOPENED ?? 0;
if (reopenedCount > 0) {
resultsParts.push(`📜 ${reopenedCount} reopened`);
@@ -333,9 +344,6 @@ function renderMarkdown(report) {
}
// ─── Regressions from peak (cross-run; auto-expanded when present) ───
- // Load-bearing signal — a metric better on a prior commit is always a
- // cherry-pick candidate. Sits alongside Faster/Slower, not hidden in a
- // collapsible, because a reviewer should never miss this.
renderRegressionsFromPeak(lines, report);
// ─── No Change (always collapsed) ────────────────────────────────────
@@ -422,33 +430,43 @@ function renderMarkdown(report) {
/**
* Append a "Regressions from peak" section when one or more metrics are
- * REOPENED (current CI dominated by a prior commit's CI). Actionable signal:
- * the metric was once better and this PR — or a commit before it — gave
- * that improvement back. Skipped entirely when history is empty or no
- * REOPENED metrics exist.
+ * REOPENED (current pct-delta dominated by a prior iteration's pct-delta).
+ * Actionable signal: the metric was once better and this PR — or a commit
+ * before it — gave that improvement back.
+ *
+ * Surface units are within-session percent-deltas (the pct-delta this run
+ * achieved vs its baseline; the pct-delta peak achieved vs ITS baseline).
+ * `delta_from_peak_pct` is the difference between those two midpoints in
+ * percentage points (pp). Drift footnotes fire when peak and current had
+ * different baselines and main moved enough on the metric to plausibly
+ * confound the comparison.
*/
function renderRegressionsFromPeak(lines, report) {
const reopened = report.metrics.filter((m) => m.history_status === 'REOPENED');
if (reopened.length === 0) { return; }
- // Sort by severity — largest delta-from-peak first.
- const sorted = [...reopened].sort((a, b) => (b.delta_from_peak_pct ?? 0) - (a.delta_from_peak_pct ?? 0));
+ const sorted = [...reopened].sort(
+ (a, b) => (b.delta_from_peak_pct ?? 0) - (a.delta_from_peak_pct ?? 0),
+ );
lines.push(`#### 📜 Regressions from peak (${reopened.length})`);
lines.push('');
lines.push(
- `These metrics were better on a prior commit than they are now. The peak CI dominates current CI — not attributable to per-sample noise. Bisect candidates are the commits between the peak and HEAD; nearest-to-peak is usually the best bet.`,
+ `These metrics were better on a prior iteration than they are now. The peak's percent-delta vs its baseline dominates current's percent-delta vs its baseline — not attributable to per-sample noise. Bisect candidates are the commits between the peak iteration and HEAD; nearest-to-peak is usually the best bet.`,
);
lines.push('');
lines.push('| metric | current | peak | vs peak | bisect candidates |');
lines.push('|---|---|---|---|---|');
+
+ const flagged = [];
+
for (const m of sorted) {
- const currentMid = (m.mean_ms ?? ((m.this_change_ms_ci[0] + m.this_change_ms_ci[1]) / 2));
- const peakMid = m.peak.mean_ms;
- const deltaStr = m.delta_from_peak_pct > 0
- ? `+${m.delta_from_peak_pct.toFixed(0)}%`
- : `${m.delta_from_peak_pct.toFixed(0)}%`;
+ const currentStr = formatSignedPct(mid(m.percent_change_ci));
+ const peakStr = formatSignedPct(mid(m.peak.percent_delta_ci));
const peakLink = commitOrPrLink(m.peak, report.repo);
+ const deltaStr = m.delta_from_peak_pct > 0
+ ? `regressed +${m.delta_from_peak_pct.toFixed(0)}pp`
+ : `${m.delta_from_peak_pct.toFixed(0)}pp`;
const bisectMd = (m.bisect_candidates ?? [])
.slice(0, BISECT_MARKDOWN_MAX)
.map((c) => commitOrPrLink(c, report.repo))
@@ -456,13 +474,65 @@ function renderRegressionsFromPeak(lines, report) {
const bisectCell = m.bisect_candidates && m.bisect_candidates.length > BISECT_MARKDOWN_MAX
? `${bisectMd} +${m.bisect_candidates.length - BISECT_MARKDOWN_MAX} more`
: bisectMd || '—';
+
+ // Fires on threshold breach or chain-gap (magnitude unavailable).
+ let driftFlag = '';
+ if (m.drift?.detected) {
+ const mag = m.drift.magnitude;
+ const fires = mag === null || Math.abs(mag) >= DRIFT_THRESHOLD_PP;
+ if (fires) {
+ const idx = flagged.length + 1;
+ driftFlag = ` ⚠️${idx}`;
+ flagged.push({
+ idx,
+ metric: m.name,
+ drift: m.drift,
+ currentBaseline: m.baseline_sha,
+ peakBaseline: m.peak.baseline_sha,
+ });
+ }
+ }
+
lines.push(
- `| ${metricLink(m, report)} | ${currentMid.toFixed(1)}ms | ${
- peakMid.toFixed(1)
- }ms @ ${peakLink} | ${deltaStr} | ${bisectCell} |`,
+ `| ${
+ metricLink(m, report)
+ } | ${currentStr}${driftFlag} | ${peakStr} @ ${peakLink} | ${deltaStr} | ${bisectCell} |`,
);
}
lines.push('');
+
+ if (flagged.length > 0) {
+ for (const f of flagged) {
+ lines.push(formatDriftFootnote(f));
+ }
+ lines.push('');
+ }
+}
+
+/**
+ * Format a percent value as a whole-number string with an explicit `+` on
+ * positive values, e.g. `+12%`, `-3%`.
+ *
+ * Values that round to zero render as plain `0%`; formatting the
+ * `toFixed(0)` string directly would print `-0%` for small negatives and
+ * `+0%` for small positives.
+ */
+function formatSignedPct(pct) {
+  // Number() turns the "-0" string into negative zero, which a template
+  // literal prints as "0".
+  const rounded = Number(pct.toFixed(0));
+  return rounded > 0 ? `+${rounded}%` : `${rounded}%`;
+}
+
+/**
+ * Render one drift footnote line for the "Regressions from peak" table.
+ *
+ * Takes a single record with:
+ *   idx             — footnote number printed after the ⚠️ marker
+ *   metric          — metric name, rendered in backticks
+ *   drift           — object with `magnitude`, `chain_len` and `missing`
+ *   currentBaseline — baseline SHA of the current run
+ *   peakBaseline    — baseline SHA of the peak entry
+ *
+ * Returns a markdown string. A non-null `drift.magnitude` produces the
+ * quantified wording; otherwise the "magnitude unavailable" wording is
+ * used, with a detail explaining how much of the chain was missing.
+ */
+function formatDriftFootnote({ idx, metric, drift, currentBaseline, peakBaseline }) {
+ // SHAs are shortened to 7 characters; a falsy SHA renders as '?'.
+ const peakSha = peakBaseline ? peakBaseline.slice(0, 7) : '?';
+ const currentSha = currentBaseline ? currentBaseline.slice(0, 7) : '?';
+ if (drift.magnitude !== null) {
+ // Negative magnitudes already carry their '-' from toFixed().
+ const sign = drift.magnitude > 0 ? '+' : '';
+ const links = drift.chain_len === 1 ? '1 main commit' : `${drift.chain_len} main commits`;
+ return (
+ `⚠️${idx} main moved ${sign}${drift.magnitude.toFixed(0)}pp on \`${metric}\` `
+ + `between baselines (\`${peakSha}\` → \`${currentSha}\`, chained across ${links}). `
+ + `Comparison may include main-side change.`
+ );
+ }
+ // Chain size = entries that had a percent_delta_ci plus entries that did not.
+ const total = drift.chain_len + drift.missing;
+ const detail = total === 0
+ ? '0 entries available in chain'
+ : `${drift.missing}/${total} entries missing percent_delta_ci`;
+ return (
+ `⚠️${idx} main moved between baselines on \`${metric}\` (\`${peakSha}\` → \`${currentSha}\`); `
+ + `drift magnitude unavailable (${detail}). Comparison may include main-side change.`
+ );
+}
/**
@@ -686,27 +756,32 @@ function mergeHistories(mainHist, prHist) {
...(mainHist?.commits ?? []),
...(prHist?.commits ?? []),
];
- // Deduplicate by SHA (same commit shouldn't appear twice)
const seen = new Set();
const deduped = commits.filter((c) => {
if (seen.has(c.sha)) { return false; }
seen.add(c.sha);
return true;
});
- // Chronological order so bisect candidates are in causal sequence
+ // Chronological so bisect candidates land in causal order.
deduped.sort((a, b) => (a.timestamp ?? '').localeCompare(b.timestamp ?? ''));
- return { schema_version: 1, commits: deduped };
+ return { schema_version: 2, commits: deduped };
}
/**
- * Load bench-history.json. Returns null if missing/empty/invalid — the
- * reporter's D3b features all gracefully degrade on a null history.
+ * Load bench-history.json. Returns null on missing/empty/invalid — peak
+ * attribution gracefully degrades on null. Schema v2 only.
+ *
+ * Sort on read because `computeBaselineDrift` walks `commits` by index,
+ * and the rebase-retry path on push-to-main can land entries with
+ * non-monotonic timestamps.
*/
function loadHistory(filePath) {
if (!filePath || !fs.existsSync(filePath)) { return null; }
try {
const parsed = JSON.parse(fs.readFileSync(filePath, 'utf8'));
- if (parsed.schema_version !== 1 || !Array.isArray(parsed.commits)) { return null; }
+ if (parsed.schema_version !== 2) { return null; }
+ if (!Array.isArray(parsed.commits)) { return null; }
+ parsed.commits.sort((a, b) => (a.timestamp ?? '').localeCompare(b.timestamp ?? ''));
return parsed;
}
catch {
@@ -715,56 +790,67 @@ function loadHistory(filePath) {
}
/**
- * Compute per-metric cross-run status against the history file.
+ * Per-metric cross-run status against `peakHist`.
+ *
+ * Peak = the entry with the most-negative `percent_delta_ci` upper bound:
+ * the iteration that produced the largest improvement vs its own baseline.
+ * On exact ties, prefer the more recent entry (`commits` is chronological).
*
- * Peak = the commit whose CI upper bound is lowest (smallest time = best
- * perf). On exact ties, prefer the most recent entry (history is ordered
- * chronologically by append order).
+ * Status compares within-session percent-delta CIs at both ends. Cross-
+ * session absolute-ms compare would mix per-session environmental variance:
+ * WIN — current pct-delta CI dominates peak's (current high < peak low)
+ * REOPENED — peak pct-delta CI dominates current's (peak high < current low)
+ * TIED-PEAK — CIs overlap
*
- * Status taxonomy (current CI = this-change absolute ms):
- * WIN — current CI dominates peak (current high < peak low)
- * REOPENED — peak CI dominates current (peak high < current low)
- * TIED-PEAK — CIs overlap (no dominance either way)
+ * Drift detection runs against `driftHist` (always main-history) regardless
+ * of the peak scope. See `computeBaselineDrift`.
*
- * Returns null when history is null/empty OR this metric has no prior
- * entries (new bench). The reporter's caller distinguishes these cases
- * via the `history_available` summary flag and per-metric presence.
+ * Returns null when this metric has no prior entries with `percent_delta_ci`
+ * — new bench, no comparable history yet.
*/
-function computeHistoryStatus(metric, hist) {
- if (!hist || hist.commits.length === 0) { return null; }
- const entries = hist.commits.filter((c) => c.metrics && metric.name in c.metrics);
+function computeHistoryStatus(metric, peakHist, driftHist) {
+ if (!peakHist || peakHist.commits.length === 0) { return null; }
+ const entries = peakHist.commits.filter(
+ (c) => c.metrics?.[metric.name]?.percent_delta_ci,
+ );
if (entries.length === 0) { return null; }
- // Pick peak. Tie-break: newer commit wins (commits array is chronological).
+ // Pick peak. Tie-break: newer commit wins.
let peakIdx = 0;
for (let i = 1; i < entries.length; i++) {
- const candHigh = entries[i].metrics[metric.name].ci[1];
- const peakHigh = entries[peakIdx].metrics[metric.name].ci[1];
- if (candHigh < peakHigh) { peakIdx = i; }
- else if (candHigh === peakHigh) { peakIdx = i; }
+ const candHigh = entries[i].metrics[metric.name].percent_delta_ci[1];
+ const peakHigh = entries[peakIdx].metrics[metric.name].percent_delta_ci[1];
+ if (candHigh <= peakHigh) { peakIdx = i; }
}
const peakEntry = entries[peakIdx];
const peakMetric = peakEntry.metrics[metric.name];
- const currentCi = metric.thisChangeMs;
- const peakCi = peakMetric.ci;
+ const currentPctCi = metric.percentDelta;
+ const peakPctCi = peakMetric.percent_delta_ci;
let status;
- if (currentCi[1] < peakCi[0]) { status = 'WIN'; }
- else if (peakCi[1] < currentCi[0]) { status = 'REOPENED'; }
+ if (currentPctCi[1] < peakPctCi[0]) { status = 'WIN'; }
+ else if (peakPctCi[1] < currentPctCi[0]) { status = 'REOPENED'; }
else { status = 'TIED-PEAK'; }
- // Bisect candidates: commits between peak and HEAD of history that
- // contain this metric. Oldest-to-newest so the reviewer/agent sees
- // the timeline in causal order.
- const peakHistIdx = hist.commits.findIndex((c) => c.sha === peakEntry.sha);
- const bisectCandidates = hist.commits
+ // Bisect candidates: commits after peak that also report this metric.
+ const peakHistIdx = peakHist.commits.findIndex((c) => c.sha === peakEntry.sha);
+ const bisectCandidates = peakHist.commits
.slice(peakHistIdx + 1)
- .filter((c) => c.metrics && metric.name in c.metrics)
+ .filter((c) => c.metrics?.[metric.name]?.percent_delta_ci)
.map((c) => ({ sha: c.sha, msg: c.msg, pr: c.pr ?? null }));
- const currentMid = (currentCi[0] + currentCi[1]) / 2;
- const peakMid = (peakCi[0] + peakCi[1]) / 2;
- const deltaFromPeakPct = peakMid > 0 ? ((currentMid - peakMid) / peakMid) * 100 : 0;
+ // Difference in percentage points (not %): current's pct-delta midpoint
+ // minus peak's. Positive reads as "regressed N pp of improvement."
+ const currentMid = (currentPctCi[0] + currentPctCi[1]) / 2;
+ const peakMid = (peakPctCi[0] + peakPctCi[1]) / 2;
+ const deltaFromPeakPct = currentMid - peakMid;
+
+ const drift = computeBaselineDrift(
+ metric.name,
+ metric.baselineSha,
+ peakMetric.baseline_sha,
+ driftHist,
+ );
return {
status,
@@ -772,11 +858,72 @@ function computeHistoryStatus(metric, hist) {
sha: peakEntry.sha,
msg: peakEntry.msg,
pr: peakEntry.pr ?? null,
- ci: peakCi,
+ ci: peakMetric.ci,
mean_ms: peakMetric.mean_ms,
+ percent_delta_ci: peakPctCi,
+ baseline_sha: peakMetric.baseline_sha ?? null,
},
delta_from_peak_pct: Number(deltaFromPeakPct.toFixed(2)),
bisect_candidates: bisectCandidates,
+ drift,
+ };
+}
+
+/**
+ * Quantify cumulative main-side drift between two baselines on a metric by
+ * walking the chain of main commits between them and combining their
+ * within-session percent-deltas. Subtracting absolute ms across sessions
+ * would mix per-session environmental variance into the result.
+ *
+ * The chain is every `hist.commits` entry after the older baseline up to
+ * and including the newer one, so `hist.commits` must be in chronological
+ * order. Deltas combine multiplicatively: ∏(1 + pct_i) − 1, using the
+ * midpoint of each entry's `percent_delta_ci`.
+ *
+ * @param metricName metric key looked up in each commit's `metrics`
+ * @param currentBaselineSha baseline SHA of the current run (falsy when unknown)
+ * @param peakBaselineSha baseline SHA recorded on the peak entry (falsy when unknown)
+ * @param hist main-history object (`{ commits: [...] }`) or null
+ * @returns one of:
+ *   { detected: false }
+ *     — baselines match, or one/both are unknown
+ *   { detected: true, magnitude: N, chain_len: K, missing: 0 }
+ *     — every link quantified; N in pp, positive = main got slower
+ *   { detected: true, magnitude: null, chain_len: K, missing: M }
+ *     — history is empty or a baseline is not in it (K = M = 0), or at
+ *       least one link in the chain lacks `percent_delta_ci`
+ */
+function computeBaselineDrift(metricName, currentBaselineSha, peakBaselineSha, hist) {
+  if (!currentBaselineSha || !peakBaselineSha) {
+    return { detected: false };
+  }
+  if (currentBaselineSha === peakBaselineSha) {
+    return { detected: false };
+  }
+  if (!hist || hist.commits.length === 0) {
+    return { detected: true, magnitude: null, chain_len: 0, missing: 0 };
+  }
+  const idxCurrent = hist.commits.findIndex((c) => c.sha === currentBaselineSha);
+  const idxPeak = hist.commits.findIndex((c) => c.sha === peakBaselineSha);
+  if (idxCurrent < 0 || idxPeak < 0) {
+    return { detected: true, magnitude: null, chain_len: 0, missing: 0 };
+  }
+  const lo = Math.min(idxCurrent, idxPeak);
+  const hi = Math.max(idxCurrent, idxPeak);
+  // Commits AFTER the older baseline through the newer one — their deltas are
+  // what accumulated between the two baselines.
+  const chain = hist.commits.slice(lo + 1, hi + 1);
+  let chainLen = 0;
+  let missing = 0;
+  let cumulative = 1;
+  for (const c of chain) {
+    const m = c.metrics?.[metricName];
+    if (!m?.percent_delta_ci) {
+      missing++;
+      continue;
+    }
+    chainLen++;
+    const pctMid = (m.percent_delta_ci[0] + m.percent_delta_ci[1]) / 2;
+    cumulative *= 1 + pctMid / 100;
+  }
+  // A gap anywhere in the chain makes the product a partial figure, so it
+  // is reported as unavailable instead of as a number that silently omits
+  // the missing links.
+  if (chainLen === 0 || missing > 0) {
+    return { detected: true, magnitude: null, chain_len: chainLen, missing };
+  }
+  const magnitudePp = (cumulative - 1) * 100;
+  return {
+    detected: true,
+    magnitude: Number(magnitudePp.toFixed(2)),
+    chain_len: chainLen,
+    missing,
+  };
+}
@@ -811,14 +958,6 @@ function round4([low, high]) {
return [Number(low.toFixed(4)), Number(high.toFixed(4))];
}
-function* walk(dir) {
- for (const ent of fs.readdirSync(dir, { withFileTypes: true })) {
- const full = path.join(dir, ent.name);
- if (ent.isDirectory()) { yield* walk(full); }
- else { yield full; }
- }
-}
-
function parseArgs(argv) {
const out = {};
for (let i = 0; i < argv.length; i++) {
diff --git a/tools/ci/bench/reporter/reporter.test.js b/tools/ci/bench/reporter/reporter.test.js
index 895e66735..d9f22495d 100644
--- a/tools/ci/bench/reporter/reporter.test.js
+++ b/tools/ci/bench/reporter/reporter.test.js
@@ -28,6 +28,8 @@ function runReporter({
repo = '',
wallClock = '',
history = '',
+ prHistory = '',
+ scope = '',
resultsDir = null,
}) {
const tmp = fs.mkdtempSync(path.join('/tmp', 'bench-report-test-'));
@@ -55,6 +57,8 @@ function runReporter({
// resolves to /tools/ci/bench/reporter/bench-history.json;
// tests opt in to a specific fixture.
if (history) { argv.push('--history', history); }
+ if (prHistory) { argv.push('--pr-history', prHistory); }
+ if (scope) { argv.push('--scope', scope); }
// Run from repo root so resolveMetricSource can find packages/.../bench/tachometer
const cwd = path.resolve(__dirname, '..', '..', '..', '..');
execFileSync('node', argv, { stdio: ['ignore', 'pipe', 'inherit'], cwd });
@@ -63,30 +67,55 @@ function runReporter({
return { report, markdown };
}
+function writeFixture(content) { // Test helper: serialize `content` as JSON into a fresh temp dir.
+ const tmp = fs.mkdtempSync('/tmp/bench-fixture-');
+ const filePath = path.join(tmp, 'history.json');
+ fs.writeFileSync(filePath, JSON.stringify(content));
+ return filePath; // path to history.json; the temp dir is not removed here
+}
+
/**
* Build a tiny tachometer JSON fixture with one metric whose this-change
- * mean CI is the caller-specified range. Used to force specific cross-run
- * outcomes against fixtures/history-sample.json.
+ * mean CI and percent-delta CI are caller-specified. Used to force specific
+ * cross-run outcomes against fixtures/history-sample.json.
+ *
+ * `pctDeltaCi` is the within-session percent-delta vs tip-of-tree — the
+ * number cross-iteration peak attribution operates on. If `baselineSha`
+ * is non-empty, also writes the baseline-sha.txt sidecar (so the reporter
+ * picks up the current run's baseline for drift detection). Returns the dir.
*/
-function writeHandcraftedResults(metricName, thisChangeCi, tipOfTreeCi = [10, 11]) {
+function writeHandcraftedResults(
+ metricName,
+ thisChangeCi,
+ tipOfTreeCi = [10, 11],
+ pctDeltaCi = [-5, 5], // [low, high] in percent; default equals the previously hard-coded CI
+ baselineSha = '',
+) {
const dir = fs.mkdtempSync('/tmp/bench-reporter-handcraft-');
+ const diff = { // one object, referenced by both benchmark entries below
+ absolute: { low: -1, high: 1 },
+ percentChange: { low: pctDeltaCi[0], high: pctDeltaCi[1] },
+ };
const data = {
benchmarks: [
{
name: `this-change [${metricName}]`,
measurement: { name: metricName, mode: 'performance', entryName: metricName },
mean: { low: thisChangeCi[0], high: thisChangeCi[1] },
- differences: [null, { absolute: { low: -1, high: 1 }, percentChange: { low: -5, high: 5 } }],
+ differences: [null, diff],
},
{
name: `tip-of-tree [${metricName}]`,
measurement: { name: metricName, mode: 'performance', entryName: metricName },
mean: { low: tipOfTreeCi[0], high: tipOfTreeCi[1] },
- differences: [{ absolute: { low: -1, high: 1 }, percentChange: { low: -5, high: 5 } }, null],
+ differences: [diff, null], // same diff as this-change, not sign-mirrored
},
],
};
fs.writeFileSync(path.join(dir, 'handcrafted.json'), JSON.stringify(data));
+ if (baselineSha) {
+ fs.writeFileSync(path.join(dir, 'baseline-sha.txt'), baselineSha);
+ }
return dir;
}
@@ -193,8 +222,9 @@ test('real-delta fixture — rubric markdown structure', () => {
// Rows auto-expanded (≤ 15) — no teaser pattern
assert.ok(!markdown.includes('top 5 shown'));
- // Metric source links present (paths resolved to bench.js / bench-todo.js at given SHA)
- assert.ok(/\/blob\/abcdef012345678\/packages\/renderer\/bench\/tachometer\/bench[-\w]*\.js#L\d+/.test(markdown));
+ // Metric source links resolve to bench-*.js. Match any package — the suite
+ // layout isn't pinned and reorganizes over time. (`\w+` excludes hyphenated names.)
+ assert.ok(/\/blob\/abcdef012345678\/packages\/\w+\/bench\/tachometer\/bench[-\w]*\.js#L\d+/.test(markdown));
// Wall-clock footer
assert.ok(markdown.includes('Wall-clock: 10m42s'));
@@ -307,10 +337,10 @@ test('severity emoji suffix placement', () => {
const HISTORY_FIXTURE = path.join(__dirname, 'fixtures', 'history-sample.json');
-test('cross-run: WIN when current CI dominates historical peak', () => {
- // history-sample.json has update-10th peak CI [6.0, 7.0] at bbbb...
- // Current CI [4.5, 5.5] is entirely below → WIN
- const dir = writeHandcraftedResults('update-10th', [4.5, 5.5]);
+test('cross-run: WIN when current pct-delta dominates historical peak', () => {
+ // history-sample.json's update-10th peak percent_delta_ci is [-40, -35] at bbbb.
+ // Current pct-delta [-50, -45] is more negative across the whole range → WIN.
+ const dir = writeHandcraftedResults('update-10th', [4.5, 5.5], [10, 11], [-50, -45]);
const { report, markdown } = runReporter({
resultsDir: dir,
sha: 'feedbeef',
@@ -320,8 +350,8 @@ test('cross-run: WIN when current CI dominates historical peak', () => {
});
const m = report.metrics.find((x) => x.name === 'update-10th');
assert.equal(m.history_status, 'WIN');
- assert.equal(m.peak.sha, 'bbbb222222222222', 'peak commit is the best prior entry');
- assert.ok(m.delta_from_peak_pct < 0, 'delta is negative when current is faster than peak');
+ assert.equal(m.peak.sha, 'bbbb222222222222', 'peak entry is the most-improved prior');
+ assert.ok(m.delta_from_peak_pct < 0, 'delta_from_peak_pct is negative when current dominates peak');
assert.ok(!markdown.includes('Regressions from peak'), 'no reopened section when no REOPENED');
});
@@ -329,6 +359,7 @@ test('cross-run: peak links to PR conversation when PR number is known', () => {
// Peak has pr: 102 in the fixture → peak SHA in markdown should link
// to /pull/102, not /commit/. Takes reviewers directly to the
// bench comment from that PR (which is where the peak value came from).
+ // Default current pct-delta [-5, 5] vs peak [-40, -35] triggers REOPENED.
const dir = writeHandcraftedResults('update-10th', [20.0, 21.0]);
const { report, markdown } = runReporter({
resultsDir: dir,
@@ -349,9 +380,9 @@ test('cross-run: peak links to PR conversation when PR number is known', () => {
assert.ok(markdown.includes('](https://github.com/Semantic-Org/Semantic-Next/pull/104)'));
});
-test('cross-run: REOPENED when a prior commit dominates current', () => {
- // history-sample.json peak for update-10th = CI [6.0, 7.0]
- // Current CI [20.0, 21.0] is entirely above → REOPENED
+test('cross-run: REOPENED when a prior iteration dominates current', () => {
+ // history-sample.json peak for update-10th = pct-delta [-40, -35] at bbbb.
+ // Default current pct-delta [-5, 5] is entirely above peak's range → REOPENED.
const dir = writeHandcraftedResults('update-10th', [20.0, 21.0]);
const { report, markdown } = runReporter({
resultsDir: dir,
@@ -364,7 +395,11 @@ test('cross-run: REOPENED when a prior commit dominates current', () => {
const m = report.metrics.find((x) => x.name === 'update-10th');
assert.equal(m.history_status, 'REOPENED');
assert.equal(m.peak.sha, 'bbbb222222222222');
- assert.ok(m.delta_from_peak_pct > 50, `delta should be large positive when reopened (got ${m.delta_from_peak_pct})`);
+ // Current midpoint 0pp - peak midpoint -37.5pp = +37.5pp regressed from peak.
+ assert.ok(
+ m.delta_from_peak_pct > 30,
+ `delta_from_peak_pct should be large positive (got ${m.delta_from_peak_pct})`,
+ );
// Bisect candidates = commits after peak (cccc, dddd)
const bisectShas = m.bisect_candidates.map((c) => c.sha);
@@ -376,10 +411,10 @@ test('cross-run: REOPENED when a prior commit dominates current', () => {
assert.ok(markdown.includes('📜 1 reopened'), 'headline count includes reopened');
});
-test('cross-run: TIED-PEAK when CIs overlap', () => {
- // history peak for update-10th = [6.0, 7.0]
- // Current CI [6.5, 7.5] overlaps → TIED-PEAK
- const dir = writeHandcraftedResults('update-10th', [6.5, 7.5]);
+test('cross-run: TIED-PEAK when pct-delta CIs overlap', () => {
+ // history peak for update-10th = pct-delta [-40, -35]
+ // Current pct-delta [-38, -33] overlaps → TIED-PEAK
+ const dir = writeHandcraftedResults('update-10th', [6.5, 7.5], [10, 11], [-38, -33]);
const { report } = runReporter({
resultsDir: dir,
sha: 'aaaa',
@@ -391,6 +426,231 @@ test('cross-run: TIED-PEAK when CIs overlap', () => {
assert.equal(m.history_status, 'TIED-PEAK');
});
+test('--scope pr excludes main-history from peak attribution', () => {
+ // Default current pct-delta [-5, 5] would be REOPENED vs main-history's
+ // peak [-40, -35]. With --scope pr and an empty PR-iteration history,
+ // peak attribution sees no entries → no history_status, no REOPENED
+ // section in markdown.
+ const dir = writeHandcraftedResults('update-10th', [20, 21]);
+ const emptyPrHistory = writeFixture({ schema_version: 2, commits: [] });
+ const { report, markdown } = runReporter({
+ resultsDir: dir,
+ sha: 'abc',
+ msg: 'x',
+ baseSha: 'def',
+ history: HISTORY_FIXTURE, // main-history is supplied, yet must not feed peak attribution
+ prHistory: emptyPrHistory,
+ scope: 'pr',
+ });
+ const m = report.metrics.find((x) => x.name === 'update-10th');
+ assert.ok(!('history_status' in m), 'no peak attribution under --scope pr with empty PR history'); // key must be absent, not merely falsy
+ assert.ok(!markdown.includes('Regressions from peak'), 'no REOPENED section');
+ assert.equal(report.scope, 'pr', 'scope echoed to JSON adjunct');
+});
+
+test('--scope pr uses PR-iteration history when present', () => {
+ // PR-iteration history has its own "peak" at about -30% ([-32, -28]); current is the default [-5, 5] → REOPENED.
+ // Main-history's [-40, -35] peak is excluded under --scope pr.
+ const dir = writeHandcraftedResults('update-10th', [10, 11]);
+ const prHistoryFile = writeFixture({
+ schema_version: 2,
+ commits: [{
+ sha: 'pr-iter-1234567',
+ msg: 'iteration 1',
+ parent_sha: '',
+ timestamp: '2026-04-20T00:00:00Z',
+ pr: null,
+ metrics: {
+ 'update-10th': {
+ ci: [9, 10],
+ mean_ms: 9.5,
+ percent_delta_ci: [-32, -28],
+ baseline_sha: 'main-tip-12345',
+ },
+ },
+ }],
+ });
+ const { report } = runReporter({
+ resultsDir: dir,
+ sha: 'abc',
+ msg: 'x',
+ baseSha: 'def',
+ history: HISTORY_FIXTURE,
+ prHistory: prHistoryFile,
+ scope: 'pr',
+ });
+ const m = report.metrics.find((x) => x.name === 'update-10th');
+ assert.equal(m.history_status, 'REOPENED', 'REOPENED against PR-iteration peak only');
+ assert.equal(m.peak.sha, 'pr-iter-1234567', 'peak from PR history, not main-history');
+});
+
+test('drift flag fires with magnitude when chain is quantifiable', () => {
+ // Setup: current's baseline is mainB; PR-iteration peak's baseline is mainA.
+ // main-history has commits between mainA and mainB whose percent_delta_ci
+ // values combine to >5pp drift.
+ const driftHistory = writeFixture({
+ schema_version: 2,
+ commits: [
+ {
+ sha: 'mainA',
+ msg: 'main A',
+ parent_sha: '',
+ timestamp: '2026-04-15T00:00:00Z',
+ pr: null,
+ metrics: {
+ 'update-10th': { ci: [10, 11], mean_ms: 10.5 }, // no percent_delta_ci: the older baseline itself is not part of the chain
+ },
+ },
+ {
+ sha: 'main-mid',
+ msg: 'main mid commit',
+ parent_sha: 'mainA',
+ timestamp: '2026-04-16T00:00:00Z',
+ pr: null,
+ metrics: {
+ 'update-10th': {
+ ci: [10.5, 11.5],
+ mean_ms: 11,
+ percent_delta_ci: [4, 6],
+ baseline_sha: 'mainA',
+ },
+ },
+ },
+ {
+ sha: 'mainB',
+ msg: 'main B',
+ parent_sha: 'main-mid',
+ timestamp: '2026-04-17T00:00:00Z',
+ pr: null,
+ metrics: {
+ 'update-10th': {
+ ci: [11, 12],
+ mean_ms: 11.5,
+ percent_delta_ci: [3, 5],
+ baseline_sha: 'main-mid',
+ },
+ },
+ },
+ ],
+ });
+ // PR-iteration peak: pct-delta [-30, -25] vs mainA. Best so far.
+ const prHistory = writeFixture({
+ schema_version: 2,
+ commits: [{
+ sha: 'pr-best-iter',
+ msg: 'best iteration',
+ parent_sha: '',
+ timestamp: '2026-04-15T01:00:00Z',
+ pr: null,
+ metrics: {
+ 'update-10th': {
+ ci: [7, 8],
+ mean_ms: 7.5,
+ percent_delta_ci: [-30, -25],
+ baseline_sha: 'mainA',
+ },
+ },
+ }],
+ });
+ // Current: pct-delta [-5, 0] vs mainB. Worse than peak → REOPENED + drift.
+ const dir = writeHandcraftedResults('update-10th', [11, 12], [11.5, 12.5], [-5, 0], 'mainB');
+ const { report, markdown } = runReporter({
+ resultsDir: dir,
+ sha: 'current',
+ msg: 'x',
+ baseSha: 'def',
+ history: driftHistory,
+ prHistory,
+ scope: 'pr',
+ });
+ const m = report.metrics.find((x) => x.name === 'update-10th');
+ assert.equal(m.history_status, 'REOPENED');
+ assert.ok(m.drift?.detected, 'drift detected');
+ assert.ok(m.drift.magnitude !== null, 'magnitude available');
+ // Chain: main-mid (+5pp midpoint) × mainB (+4pp midpoint): 1.05 × 1.04 − 1 ≈ 9.2pp.
+ assert.ok(m.drift.magnitude > 5, `drift magnitude > 5pp (got ${m.drift.magnitude})`);
+ assert.equal(m.drift.chain_len, 2, 'two chain links');
+ assert.equal(m.drift.missing, 0);
+ // Markdown footnote rendered
+ assert.ok(/⚠️1 main moved \+\d+pp/.test(markdown), 'drift footnote with magnitude');
+ assert.ok(markdown.includes('chained across 2 main commits'));
+});
+
+test('drift flag binary-only when main-history is empty', () => {
+ // Current and peak baselines differ, but main-history is empty, so there is
+ // no chain to walk → detected: true, magnitude: null. Footnote has no number.
+ const prHistory = writeFixture({
+ schema_version: 2,
+ commits: [{
+ sha: 'pr-best',
+ msg: 'best',
+ parent_sha: '',
+ timestamp: '2026-04-15T00:00:00Z',
+ pr: null,
+ metrics: {
+ 'update-10th': {
+ ci: [7, 8],
+ mean_ms: 7.5,
+ percent_delta_ci: [-30, -25],
+ baseline_sha: 'mainA',
+ },
+ },
+ }],
+ });
+ const emptyMain = writeFixture({ schema_version: 2, commits: [] });
+ const dir = writeHandcraftedResults('update-10th', [11, 12], [11.5, 12.5], [-5, 0], 'mainB-different');
+ const { report, markdown } = runReporter({
+ resultsDir: dir,
+ sha: 'current',
+ msg: 'x',
+ baseSha: 'def',
+ history: emptyMain, // zero commits → nothing to chain across
+ prHistory,
+ scope: 'pr',
+ });
+ const m = report.metrics.find((x) => x.name === 'update-10th');
+ assert.equal(m.history_status, 'REOPENED');
+ assert.ok(m.drift?.detected, 'drift detected (binary)');
+ assert.equal(m.drift.magnitude, null, 'magnitude unavailable when chain is empty');
+ assert.ok(/drift magnitude unavailable/.test(markdown), 'gap footnote rendered');
+});
+
+test('no drift flag when baselines match', () => {
+ // Current and peak share a baseline → no drift, no flag, no footnote.
+ const sharedBaseline = 'shared-main-tip';
+ const prHistory = writeFixture({
+ schema_version: 2,
+ commits: [{
+ sha: 'pr-best',
+ msg: 'best',
+ parent_sha: '',
+ timestamp: '2026-04-15T00:00:00Z',
+ pr: null,
+ metrics: {
+ 'update-10th': {
+ ci: [7, 8],
+ mean_ms: 7.5,
+ percent_delta_ci: [-30, -25],
+ baseline_sha: sharedBaseline,
+ },
+ },
+ }],
+ });
+ const dir = writeHandcraftedResults('update-10th', [11, 12], [11.5, 12.5], [-5, 0], sharedBaseline);
+ const { report, markdown } = runReporter({
+ resultsDir: dir,
+ sha: 'current',
+ msg: 'x',
+ baseSha: 'def',
+ prHistory, // no `history` given, so runReporter does not pass --history
+ scope: 'pr',
+ });
+ const m = report.metrics.find((x) => x.name === 'update-10th');
+ assert.equal(m.history_status, 'REOPENED');
+ assert.ok(!m.drift || m.drift.detected === false, 'no drift when baselines match'); // accepts an absent drift field or detected: false
+ assert.ok(!markdown.includes('main moved'), 'no drift footnote');
+});
+
test('cross-run: graceful degrade when history file is missing', () => {
const dir = writeHandcraftedResults('update-10th', [10, 11]);
const { report, markdown } = runReporter({