diff --git a/.github/workflows/baselines.yml b/.github/workflows/baselines.yml
new file mode 100644
index 0000000..78b5442
--- /dev/null
+++ b/.github/workflows/baselines.yml
@@ -0,0 +1,85 @@
+name: Baselines
+
+# Manually-triggered: reseeds benchmark/baselines/baseline-{adreno,mali}.json
+# from the latest Benchmark workflow run on this branch (or a specific run id),
+# then commits the updated files. The next Benchmark run on this branch will
+# compare against the new baselines — turning a previously-red "improvement"
+# or first-real-data PR green.
+on:
+  workflow_dispatch:
+    inputs:
+      run_id:
+        description: "Benchmark run ID to source from (blank = latest on this branch)"
+        required: false
+        type: string
+
+concurrency:
+  group: baselines-${{ github.ref }}
+  cancel-in-progress: false
+
+jobs:
+  update-baselines:
+    name: update-baselines
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      actions: read
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.ref_name }}
+          # Need write token so we can push back.
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Resolve source Benchmark run
+        id: run
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # Hand user-controlled values to the script via env, never by inlining them.
+          RUN_ID: ${{ inputs.run_id }}
+          REF_NAME: ${{ github.ref_name }}
+        run: |
+          id="$RUN_ID"
+          if [ -z "$id" ]; then
+            id=$(gh run list --workflow=benchmark.yml --branch="$REF_NAME" \
+                   --limit=1 --json databaseId --jq '.[0].databaseId')
+            if [ -z "$id" ] || [ "$id" = "null" ]; then
+              echo "::error::No Benchmark run found on branch $REF_NAME. Run Benchmark first."
+              exit 1
+            fi
+          fi
+          # Run IDs are numeric; reject anything else before it reaches $GITHUB_OUTPUT.
+          case "$id" in *[!0-9]*) echo "::error::run_id must be a numeric Benchmark run ID."; exit 1 ;; esac
+          echo "Sourcing baselines from Benchmark run $id"
+          echo "id=$id" >> "$GITHUB_OUTPUT"
+
+      - name: Download adreno results
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: gh run download "${{ steps.run.outputs.id }}" -n benchmark-results-adreno -D adreno/
+
+      - name: Download mali results
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: gh run download "${{ steps.run.outputs.id }}" -n benchmark-results-mali -D mali/
+
+      - name: Overwrite baseline files
+        run: |
+          cp adreno/results-adreno.json benchmark/baselines/baseline-adreno.json
+          cp mali/results-mali.json     benchmark/baselines/baseline-mali.json
+          echo "--- adreno baseline ---"
+          head -20 benchmark/baselines/baseline-adreno.json
+          echo "--- mali baseline ---"
+          head -20 benchmark/baselines/baseline-mali.json
+
+      - name: Commit and push
+        env:
+          RUN_ID: ${{ steps.run.outputs.id }}  # read via env rather than inlined into the script
+        run: |
+          git config user.name  "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          # Stage first, then diff the index against HEAD: this also sees a baseline file that is not tracked yet.
+          git add benchmark/baselines/baseline-adreno.json benchmark/baselines/baseline-mali.json
+          if git diff --cached --quiet; then echo "Baselines already match Benchmark run $RUN_ID — nothing to commit."; exit 0; fi
+          git commit -m "ci: refresh baselines from Benchmark run $RUN_ID"
+          git push origin "HEAD:$GITHUB_REF_NAME"  # GITHUB_REF_NAME is set by the runner
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..f60572c
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,300 @@
+name: Benchmark
+
+on:
+  pull_request:
+
+# Only one benchmark run per PR branch at a time; cancel the stale one.
+concurrency:
+  group: benchmark-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # ── Job 1: build (same as build.yml but runs inside this workflow so the
+  # benchmark jobs can download the artifacts without a cross-workflow lookup) ──
+  build:
+    name: build
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: 17
+
+      - uses: gradle/actions/setup-gradle@v3
+        with:
+          cache-read-only: true
+
+      - name: Build app + androidTest APKs (arm64-v8a only)
+        run: |
+          ./gradlew \
+            :app:assembleRelease \
+            :app:assembleReleaseAndroidTest \
+            -Pandroid.injected.build.abi=arm64-v8a \
+            --stacktrace
+
+      - name: Stage APKs for upload
+        run: |
+          mkdir -p staged-apks
+          find app/build -name "app-release.apk"             -exec cp {} staged-apks/app-release.apk \;
+          find app/build -name "app-release-androidTest.apk" -exec cp {} staged-apks/app-release-androidTest.apk \;
+          ls -lh staged-apks/app-release.apk staged-apks/app-release-androidTest.apk  # fails here if find matched nothing
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: app-release-apk
+          path: staged-apks/app-release.apk
+          retention-days: 1
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: app-release-androidTest-apk
+          path: staged-apks/app-release-androidTest.apk
+          retention-days: 1
+
+  # ── Reusable FTL runner ───────────────────────────────────────────────────────
+  # Two parallel jobs — one per GPU family.  Both download the same APKs from
+  # the build job, run the benchmark on a different FTL device, then compare
+  # against the matching baseline file.
+
+  benchmark-adreno:
+    name: benchmark-adreno
+    needs: build
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/download-artifact@v4
+        with:
+          name: app-release-apk
+          path: apks/
+
+      - uses: actions/download-artifact@v4
+        with:
+          name: app-release-androidTest-apk
+          path: apks/
+
+      - uses: google-github-actions/auth@v2
+        with:
+          credentials_json: ${{ secrets.GCP_SA_KEY }}
+
+      - uses: google-github-actions/setup-gcloud@v2
+
+      - name: Run frame-latency capture on FTL (Galaxy A52s — Adreno 642L)
+        run: |
+          # Spark free tier: 5 physical device-runs/day.
+          # a52sxq = Galaxy A52s 5G, Snapdragon 778G, Adreno 642L.
+          # Picked over redfin (Pixel 5 / Adreno 620) because redfin is locked
+          # to Android 11 on FTL and perfetto's short-form CLI requires API 31+.
+          # --timeout is generous; the test itself runs 5×10 s = 50 s of actual
+          # capture, plus app warm-up and FTL setup overhead (~2 min total).
+          set -o pipefail
+          gcloud firebase test android run \
+            --type instrumentation \
+            --app apks/app-release.apk \
+            --test apks/app-release-androidTest.apk \
+            --device model=a52sxq,version=34,locale=en,orientation=portrait \
+            --timeout 10m \
+            --directories-to-pull /sdcard/Android/media/com.dz.camerafast/additional_test_output \
+            --results-bucket ${{ secrets.GCP_RESULTS_BUCKET }} \
+            --results-dir benchmark-adreno-${{ github.run_id }}-${{ github.run_attempt }} \
+            --test-runner-class androidx.test.runner.AndroidJUnitRunner \
+            --test-targets "class com.dz.camerafast.perf.FrameLatencyCapture" \
+            --environment-variables "additionalTestOutputDir=/sdcard/Android/media/com.dz.camerafast/additional_test_output,dz.iterations=5,dz.duration.ms=10000" \
+            2>&1 | tee ftl-adreno.log
+
+      - name: Pull trace output from GCS
+        run: |
+          # FTL preserves the full on-device path under artifacts/, so our
+          # /sdcard/Android/media/com.dz.camerafast/additional_test_output/
+          # ends up at artifacts/sdcard/Android/media/<pkg>/additional_test_output/.
+          # gsutil cp -r requires the destination dir to exist when source
+          # resolves to multiple files.
+          mkdir -p trace-output-adreno
+          gsutil -m cp -r \
+            "gs://${{ secrets.GCP_RESULTS_BUCKET }}/benchmark-adreno-${{ github.run_id }}-${{ github.run_attempt }}/a52sxq-34-en-portrait/artifacts/sdcard/Android/media/com.dz.camerafast/additional_test_output" \
+            trace-output-adreno/
+
+      - name: Aggregate traces → results.json
+        run: |
+          python3 scripts/aggregate-traces.py \
+            trace-output-adreno \
+            results-adreno.json \
+            --device-model "Galaxy A52s 5G" \
+            --gpu "Adreno 642L" \
+            --ftl-model-id "a52sxq" \
+            --android-sdk 34 \
+            --duration-s 10
+
+      - name: Compare against baseline
+        run: |
+          python3 scripts/compare-baseline.py \
+            benchmark/baselines/baseline-adreno.json \
+            results-adreno.json \
+            --output-md comparison-adreno.md
+
+      - uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: benchmark-results-adreno
+          path: |
+            results-adreno.json
+            comparison-adreno.md
+            trace-output-adreno/
+            ftl-adreno.log
+          retention-days: 14
+
+  benchmark-mali:
+    name: benchmark-mali
+    needs: build
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/download-artifact@v4
+        with:
+          name: app-release-apk
+          path: apks/
+
+      - uses: actions/download-artifact@v4
+        with:
+          name: app-release-androidTest-apk
+          path: apks/
+
+      - uses: google-github-actions/auth@v2
+        with:
+          credentials_json: ${{ secrets.GCP_SA_KEY }}
+
+      - uses: google-github-actions/setup-gcloud@v2
+
+      - name: Run frame-latency capture on FTL (Pixel 6 — Mali-G78)
+        run: |
+          # oriole = Pixel 6, Google Tensor, Mali-G78. Android 13 (API 33) so
+          # perfetto's short-form CLI is available.
+          set -o pipefail
+          gcloud firebase test android run \
+            --type instrumentation \
+            --app apks/app-release.apk \
+            --test apks/app-release-androidTest.apk \
+            --device model=oriole,version=33,locale=en,orientation=portrait \
+            --timeout 10m \
+            --directories-to-pull /sdcard/Android/media/com.dz.camerafast/additional_test_output \
+            --results-bucket ${{ secrets.GCP_RESULTS_BUCKET }} \
+            --results-dir benchmark-mali-${{ github.run_id }}-${{ github.run_attempt }} \
+            --test-runner-class androidx.test.runner.AndroidJUnitRunner \
+            --test-targets "class com.dz.camerafast.perf.FrameLatencyCapture" \
+            --environment-variables "additionalTestOutputDir=/sdcard/Android/media/com.dz.camerafast/additional_test_output,dz.iterations=5,dz.duration.ms=10000" \
+            2>&1 | tee ftl-mali.log
+
+      - name: Pull trace output from GCS
+        run: |
+          mkdir -p trace-output-mali
+          gsutil -m cp -r \
+            "gs://${{ secrets.GCP_RESULTS_BUCKET }}/benchmark-mali-${{ github.run_id }}-${{ github.run_attempt }}/oriole-33-en-portrait/artifacts/sdcard/Android/media/com.dz.camerafast/additional_test_output" \
+            trace-output-mali/
+
+      - name: Aggregate traces → results.json
+        run: |
+          python3 scripts/aggregate-traces.py \
+            trace-output-mali \
+            results-mali.json \
+            --device-model "Pixel 6" \
+            --gpu "Mali-G78" \
+            --ftl-model-id "oriole" \
+            --android-sdk 33 \
+            --duration-s 10
+
+      - name: Compare against baseline
+        run: |
+          python3 scripts/compare-baseline.py \
+            benchmark/baselines/baseline-mali.json \
+            results-mali.json \
+            --output-md comparison-mali.md
+
+      - uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: benchmark-results-mali
+          path: |
+            results-mali.json
+            comparison-mali.md
+            trace-output-mali/
+            ftl-mali.log
+          retention-days: 14
+
+  # ── PR comment with the consolidated p50/p90/p99 delta table ──────────────
+  # Runs after both benchmark jobs regardless of their pass/fail status so a
+  # regression still produces a visible comment (showing which metric tripped).
+  # PR-merge gating remains on the individual benchmark-{adreno,mali} jobs.
+  comment:
+    name: comment
+    needs: [benchmark-adreno, benchmark-mali]
+    if: always() && github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    steps:
+      - uses: actions/download-artifact@v4
+        if: always()
+        continue-on-error: true
+        with:
+          name: benchmark-results-adreno
+          path: adreno/
+
+      - uses: actions/download-artifact@v4
+        if: always()
+        continue-on-error: true
+        with:
+          name: benchmark-results-mali
+          path: mali/
+
+      - name: Build comment body
+        run: |
+          {
+            echo '<!-- benchmark-comment -->'
+            echo '## Frame-latency benchmark'
+            echo
+            echo '### Adreno (Galaxy A52s 5G, Adreno 642L)'
+            if [ -f adreno/comparison-adreno.md ]; then
+              cat adreno/comparison-adreno.md
+            else
+              echo '> ❌ benchmark-adreno did not produce a comparison — see the workflow run for details.'
+            fi
+            echo
+            echo '### Mali (Pixel 6, Mali-G78)'
+            if [ -f mali/comparison-mali.md ]; then
+              cat mali/comparison-mali.md
+            else
+              echo '> ❌ benchmark-mali did not produce a comparison — see the workflow run for details.'
+            fi
+            echo
+            echo '---'
+            echo
+            echo 'To re-seed baselines from this run, manually trigger the **Baselines** workflow under [Actions → Baselines](../../actions/workflows/baselines.yml) and pick this branch as the ref. (Only visible after the workflow file lands on the default branch — GitHub limitation for `workflow_dispatch`.)'
+          } > comment.md
+          echo "--- preview ---"
+          cat comment.md
+
+      - uses: actions/github-script@v7
+        with:
+          script: |
+            // Upsert the single benchmark comment on this PR, found via a hidden
+            // HTML marker, so re-runs edit in place instead of stacking comments.
+            const fs = require('fs');
+            const body = fs.readFileSync('comment.md', 'utf8');
+            const marker = '<!-- benchmark-comment -->';
+            const pr = context.issue.number;
+            // listComments returns one page (30 by default); paginate so the
+            // marker is still found on PRs with long comment threads.
+            const comments = await github.paginate(github.rest.issues.listComments, {
+              ...context.repo, issue_number: pr, per_page: 100,
+            });
+            const existing = comments.find(c => (c.body || '').includes(marker));
+            if (existing) {
+              await github.rest.issues.updateComment({ ...context.repo, comment_id: existing.id, body });
+            } else {
+              await github.rest.issues.createComment({ ...context.repo, issue_number: pr, body });
+            }
diff --git a/.gitignore b/.gitignore
index 214cb5d..c735c7f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,4 +13,5 @@
 .cxx
 **/build
 /.idea
-/.cache
\ No newline at end of file
+/.cache
+.java-version
\ No newline at end of file
diff --git a/CLAUDE.md b/CLAUDE.md
index 9773e77..df4f107 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -75,11 +75,14 @@ Design decisions worth remembering:
 | What you want | Skill / Script |
 |---|---|
 | One-shot frame-latency measurement (single N-second capture) | `scripts/measure-frame-latency.sh [seconds]` |
-| Establish a baseline with dispersion (5 × 10s by default, JSON output) | `scripts/baseline-frame-latency.sh` — invokable via `/frame-latency-baseline` |
+| Establish a local baseline with dispersion (5 × 10s by default, JSON output) | `scripts/baseline-frame-latency.sh` — invokable via `/frame-latency-baseline` |
+| Run the CI capture instrumentation test locally on a tethered device | `./gradlew :app:connectedReleaseAndroidTest -Pandroid.injected.build.abi=arm64-v8a -Pandroid.testInstrumentationRunnerArguments.additionalTestOutputDir=/sdcard/Android/media/com.dz.camerafast/additional_test_output` |
+| Aggregate perfetto traces (from FTL or local connected test) into results.json | `scripts/aggregate-traces.py <traces-dir> <output.json>` |
+| Compare results.json against a per-GPU baseline | `scripts/compare-baseline.py benchmark/baselines/baseline-<gpu>.json results.json` |
 | Build, install, launch, screenshot for visual verification of UI changes | `/verify-on-device` |
 | Discover Android-platform skills (camera, performance, perfetto-sql, etc.) | `vendor/android-skills/` submodule |
 
-All three scripts/skills assume a single ADB device. Set `ANDROID_SERIAL=<serial>` if multiple are attached. They auto-download Perfetto's `trace_processor` to `.cache/frame-latency/` (gitignored, ~25 MB) on first run.
+The bash scripts and the `:app/androidTest` capture (`FrameLatencyCapture`) both emit `.pftrace` files that `scripts/aggregate-traces.py` consumes, so there is one place for stats math. All tools assume a single ADB device locally; set `ANDROID_SERIAL=<serial>` if multiple are attached. `trace_processor` is auto-downloaded to `.cache/frame-latency/` (gitignored, ~25 MB) on first use.
 
 ## Build / install gotchas
 
@@ -118,17 +121,33 @@ OpenGL is ~1.7 ms faster end-to-end on average (`frame_e2e` avg 13.25 vs 14.91),
 **Most of `frame_to_screen` is vsync wait.** `frame_to_screen.gl.avg ≈ 10.9 ms` but only ~0.57 ms of that is actual GL command submission (`frame_render.gl.avg`); the remaining ~10.3 ms is Choreographer/vsync wait. Same shape for Vulkan: ~11.7 ms total vs ~1.76 ms of work. Optimizations that shave µs off GL/VK commands won't move the e2e needle until the vsync wait is what we're trying to displace (e.g. higher refresh rate, lower-latency presentation extensions).
 
 **Which metrics to gate PRs on:**
-- **Tight (±5%)**: `frame_e2e.{gl,vk}.{avg, p90, p99}`, `frame_to_screen.{gl,vk}.p90`. All sub-3% CV.
-- **Looser (±10%)**: `frame_render.{gl,vk}.avg`, `frame_native_proc.{gl,vk}.avg`. CV 2–7%.
-- **Watch only, no gate**: `frame_native_proc.{gl,vk}.{p90, p99}` and `frame_render.{gl,vk}.{p90, p99}` (5–25% CV — single-tail-sample noise).
+- **Tight (±5% AND ±1.5 ms)**: `frame_e2e.{gl,vk}.p90`, `frame_to_screen.{gl,vk}.p90`. All sub-3% CV locally; the 1.5 ms floor is calibrated for FTL Pixel 6 / Galaxy A52s, which drift ~1 ms run-to-run on identical commits (vs ~0.25 ms on local SM-F936B).
+- **Looser (±10% AND ±0.5 ms, or ±5 frames for counters)**: `frame_e2e.{gl,vk}.p99`, `dropped_frames.{gl,vk}`. CV 2–7%. p99 is the worst 1% of frames per iteration — inherently outlier-sensitive and observed to be the noisiest tight-tier metric on FTL, so it lives in loose despite frame_e2e.p90 staying tight.
+- **Watch only, no gate**: every `avg` (means are skewed by one slow frame and not representative — p90 captures steady state, p99 the tail), plus `frame_native_proc.{gl,vk}.{p90, p99}` and `frame_render.{gl,vk}.{p90, p99}` (5–25% CV — single-tail-sample noise).
 - **Skip entirely**: every `max` (single-outlier sensitive, 15–40% CV), and `p50` on screen-facing stages (bimodal — submit-to-vsync alignment).
 
+**Dual gate (relative + absolute floor).** Each gated tier has *both* a percentage tolerance and an absolute floor. A metric **passes** when *either* threshold is satisfied — `|Δ%| ≤ tolerance_pct` **OR** `|Δabs| ≤ abs_floor`. The absolute floor exists because sub-ms metrics like `frame_native_proc.avg` (~0.7 ms baseline) blow up to +14% on a 0.1 ms shift that is below any frame-budget significance. Real regressions exceed both thresholds; pure relative noise on tiny absolutes is filtered out.
+
 Slice counts are deterministic to within ±1 per 10 s window: ~298 frames per renderer (~30 fps from camera). A meaningful deviation in count is itself a regression signal.
 
-## Planned next steps (not yet implemented)
+## CI pipeline
+
+Three required GitHub Actions checks gate every PR:
+
+| Check | File | What it does |
+|---|---|---|
+| `build` | `.github/workflows/build.yml` | `assembleRelease` + `assembleReleaseAndroidTest` (arm64-v8a), uploads APK artifacts |
+| `benchmark-adreno` | `.github/workflows/benchmark.yml` | Runs `com.dz.camerafast.perf.FrameLatencyCapture` (an `:app/androidTest` instrumentation test that drives N×Ds Perfetto captures) on FTL Galaxy A52s 5G (Adreno 642L, API 34), compares against `benchmark/baselines/baseline-adreno.json` |
+| `benchmark-mali` | `.github/workflows/benchmark.yml` | Same on FTL Pixel 6 (Mali-G78, API 33), compares against `benchmark/baselines/baseline-mali.json` |
+
+The compare step uses **two-sided tolerance gates** from `benchmark/gates.yaml` (tight ±5%/±1.5 ms, loose ±10%/±0.5 ms — pass if EITHER bound holds):
+- **Exit 1 (regression)** — blocks merge; fix the performance issue.
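+- **Exit 2 (improvement)** — also blocks merge; copy the proposed JSON from the step summary into `benchmark/baselines/baseline-<gpu>.json` and commit, or trigger the manual **Baselines** workflow (`.github/workflows/baselines.yml`) on the branch, which copies the results of a Benchmark run over the baseline files and commits them.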
+- **Exit 0** — all gated metrics within tolerance; green.
+
+**Device source:** Firebase Test Lab **Spark free tier** (5 physical runs/day, $0). See `docs/ci-setup.md` for one-time GCP setup (~15 min) and the swap path to BrowserStack Open Source Program (unlimited, apply separately).
 
-- **Macrobenchmark module** wrapping the same capture flow with `TraceSectionMetric`, so the run produces the JSON straight from a Gradle task rather than a bash wrapper. The `testing/testing-setup` skill in `vendor/android-skills/` is the entry point for scaffolding.
-- **CI gate via GitHub Actions** running the macrobenchmark on either Firebase Test Lab (real hardware, paid per device-minute) or Gradle Managed Devices (emulator on the GHA runner, free but GPU≠real). Likely GMD for speed, with periodic FTL runs for trend tracking. The PR check diffs against a `baseline.json` checked into the repo and fails on regressions outside the gates listed above.
+**Per-GPU baseline files** live under `benchmark/baselines/`. They are placeholders until the first FTL CI run seeds them — see `benchmark/baselines/README.md`.
 
 ## Other tooling worth knowing about
 
diff --git a/app/build.gradle b/app/build.gradle
index 99e2a83..ea9a778 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -6,6 +6,11 @@ plugins {
 android {
     compileSdk 35
 
+    // Frame-latency capture (androidTest/.../FrameLatencyCapture.kt) must attach
+    // to a profileable, non-debug APK. The release variant declares
+    // <profileable android:shell="true"/> in AndroidManifest.xml.
+    testBuildType = "release"
+
     defaultConfig {
         applicationId "com.dz.camerafast"
         minSdk 29
@@ -84,4 +89,9 @@ dependencies {
 
     debugImplementation "androidx.compose.ui:ui-tooling:$compose_version"
     implementation "androidx.compose.ui:ui-tooling-preview:$compose_version"
+
+    // Frame-latency capture (com.dz.camerafast.perf.FrameLatencyCapture).
+    androidTestImplementation "junit:junit:4.13.2"
+    androidTestImplementation "androidx.test.ext:junit:1.3.0"
+    androidTestImplementation "androidx.test:runner:1.7.0"
 }
\ No newline at end of file
diff --git a/app/src/androidTest/java/com/dz/camerafast/ExampleInstrumentedTest.kt b/app/src/androidTest/java/com/dz/camerafast/ExampleInstrumentedTest.kt
deleted file mode 100644
index 2a2d347..0000000
--- a/app/src/androidTest/java/com/dz/camerafast/ExampleInstrumentedTest.kt
+++ /dev/null
@@ -1,24 +0,0 @@
-package com.dz.camerafast
-
-import androidx.test.platform.app.InstrumentationRegistry
-import androidx.test.ext.junit.runners.AndroidJUnit4
-
-import org.junit.Test
-import org.junit.runner.RunWith
-
-import org.junit.Assert.*
-
-/**
- * Instrumented test, which will execute on an Android device.
- *
- * See [testing documentation](http://d.android.com/tools/testing).
- */
-@RunWith(AndroidJUnit4::class)
-class ExampleInstrumentedTest {
-    @Test
-    fun useAppContext() {
-        // Context of the app under test.
-        val appContext = InstrumentationRegistry.getInstrumentation().targetContext
-        assertEquals("com.dz.camerafast", appContext.packageName)
-    }
-}
\ No newline at end of file
diff --git a/app/src/androidTest/java/com/dz/camerafast/perf/FrameLatencyCapture.kt b/app/src/androidTest/java/com/dz/camerafast/perf/FrameLatencyCapture.kt
new file mode 100644
index 0000000..faf8ecc
--- /dev/null
+++ b/app/src/androidTest/java/com/dz/camerafast/perf/FrameLatencyCapture.kt
@@ -0,0 +1,93 @@
+package com.dz.camerafast.perf
+
+import android.app.UiAutomation
+import android.content.Intent
+import android.os.ParcelFileDescriptor
+import androidx.test.ext.junit.runners.AndroidJUnit4
+import androidx.test.platform.app.InstrumentationRegistry
+import org.junit.Test
+import org.junit.runner.RunWith
+
+// Drives CameraActivity and captures N back-to-back Perfetto traces into
+// additionalTestOutputDir for FTL --directories-to-pull to export. The
+// resulting *.pftrace files are aggregated by scripts/aggregate-traces.py.
+//
+// Runner arguments (-e on the command line, or --environment-variables on FTL):
+//   dz.iterations           Number of capture iterations (default 5).
+//   dz.duration.ms          Capture window per iteration in ms (default 10000).
+//   additionalTestOutputDir Where to write the .pftrace files. Must be a path
+//                           the shell user can write and that FTL pulls via
+//                           --directories-to-pull. Locally AGP injects its own
+//                           value; on FTL we pass it via --environment-variables.
+//
+// Note: perfetto's short-form CLI (-t, -a, positional categories) requires
+// Android 12+. .github/workflows/benchmark.yml pins both FTL devices to API 31+
+// for this reason.
+@RunWith(AndroidJUnit4::class)
+class FrameLatencyCapture {
+
+    @Test
+    fun captureFrameLatencyTraces() {
+        val args = InstrumentationRegistry.getArguments()
+        val iterations = args.getString("dz.iterations", "5").toInt()
+        // perfetto is driven with `-t <N>s`, so convert ms to whole seconds.
+        // Round up and clamp to 1 s: a plain `/ 1000` truncates 1500 ms to 1 s
+        // and anything under 1000 ms to a `-t 0s` capture.
+        val durationMs = args.getString("dz.duration.ms", "10000").toLong()
+        val durationS = ((durationMs + 999L) / 1000L).coerceAtLeast(1L)
+        val outputDir = args.getString("additionalTestOutputDir")
+            ?: error("Missing instrumentation arg 'additionalTestOutputDir'")
+
+        val instrumentation = InstrumentationRegistry.getInstrumentation()
+        val targetContext = instrumentation.targetContext
+        val ui = instrumentation.uiAutomation
+
+        ui.shell("mkdir -p $outputDir")
+        ui.shell("pm grant $TARGET_PKG android.permission.CAMERA")
+
+        // The instrumentation runs in the target app's own process (default
+        // when androidTest lives in :app), so `am force-stop com.dz.camerafast`
+        // would SIGKILL the test. Launch CameraActivity once and capture N
+        // adjacent steady-state windows — dz.frame_* slices are emitted
+        // continuously by the preview pipeline.
+        targetContext.startActivity(
+            Intent().setClassName(TARGET_PKG, "$TARGET_PKG.CameraActivity")
+                .addFlags(Intent.FLAG_ACTIVITY_NEW_TASK)
+        )
+        Thread.sleep(2_000L)  // camera + GPU contexts spin up
+
+        repeat(iterations) { i ->
+            val deviceTrace = "/data/misc/perfetto-traces/dz-frame-latency-$i.pftrace"
+            val outputTrace = "$outputDir/dz-frame-latency-$i.pftrace"
+
+            // -a <pkg> is mandatory: without it, app-tag atrace sections (where
+            // dz.frame_* lands) are filtered out. perfetto blocks for -t seconds.
+            ui.shell("perfetto -o $deviceTrace -t ${durationS}s -b 32mb -a $TARGET_PKG gfx view app sched")
+
+            // /data/misc/perfetto-traces is shell:shell — copy out into the
+            // FTL-collected dir (shell can write /sdcard/Android/media/<pkg>).
+            ui.shell("cp $deviceTrace $outputTrace")
+            ui.shell("rm $deviceTrace")
+
+            // executeShellCommand does not expose the exit code, so a rejected
+            // perfetto command line (e.g. short-form CLI unavailable on this
+            // Android version) only shows up as a missing trace — fail fast.
+            val ls = ui.shell("ls -l $outputTrace")
+            check(ls.isNotBlank()) {
+                "perfetto did not produce $outputTrace on iteration $i. " +
+                    "Output dir contents: ${ui.shell("ls -la $outputDir")}"
+            }
+        }
+    }
+
+    // Runs [cmd] through executeShellCommand and returns everything read from
+    // the resulting descriptor (until EOF), decoded as UTF-8 text.
+    private fun UiAutomation.shell(cmd: String): String =
+        ParcelFileDescriptor.AutoCloseInputStream(executeShellCommand(cmd)).use { out ->
+            out.readBytes().decodeToString()
+        }
+
+    private companion object {
+        const val TARGET_PKG = "com.dz.camerafast"
+    }
+}
diff --git a/app/src/main/native/cpp/core_engine.cpp b/app/src/main/native/cpp/core_engine.cpp
index 53242a7..12dd3d3 100644
--- a/app/src/main/native/cpp/core_engine.cpp
+++ b/app/src/main/native/cpp/core_engine.cpp
@@ -90,7 +90,12 @@ void CoreEngine::nativeSendCameraFrame(JNIEnv &env, const jni::Object<HardwareBu
             .height = cameraBufferDescription.height,
             .layers = cameraBufferDescription.layers,
             .format = AHARDWAREBUFFER_FORMAT_R8G8B8A8_UNORM,
-            .usage = AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE | AHARDWAREBUFFER_USAGE_GPU_FRAMEBUFFER,
+            // CPU_WRITE_OFTEN must be declared at allocation: strict drivers
+            // (Mali on Pixel 6) return success+null from AHardwareBuffer_lock
+            // for a CPU map of a buffer that wasn't allocated CPU-writable.
+            .usage = AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE
+                   | AHARDWAREBUFFER_USAGE_GPU_FRAMEBUFFER
+                   | AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN,
     };
     int res = AHardwareBuffer_allocate(&gpuBufferDescription, &gpuBuffer);
     LOGI("HW buffer from camera does not support AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE.");
@@ -104,16 +109,26 @@ void CoreEngine::nativeSendCameraFrame(JNIEnv &env, const jni::Object<HardwareBu
   AHardwareBuffer_acquire(localGpuBuffer);
   lock.unlock();
 
+  // Belt-and-suspenders: even though we now allocate the GPU buffer with
+  // CPU_WRITE_OFTEN, lock can still fail (e.g. on a transient HW error), and
+  // a stricter driver may legitimately return success+null. Drop the frame
+  // instead of memcpy'ing through a null pointer.
   void* gpuData = nullptr;
   void* cpuData = nullptr;
-  AHardwareBuffer_lock(cameraBuffer, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN, -1, nullptr, &cpuData);
-  AHardwareBuffer_lock(localGpuBuffer, AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN, -1, nullptr, &gpuData);
-  memcpy(gpuData, cpuData, cameraBufferDescription.height * cameraBufferDescription.width * 4);
-  AHardwareBuffer_unlock(cameraBuffer, nullptr);
-  AHardwareBuffer_unlock(localGpuBuffer, nullptr);
+  int lockCam = AHardwareBuffer_lock(cameraBuffer, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN, -1, nullptr, &cpuData);
+  int lockGpu = AHardwareBuffer_lock(localGpuBuffer, AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN, -1, nullptr, &gpuData);
+  bool copied = lockCam == 0 && lockGpu == 0 && cpuData != nullptr && gpuData != nullptr;
+  if (copied) {
+    memcpy(gpuData, cpuData, cameraBufferDescription.height * cameraBufferDescription.width * 4);
+  } else {
+    LOGI("AHardwareBuffer_lock failed (cam=%d gpu=%d cpuData=%p gpuData=%p); dropping frame.",
+         lockCam, lockGpu, cpuData, gpuData);
+  }
+  if (lockCam == 0) AHardwareBuffer_unlock(cameraBuffer, nullptr);
+  if (lockGpu == 0) AHardwareBuffer_unlock(localGpuBuffer, nullptr);
 
   lock.lock();
-  if (renderer) {
+  if (renderer && copied) {
     renderer->processCameraFrame(localGpuBuffer, rotationDegrees, backCamera, frameId);
   }
   AHardwareBuffer_release(localGpuBuffer);
diff --git a/benchmark/baselines/README.md b/benchmark/baselines/README.md
new file mode 100644
index 0000000..a4e1f57
--- /dev/null
+++ b/benchmark/baselines/README.md
@@ -0,0 +1,26 @@
+# Per-GPU baselines
+
+`baseline-adreno.json` and `baseline-mali.json` are generated by running the
+macrobenchmark on the matching FTL device, then post-processed by
+`scripts/aggregate-traces.py`.
+
+## How to regenerate
+
+1. Open a throwaway PR (the initial baselines are all-zeros placeholders).
+2. CI runs the benchmark on the real FTL device and fails with exit-2 (improvement).
+3. The step summary prints the exact JSON block to paste here.
+4. Commit the updated file and re-push.
+
+After the first real run, subsequent regen follows the same flow: when CI exits 2
+(metric improved beyond tolerance), copy the proposed JSON from the step summary,
+paste it into the relevant baseline file, commit, and push.
+
+## Schema
+
+See `scripts/aggregate-traces.py` for the canonical output schema.
+Key fields:
+
+- `device_model` / `ftl_model_id` — the compare script refuses to run if the FTL
+  model used doesn't match `ftl_model_id` here (guards against silent pool swaps).
+- `stages.<name>.<stat>` — baseline value the gate compares against (`<stat>` is one of `avg`, `p50`, `p90`, `p99`, `max`).
+- `counters` — `dz.dropped_frames.{gl,vk}` total across all iterations.
diff --git a/benchmark/baselines/baseline-adreno.json b/benchmark/baselines/baseline-adreno.json
new file mode 100644
index 0000000..dadbfa7
--- /dev/null
+++ b/benchmark/baselines/baseline-adreno.json
@@ -0,0 +1,102 @@
+{
+  "device_model": "Galaxy A52s 5G",
+  "gpu": "Adreno 642L",
+  "ftl_model_id": "a52sxq",
+  "android_sdk": 34,
+  "captured_at": "2026-05-28T10:20:31.199977Z",
+  "runs": 5,
+  "duration_s": 10,
+  "stages": {
+    "dz.frame_e2e.gl": {
+      "n": 470,
+      "avg": 13.929,
+      "p50": 13.894,
+      "p90": 21.087,
+      "p99": 23.8,
+      "max": 28.995,
+      "stdev": 5.253
+    },
+    "dz.frame_e2e.vk": {
+      "n": 470,
+      "avg": 14.857,
+      "p50": 15.006,
+      "p90": 21.597,
+      "p99": 25.001,
+      "max": 28.336,
+      "stdev": 5.198
+    },
+    "dz.frame_native_proc.gl": {
+      "n": 470,
+      "avg": 0.856,
+      "p50": 0.835,
+      "p90": 1.322,
+      "p99": 2.037,
+      "max": 3.785,
+      "stdev": 0.396
+    },
+    "dz.frame_native_proc.vk": {
+      "n": 470,
+      "avg": 1.307,
+      "p50": 1.158,
+      "p90": 2.201,
+      "p99": 2.863,
+      "max": 3.919,
+      "stdev": 0.643
+    },
+    "dz.frame_render.gl": {
+      "n": 472,
+      "avg": 0.531,
+      "p50": 0.503,
+      "p90": 0.847,
+      "p99": 1.555,
+      "max": 1.919,
+      "stdev": 0.273
+    },
+    "dz.frame_render.vk": {
+      "n": 471,
+      "avg": 1.464,
+      "p50": 1.409,
+      "p90": 1.915,
+      "p99": 2.421,
+      "max": 9.044,
+      "stdev": 0.508
+    },
+    "dz.frame_to_native.gl": {
+      "n": 470,
+      "avg": 2.238,
+      "p50": 2.016,
+      "p90": 3.278,
+      "p99": 5.339,
+      "max": 12.714,
+      "stdev": 0.992
+    },
+    "dz.frame_to_native.vk": {
+      "n": 470,
+      "avg": 2.126,
+      "p50": 1.896,
+      "p90": 3.156,
+      "p99": 5.085,
+      "max": 6.117,
+      "stdev": 0.81
+    },
+    "dz.frame_to_screen.gl": {
+      "n": 470,
+      "avg": 10.791,
+      "p50": 10.839,
+      "p90": 17.132,
+      "p99": 19.397,
+      "max": 19.833,
+      "stdev": 4.877
+    },
+    "dz.frame_to_screen.vk": {
+      "n": 471,
+      "avg": 11.387,
+      "p50": 11.438,
+      "p90": 18.048,
+      "p99": 20.157,
+      "max": 21.093,
+      "stdev": 4.99
+    }
+  },
+  "counters": {}
+}
\ No newline at end of file
diff --git a/benchmark/baselines/baseline-mali.json b/benchmark/baselines/baseline-mali.json
new file mode 100644
index 0000000..33548b8
--- /dev/null
+++ b/benchmark/baselines/baseline-mali.json
@@ -0,0 +1,102 @@
+{
+  "device_model": "Pixel 6",
+  "gpu": "Mali-G78",
+  "ftl_model_id": "oriole",
+  "android_sdk": 33,
+  "captured_at": "2026-05-28T10:20:22.998637Z",
+  "runs": 5,
+  "duration_s": 10,
+  "stages": {
+    "dz.frame_e2e.gl": {
+      "n": 396,
+      "avg": 12.685,
+      "p50": 13.188,
+      "p90": 18.973,
+      "p99": 21.117,
+      "max": 22.772,
+      "stdev": 4.788
+    },
+    "dz.frame_e2e.vk": {
+      "n": 396,
+      "avg": 13.786,
+      "p50": 14.443,
+      "p90": 20.135,
+      "p99": 22.355,
+      "max": 24.183,
+      "stdev": 4.867
+    },
+    "dz.frame_native_proc.gl": {
+      "n": 396,
+      "avg": 0.772,
+      "p50": 0.636,
+      "p90": 1.095,
+      "p99": 2.844,
+      "max": 12.449,
+      "stdev": 0.77
+    },
+    "dz.frame_native_proc.vk": {
+      "n": 397,
+      "avg": 1.064,
+      "p50": 0.9,
+      "p90": 1.463,
+      "p99": 2.462,
+      "max": 11.943,
+      "stdev": 0.725
+    },
+    "dz.frame_render.gl": {
+      "n": 401,
+      "avg": 0.861,
+      "p50": 0.681,
+      "p90": 1.552,
+      "p99": 2.296,
+      "max": 3.752,
+      "stdev": 0.493
+    },
+    "dz.frame_render.vk": {
+      "n": 400,
+      "avg": 1.861,
+      "p50": 1.796,
+      "p90": 2.58,
+      "p99": 4.06,
+      "max": 5.021,
+      "stdev": 0.588
+    },
+    "dz.frame_to_native.gl": {
+      "n": 396,
+      "avg": 0.853,
+      "p50": 0.78,
+      "p90": 1.275,
+      "p99": 1.943,
+      "max": 2.425,
+      "stdev": 0.334
+    },
+    "dz.frame_to_native.vk": {
+      "n": 397,
+      "avg": 0.886,
+      "p50": 0.811,
+      "p90": 1.309,
+      "p99": 2.179,
+      "max": 4.33,
+      "stdev": 0.379
+    },
+    "dz.frame_to_screen.gl": {
+      "n": 396,
+      "avg": 11.029,
+      "p50": 11.372,
+      "p90": 17.472,
+      "p99": 19.233,
+      "max": 19.906,
+      "stdev": 4.741
+    },
+    "dz.frame_to_screen.vk": {
+      "n": 398,
+      "avg": 11.79,
+      "p50": 12.269,
+      "p90": 18.29,
+      "p99": 19.701,
+      "max": 21.536,
+      "stdev": 4.742
+    }
+  },
+  "counters": {}
+}
\ No newline at end of file
diff --git a/benchmark/gates.yaml b/benchmark/gates.yaml
new file mode 100644
index 0000000..6c5b37c
--- /dev/null
+++ b/benchmark/gates.yaml
@@ -0,0 +1,80 @@
+# Tolerance gates for scripts/compare-baseline.py.
+# Derived from CLAUDE.md baseline findings (SM-F936B, 5×10s, arm64-v8a release).
+#
+# Each gated tier has both a relative (tolerance_pct) and an absolute
+# (abs_floor_ms, or abs_floor_count for counter metrics) threshold. A metric
+# PASSES if EITHER threshold is satisfied:
+#   |Δ%|  ≤ tolerance_pct   OR   |Δabs| ≤ abs_floor
+# This protects sub-ms metrics from spurious red on small absolute jitter
+# (e.g. a 0.1ms shift on a 0.7ms baseline is +14% but below any frame-budget
+# meaningful threshold).
+#
+# tight  — fail on >5%  AND >1.5 ms deviation in either direction
+# loose  — fail on >10% AND >0.5 ms (or >5 frames for counters)
+# watch  — logged in step summary, never fails the check
+# skip   — not evaluated at all (high CV, single-outlier sensitive)
+#
+# Why 1.5 ms on tight: FTL Pixel 6 / Galaxy A52s show ~1 ms run-to-run drift
+# on frame_e2e even between identical commits (vs ~0.25 ms on local SM-F936B,
+# the source of CLAUDE.md's baseline calibration). 1.5 ms absorbs that
+# headroom while still catching meaningful >1.5 ms regressions on a 13-25 ms
+# baseline (which is what we actually want to detect).
+
+tight:
+  tolerance_pct: 5
+  abs_floor_ms: 1.5
+  metrics:
+    - dz.frame_e2e.gl.p90
+    - dz.frame_e2e.vk.p90
+    - dz.frame_to_screen.gl.p90
+    - dz.frame_to_screen.vk.p90
+
+loose:
+  tolerance_pct: 10
+  abs_floor_ms: 0.5
+  abs_floor_count: 5
+  metrics:
+    # p99 is the worst 1% of frames per iteration — inherently outlier-sensitive
+    # and observably the noisiest tight-tier metric on FTL devices. Loose
+    # tolerance (±10% / ±0.5 ms) still catches real worst-case regressions.
+    - dz.frame_e2e.gl.p99
+    - dz.frame_e2e.vk.p99
+    - dz.dropped_frames.gl
+    - dz.dropped_frames.vk
+
+watch:
+  # avg is not representative — one slow frame skews the mean across hundreds
+  # of frames per iteration. p90 captures the steady-state, p99 the tail.
+  # Keep avg visible (under the collapsible "Watch-only" section) but never
+  # gate on it.
+  metrics:
+    - dz.frame_e2e.gl.avg
+    - dz.frame_e2e.vk.avg
+    - dz.frame_render.gl.avg
+    - dz.frame_render.vk.avg
+    - dz.frame_native_proc.gl.avg
+    - dz.frame_native_proc.vk.avg
+    - dz.frame_native_proc.gl.p90
+    - dz.frame_native_proc.gl.p99
+    - dz.frame_native_proc.vk.p90
+    - dz.frame_native_proc.vk.p99
+    - dz.frame_render.gl.p90
+    - dz.frame_render.gl.p99
+    - dz.frame_render.vk.p90
+    - dz.frame_render.vk.p99
+
+skip:
+  # max values — single-outlier sensitive, 15-40% CV
+  - dz.frame_e2e.gl.max
+  - dz.frame_e2e.vk.max
+  - dz.frame_to_screen.gl.max
+  - dz.frame_to_screen.vk.max
+  - dz.frame_render.gl.max
+  - dz.frame_render.vk.max
+  - dz.frame_native_proc.gl.max
+  - dz.frame_native_proc.vk.max
+  # p50 on screen-facing stages — bimodal (submit-to-vsync alignment)
+  - dz.frame_e2e.gl.p50
+  - dz.frame_e2e.vk.p50
+  - dz.frame_to_screen.gl.p50
+  - dz.frame_to_screen.vk.p50
diff --git a/docs/ci-setup.md b/docs/ci-setup.md
new file mode 100644
index 0000000..c241959
--- /dev/null
+++ b/docs/ci-setup.md
@@ -0,0 +1,187 @@
+# CI setup
+
+## Prerequisites
+
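+This project runs two GitHub Actions workflows automatically (a third, manually-triggered **Baselines** workflow, `.github/workflows/baselines.yml`, reseeds the baseline files from a Benchmark run):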
+
+| Workflow | File | Trigger | Required |
+|---|---|---|---|
+| **Build** | `.github/workflows/build.yml` | push to `main`, all PRs | always |
+| **Benchmark** | `.github/workflows/benchmark.yml` | PRs only | always (required check) |
+
+The benchmark workflow runs on **Firebase Test Lab Spark** (free tier):
+- 5 physical device-runs per day — enough for 2 full PR validations (2 devices each)
+  before the daily cap resets. Force-pushes and re-runs consume the same quota.
+- If the cap becomes tight, apply for the
+  [BrowserStack Open Source Program](https://www.browserstack.com/open-source) —
+  see the swap instructions at the end of this doc.
+
+---
+
+## One-time GCP / FTL setup (~15 min)
+
+### 1. Create a GCP project on the Spark (free) plan
+
+1. Go to [console.cloud.google.com](https://console.cloud.google.com) and create a new project.
+   **Do not** add billing — the Spark plan is no-cost.
+2. Enable these APIs (Console → APIs & Services → Library):
+   - **Firebase Test Lab API** (`testing.googleapis.com`)
+   - **Cloud Storage API** (`storage.googleapis.com`)
+   - **Cloud Tool Results API** (`toolresults.googleapis.com`)
+
+### 2. Link a Firebase project
+
+1. Go to [console.firebase.google.com](https://console.firebase.google.com).
+2. Click **Add project** → select **"Use existing Google Cloud project"** → pick the project you created above.
+3. Accept the Spark plan.
+
+### 3. Create a service account
+
+In the GCP Console → IAM & Admin → Service Accounts:
+
+1. Create a service account (e.g. `github-ftl-runner`).
+2. Grant these roles:
+   - `roles/firebase.testLab.admin`
+   - `roles/storage.admin`
+   - `roles/cloudtoolresults.viewer`
+3. Create a JSON key and download it.
+
+### 4. Create a GCS results bucket
+
+In the GCP Console → Cloud Storage → Create bucket.
+Pick any name (`camerafast-ftl-results` works) in a single region nearest to you.
+Leave all other settings as default.
+
+If the service account already has the project-wide `roles/storage.admin` from
+step 3, no further grant is needed; otherwise grant it `roles/storage.objectAdmin` on this bucket.
+
+### 5. Add GitHub Secrets
+
+Repo → Settings → Secrets and variables → Actions → New repository secret:
+
+| Secret name | Value |
+|---|---|
+| `GCP_SA_KEY` | Full contents of the JSON key file downloaded in step 3 |
+| `GCP_RESULTS_BUCKET` | Bucket name from step 4 (no `gs://` prefix) |
+
+---
+
+## Seeding the per-GPU baselines (first run)
+
+The checked-in `benchmark/baselines/baseline-{adreno,mali}.json` are placeholders.
+On the first PR run CI will fail with exit-2 ("improvement") because the placeholders
+have no real values to compare against.
+
+Steps to seed them:
+
+1. Open the failing PR's GitHub Actions run.
+2. Open the `benchmark-adreno` (or `benchmark-mali`) job.
+3. In the **"Compare against baseline"** step summary, copy the JSON block under
+   **"Proposed updated baseline"**.
+4. Paste it into `benchmark/baselines/baseline-adreno.json` (or `-mali.json`).
+5. Commit and push — the benchmark jobs will now compare against the real FTL values.
+
+After seeding, the baselines reflect FTL device performance. The existing
+`.cache/frame-latency/baseline.json` (from the local SM-F936B) is a separate
+reference and will diverge — that's expected.
+
+---
+
+## Regenerating a baseline after a real improvement
+
+When CI exits 2 (improvement beyond tolerance):
+
+1. The step summary shows a **"Proposed updated baseline"** JSON block.
+2. Copy-paste it into the relevant `benchmark/baselines/baseline-<gpu>.json`.
+3. Commit the file and push — the check will go green.
+
+You can alternatively re-run the capture locally with a tethered device.
+The same instrumented test that CI runs on FTL also runs via Gradle:
+
+```bash
+./gradlew :app:installRelease :app:connectedReleaseAndroidTest \
+  -Pandroid.injected.build.abi=$(adb shell getprop ro.product.cpu.abi | tr -d '\r') \
+  -Pandroid.testInstrumentationRunnerArguments.additionalTestOutputDir=/sdcard/Android/media/com.dz.camerafast/additional_test_output \
+  -Pandroid.testInstrumentationRunnerArguments.dz.iterations=5 \
+  -Pandroid.testInstrumentationRunnerArguments.dz.duration.ms=10000
+
+# AGP's UTP auto-pulls traces from the device into:
+python3 scripts/aggregate-traces.py \
+  "app/build/outputs/connected_android_test_additional_output/releaseAndroidTest/connected/<device>" \
+  benchmark/baselines/baseline-<gpu>.json \
+  --device-model "My Device" --gpu "Adreno 642L" --ftl-model-id "a52sxq" --android-sdk 34
+```
+
+For ad-hoc local measurement without going through Gradle, the Bash
+equivalents `scripts/measure-frame-latency.sh` and
+`scripts/baseline-frame-latency.sh` capture the same `dz.frame_*` slices.
+
+Note: locally-captured values differ from FTL — if CI already seeded the
+baseline from FTL, prefer the FTL numbers (copy from step summary).
+
+---
+
+## Branch protection (manual, one-time)
+
+Repo → Settings → Branches → Add rule for `main`:
+
+- [x] Require a pull request before merging
+- [x] Require status checks to pass:
+  - `build`
+  - `benchmark-adreno`
+  - `benchmark-mali`
+- [x] Require branches to be up to date before merging
+- [ ] Allow force pushes (leave unchecked)
+
+---
+
+## FTL device catalogue
+
+The workflow pins specific `model,version` pairs so a Spark catalogue change
+surfaces as a CI break rather than silent baseline drift.
+
+| Job | Model | Device | GPU | API |
+|---|---|---|---|---|
+| `benchmark-adreno` | `a52sxq` | Galaxy A52s 5G | Adreno 642L (Snapdragon 778G) | 34 |
+| `benchmark-mali` | `oriole` | Pixel 6 | Mali-G78 (Google Tensor) | 33 |
+
+Adreno coverage on FTL Spark is awkward: the natural choice is `redfin` (Pixel 5 / Adreno 620), but FTL only offers it on Android 11 and perfetto's short-form CLI we depend on requires API 31+. `a52sxq` is the next-closest tier (mid-range Snapdragon) on a modern enough OS.
+
+To check current Spark availability:
+```bash
+gcloud firebase test android models list --filter=manufacturer=google --format=table
+```
+
+If a model is no longer in the Spark catalogue, update `benchmark.yml` and
+regenerate both baselines.
+
+---
+
+## Swap path: BrowserStack Open Source Program
+
+BrowserStack's OSS Program offers unlimited real-device automation for public
+open-source repos — apply at [browserstack.com/open-source](https://www.browserstack.com/open-source).
+Requirements: public repo, OSS licence, BrowserStack logo in README.
+
+If accepted:
+
+1. Add secrets: `BROWSERSTACK_USERNAME`, `BROWSERSTACK_ACCESS_KEY`.
+2. Replace the FTL steps in each benchmark job with BrowserStack App Automate
+   (`curl -u "$BS_USER:$BS_KEY" -X POST ... `) targeting equivalent Adreno and
+   Mali devices.
+3. The `aggregate-traces.py` / `compare-baseline.py` / baseline files are
+   provider-agnostic — no changes needed there.
+
+This lifts the 5-runs/day cap entirely.
+
+---
+
+## Cost estimate
+
+| Scenario | Cost |
+|---|---|
+| FTL Spark, ≤5 physical runs/day | **$0/month** |
+| FTL Spark cap exceeded (runs over 5/day) | Requires upgrading to Blaze; physical devices are billed at ~$5/device-hour (~$0.83 for a 10-min run, per device) |
+| BrowserStack OSS Program (if approved) | **$0/month** |
+| BrowserStack paid | ~$249/mo base |
+| AWS Device Farm after free trial | ~$0.17/device-min (~$1.02 for a 6-min run, per device) |
diff --git a/scripts/aggregate-traces.py b/scripts/aggregate-traces.py
new file mode 100755
index 0000000..9ee29c6
--- /dev/null
+++ b/scripts/aggregate-traces.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+"""
+Aggregate per-iteration perfetto traces from a Macrobenchmark run into a
+results.json that matches the benchmark/baselines/baseline-<gpu>.json schema.
+
+Usage:
+    scripts/aggregate-traces.py <traces-dir> <output.json> [options]
+
+    <traces-dir>  Directory containing *.perfetto-trace files (one per iteration).
+                  Macrobenchmark writes them to:
+                  app/build/outputs/connected_android_test_additional_output/
+                    releaseAndroidTest/connected/<device>/
+    <output.json> Where to write the aggregated results.
+
+Options:
+    --trace-processor PATH   Path to trace_processor binary.
+                             Defaults to .cache/frame-latency/trace_processor;
+                             auto-downloaded if missing.
+    --device-model NAME      Human-readable device name  (e.g. "Pixel 5").
+    --gpu NAME               GPU name                     (e.g. "Adreno 620").
+    --ftl-model-id ID        FTL model ID                 (e.g. "redfin").
+    --android-sdk INT        Android API level             (e.g. 33).
+    --duration-s INT         Capture window in seconds    (default 10).
+"""
+
+import argparse
+import csv
+import datetime
+import glob
+import io
+import json
+import os
+import platform
+import stat
+import subprocess
+import sys
+import urllib.request
+from collections import defaultdict
+from statistics import mean, stdev
+
+SLICE_SQL = (
+    "SELECT name, dur FROM slice WHERE name LIKE 'dz.frame_%' AND dur >= 0"
+)
+COUNTER_SQL = (
+    "SELECT c.name, SUM(cs.value) AS total "
+    "FROM counter cs "
+    "JOIN counter_track c ON cs.track_id = c.id "
+    "WHERE c.name LIKE 'dz.dropped_frames.%' "
+    "GROUP BY c.name"
+)
+
+METRICS = ["avg", "p50", "p90", "p99", "max"]
+
+
+def download_trace_processor(dest: str) -> None:
+    """Download trace_processor to `dest` and mark it executable.
+
+    Linux and macOS share one download URL; on any other OS an error is
+    printed to stderr and the process exits with status 2.
+    """
+    system = platform.system().lower()
+    if system not in ("linux", "darwin"):
+        print(f"error: unsupported OS '{system}' for trace_processor auto-download", file=sys.stderr)
+        sys.exit(2)
+    print(f"Downloading trace_processor -> {dest} ...", file=sys.stderr)
+    urllib.request.urlretrieve("https://get.perfetto.dev/trace_processor", dest)
+    os.chmod(dest, os.stat(dest).st_mode | stat.S_IEXEC)
+
+
+def run_sql(tp: str, trace: str, sql: str) -> list[dict]:
+    """Run `sql` against `trace` with the `tp` binary; return rows as dicts.
+
+    stdout is parsed as CSV with a header row, so every value is a string.
+    Raises subprocess.CalledProcessError when the binary exits non-zero.
+    """
+    cmd = [tp, "query", trace, sql]
+    proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
+    with io.StringIO(proc.stdout) as buf:
+        return list(csv.DictReader(buf))
+
+
+def percentile(values: list[float], p: float) -> float:
+    """Return the p-th percentile (0-100) by linear interpolation between ranks."""
+    ordered = sorted(values)
+    rank = (len(ordered) - 1) * p / 100
+    below, above = int(rank), min(int(rank) + 1, len(ordered) - 1)
+    return ordered[below] + (ordered[above] - ordered[below]) * (rank - below)
+
+
+def aggregate_slices(all_durations_ns: list[int]) -> dict:
+    """Summarise slice durations (nanoseconds in, milliseconds out, 3 decimals).
+
+    Returns n, avg, p50, p90, p99, max, stdev; empty input yields all zeros."""
+    if not all_durations_ns:
+        # Same keys and types as the populated case (integer n, stdev included).
+        return {"n": 0, **{m: 0.0 for m in METRICS + ["stdev"]}}
+    ms = [d / 1e6 for d in all_durations_ns]
+    stats = {"n": len(ms), "avg": round(mean(ms), 3)}
+    stats.update({f"p{q}": round(percentile(ms, q), 3) for q in (50, 90, 99)})
+    stats["max"] = round(max(ms), 3)
+    stats["stdev"] = round(stdev(ms) if len(ms) > 1 else 0.0, 3)  # stdev needs n >= 2
+    return stats
+
+
+def main() -> None:
+    """CLI entry point: aggregate all traces under traces_dir into output_json."""
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("traces_dir")
+    parser.add_argument("output_json")
+    parser.add_argument("--trace-processor", default=None)
+    parser.add_argument("--device-model", default="unknown")
+    parser.add_argument("--gpu", default="unknown")
+    parser.add_argument("--ftl-model-id", default="unknown")
+    parser.add_argument("--android-sdk", type=int, default=0)
+    parser.add_argument("--duration-s", type=int, default=10)
+    args = parser.parse_args()
+
+    root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    tp = args.trace_processor or os.path.join(root, ".cache", "frame-latency", "trace_processor")
+
+    if not os.path.isfile(tp):
+        os.makedirs(os.path.dirname(tp), exist_ok=True)
+        download_trace_processor(tp)
+
+    traces = sorted(glob.glob(os.path.join(args.traces_dir, "**", "*.perfetto-trace"), recursive=True))
+    if not traces:
+        # FTL may name them .pftrace
+        traces = sorted(glob.glob(os.path.join(args.traces_dir, "**", "*.pftrace"), recursive=True))
+    if not traces:
+        print(f"error: no .perfetto-trace / .pftrace files found under {args.traces_dir}", file=sys.stderr)
+        sys.exit(2)
+    print(f"Found {len(traces)} trace(s) under {args.traces_dir}", file=sys.stderr)
+
+    slice_buckets: dict[str, list[int]] = defaultdict(list)
+    counter_totals: dict[str, float] = defaultdict(float)
+    parsed_traces = 0  # traces whose slice query succeeded; reported as "runs"
+
+    for trace in traces:
+        try:
+            for row in run_sql(tp, trace, SLICE_SQL):
+                slice_buckets[row["name"]].append(int(row["dur"]))
+            parsed_traces += 1
+            for row in run_sql(tp, trace, COUNTER_SQL):
+                counter_totals[row["name"]] += float(row["total"])
+        except subprocess.CalledProcessError as e:
+            print(f"warning: trace_processor failed on {trace}: {e.stderr.strip()}", file=sys.stderr)
+
+    if not slice_buckets:
+        print("error: no dz.frame_* slices found in any trace", file=sys.stderr)
+        print("  - Ensure the release APK is instrumented and -a com.dz.camerafast was passed to perfetto", file=sys.stderr)
+        sys.exit(1)
+
+    stages: dict[str, dict] = {name: aggregate_slices(slice_buckets[name]) for name in sorted(slice_buckets)}
+    counters: dict[str, float] = {k: round(v, 3) for k, v in sorted(counter_totals.items())}
+
+    output = {
+        "device_model": args.device_model,
+        "gpu": args.gpu,
+        "ftl_model_id": args.ftl_model_id,
+        "android_sdk": args.android_sdk,
+        # utcnow() is deprecated since Python 3.12; dropping tzinfo keeps the "...Z" format.
+        "captured_at": datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() + "Z",
+        "runs": parsed_traces,
+        "duration_s": args.duration_s,
+        "stages": stages,
+        "counters": counters,
+    }
+
+    with open(args.output_json, "w") as f:
+        json.dump(output, f, indent=2)
+    print(f"Wrote {args.output_json}", file=sys.stderr)
+
+    # Print a summary table to stdout for humans / GHA step logs.
+    print(f"\n{'stage':<32} {'n':>5} {'avg':>7} {'p50':>7} {'p90':>7} {'p99':>7} {'max':>7}  (ms)")
+    print("-" * 80)
+    for name, s in sorted(stages.items()):
+        print(f"{name:<32} {s['n']:>5} {s['avg']:>7.2f} {s['p50']:>7.2f} {s['p90']:>7.2f} {s['p99']:>7.2f} {s['max']:>7.2f}")
+    if counters:
+        print()
+        for name, total in sorted(counters.items()):
+            print(f"{name:<32} total={total:.0f}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/compare-baseline.py b/scripts/compare-baseline.py
new file mode 100755
index 0000000..591e66e
--- /dev/null
+++ b/scripts/compare-baseline.py
@@ -0,0 +1,390 @@
+#!/usr/bin/env python3
+"""
+Compare a benchmark results.json against a per-GPU baseline, enforce the
+tolerance gates defined in benchmark/gates.yaml, and write a GitHub Actions
+step summary (when $GITHUB_STEP_SUMMARY is set).
+
+Usage:
+    scripts/compare-baseline.py BASELINE.json RESULTS.json [--gates GATES.yaml]
+
+Exit codes:
+    0  All gated metrics within tolerance.
+    1  At least one regression beyond tolerance.
+    2  At least one improvement beyond tolerance — regenerate the baseline.
+       (The proposed new baseline JSON is printed to stdout for easy copy-paste.)
+
+A metric is a (stage, stat) pair such as "dz.frame_e2e.gl.p90".
+Counter metrics are keyed as "dz.dropped_frames.gl".
+"""
+
+import argparse
+import json
+import os
+import sys
+
+try:
+    import yaml
+except ImportError:
+    yaml = None
+
+
+# ── Simple YAML loader (avoids adding PyYAML as a hard dep in CI) ─────────────
+
+def _parse_yaml(text: str) -> dict:
+    """Parse the small YAML subset used by gates.yaml.
+
+    Delegates to PyYAML when it is installed. The fallback understands scalars,
+    lists and one level of nested mappings, including lists under a nested key.
+    """
+    if yaml is not None:
+        return yaml.safe_load(text)
+
+    def scalar(token: str):
+        # Mirror PyYAML for the values we use: int, then float, else string.
+        for cast in (int, float):
+            try:
+                return cast(token)
+            except ValueError:
+                continue
+        return token
+
+    root: dict = {}
+    top: str | None = None  # current top-level key
+    sub: str | None = None  # nested key whose list is being filled, if any
+    for raw in text.splitlines():
+        line = raw.rstrip()
+        stripped = line.lstrip()
+        if not stripped or stripped.startswith("#"):
+            continue
+        if not line.startswith(" "):
+            if ":" in line:
+                key, _, value = line.partition(":")
+                top, sub = key.strip(), None
+                root[top] = scalar(value.strip()) if value.strip() else {}
+        elif top is None:
+            continue
+        elif stripped.startswith("- "):
+            if sub is None and not isinstance(root[top], list):
+                root[top] = []
+            target = root[top] if sub is None else root[top][sub]
+            target.append(stripped[2:].strip())
+        elif ":" in stripped and isinstance(root[top], dict):
+            key, _, value = stripped.partition(":")
+            key, value = key.strip(), value.strip()
+            # Empty value: open a nested list (e.g. `metrics:`) under this key.
+            sub = None if value else key
+            root[top][key] = scalar(value) if value else []
+    return root
+
+
+def load_gates(path: str) -> dict:
+    """Read gates.yaml and map each metric key to its gate tuple
+    (tier, tolerance_pct, abs_floor_ms, abs_floor_count). Watch and skip
+    entries carry no tolerance (None) and zero floors."""
+    with open(path) as handle:
+        config = _parse_yaml(handle.read())
+
+    gates: dict[str, tuple[str, float | None, float, float]] = {}
+    for tier, fallback_pct in (("tight", 5), ("loose", 10)):
+        section = config.get(tier, {})
+        limits = (
+            float(section.get("tolerance_pct", fallback_pct)),
+            float(section.get("abs_floor_ms", 0.0)),
+            float(section.get("abs_floor_count", 0.0)),
+        )
+        gates.update({metric: (tier, *limits) for metric in section.get("metrics", [])})
+
+    watch = config.get("watch")
+    watched = watch.get("metrics", []) if isinstance(watch, dict) else []
+    gates.update({metric: ("watch", None, 0.0, 0.0) for metric in watched})
+    gates.update({metric: ("skip", None, 0.0, 0.0) for metric in (config.get("skip") or [])})
+    return gates
+
+
+# ── Value extraction ──────────────────────────────────────────────────────────
+
+def extract_values(data: dict) -> dict[str, float]:
+    """Flatten stages + counters into metric_key -> float.
+
+    Values are bare numbers or dicts with a "mean"; other shapes are dropped.
+    """
+    skipped_stats = ("n", "stdev", "cv_pct", "values")
+    candidates = [
+        (f"{stage}.{stat}", v)
+        for stage, stats in data.get("stages", {}).items()
+        for stat, v in stats.items() if stat not in skipped_stats
+    ]
+    candidates += list(data.get("counters", {}).items())
+    flat: dict[str, float] = {}
+    for key, v in candidates:
+        if isinstance(v, (int, float)) or (isinstance(v, dict) and "mean" in v):
+            flat[key] = float(v["mean"] if isinstance(v, dict) else v)
+    return flat
+
+
+# ── Comparison ────────────────────────────────────────────────────────────────
+
+STATUS_PASS       = "✅ pass"
+STATUS_REGRESSION = "❌ REGRESSION"
+STATUS_IMPROVED   = "⚠️  IMPROVED — regen baseline"
+STATUS_WATCH      = "👁  watch"
+STATUS_SKIP       = "—"
+STATUS_MISSING    = "❓ missing"
+
+
+def compare(baseline: dict, results: dict, gates: dict) -> tuple[int, list[dict]]:
+    """Gate every metric found in either file and build the report rows.
+
+    Returns (exit_code, rows) where rows drive the markdown table. exit_code
+    is 1 if any gated metric regressed, else 2 if any improved beyond
+    tolerance, else 0. Metrics with no gate entry are treated as watch-only,
+    skip-tier metrics produce no row, and a metric present on only one side
+    is reported as missing without affecting the exit code.
+    """
+    base_values = extract_values(baseline)
+    seen_values = extract_values(results)
+    rows: list[dict] = []
+    regressed = improved = False
+
+    # Every key seen on either side, in stable order.
+    for key in sorted(set(base_values) | set(seen_values)):
+        # Ungated metrics default to watch-only.
+        tier, tol, floor_ms, floor_ct = gates.get(key, ("watch", None, 0.0, 0.0))
+        if tier == "skip":
+            continue
+
+        b, r = base_values.get(key), seen_values.get(key)
+        row = {
+            "key": key, "baseline": b, "observed": r,
+            "delta_abs": None, "delta_pct": None, "tier": tier,
+        }
+        rows.append(row)
+        if b is None or r is None:
+            # Present on one side only: nothing to compare.
+            row["status"] = STATUS_MISSING
+            continue
+
+        row["delta_abs"] = r - b
+        if b != 0.0:
+            row["delta_pct"] = (r - b) / b * 100.0
+        else:
+            # No percentage exists against a zero baseline: any change is +inf.
+            row["delta_pct"] = 0.0 if r == 0.0 else float("inf")
+
+        # Counters (dropped_frames) are in frame counts, everything else in ms.
+        abs_floor = floor_ct if key.startswith("dz.dropped_frames") else floor_ms
+        within_pct = tol is not None and abs(row["delta_pct"]) <= tol
+        within_abs = abs_floor > 0 and abs(row["delta_abs"]) <= abs_floor
+
+        if tier == "watch" or tol is None:
+            row["status"] = STATUS_WATCH
+        elif within_pct or within_abs:
+            # Satisfying either threshold is enough to pass.
+            row["status"] = STATUS_PASS
+        elif row["delta_pct"] > 0:
+            row["status"] = STATUS_REGRESSION
+            regressed = True
+        else:
+            row["status"] = STATUS_IMPROVED
+            improved = True
+
+    # A regression outranks an improvement.
+    if regressed:
+        return 1, rows
+    return (2 if improved else 0), rows
+
+
+# ── Output ────────────────────────────────────────────────────────────────────
+
+def fmt_ms(v: float | None) -> str:
+    return "—" if v is None else f"{v:.3f}"  # em dash stands in for a missing value
+
+
+def fmt_delta(v: float | None) -> str:
+    """Format a percentage delta with sign and one decimal; "—" when missing."""
+    if v is not None and v != float("inf"):
+        return f"{v:+.1f}%"
+    # Remaining cases: no value at all, or an infinite percentage.
+    return "—" if v is None else "+∞%"
+
+
+def _split_renderer(key: str) -> tuple[str | None, str]:
+    """Separate the first 'gl'/'vk' segment from a dotted metric key.
+
+    'dz.frame_e2e.gl.avg'  -> ('gl', 'dz.frame_e2e.avg')
+    'dz.dropped_frames.vk' -> ('vk', 'dz.dropped_frames')
+    'something.else'       -> (None, 'something.else')
+    """
+    segments = key.split(".")
+    hit = next((i for i, seg in enumerate(segments) if seg in ("gl", "vk")), None)
+    if hit is None:
+        return None, key
+    return segments[hit], ".".join(segments[:hit] + segments[hit + 1:])
+
+
+def _fmt_abs(v: float | None) -> str:
+    """Format an absolute delta, signed, three decimals; "—" when missing."""
+    missing = v is None
+    return "—" if missing else f"{v:+.3f}"
+
+
+_TABLE_HEADER = (
+    "| metric | tier | baseline | observed | Δabs | Δ% | status |\n"
+    "|--------|------|----------|----------|------|-----|--------|"
+)
+
+
+def _format_row(r: dict) -> str:
+    """Render one comparison row as a markdown table line (renderer segment hidden)."""
+    cells = [
+        f"`{_split_renderer(r['key'])[1]}`", r["tier"], fmt_ms(r["baseline"]), fmt_ms(r["observed"]),
+        _fmt_abs(r.get("delta_abs")), fmt_delta(r["delta_pct"]), r["status"],
+    ]
+    return "| " + " | ".join(str(c) for c in cells) + " |"
+
+
+def _render_subtable(title: str, rows: list[dict]) -> list[str]:
+    """Build the markdown lines for one titled section of the report.
+
+    SKIP rows are omitted. Gated rows go in a visible table; WATCH rows go in
+    a collapsed <details> table. Returns [] when neither kind is present.
+    """
+    gated = [row for row in rows if row["status"] not in (STATUS_SKIP, STATUS_WATCH)]
+    watched = [row for row in rows if row["status"] == STATUS_WATCH]
+    if not (gated or watched):
+        return []
+
+    section = [f"#### {title}", ""]
+    if gated:
+        section += [_TABLE_HEADER, *map(_format_row, gated), ""]
+    if watched:
+        section += [
+            "<details>",
+            f"<summary>Watch-only metrics ({len(watched)}) — informational, never fail the build</summary>",
+            "",
+            _TABLE_HEADER,
+            *map(_format_row, watched),
+            "",
+            "</details>",
+            "",
+        ]
+    return section
+
+
+def render_markdown(rows: list[dict], exit_code: int, baseline_path: str,
+                    results_path: str, ftl_mismatch: str | None) -> str:
+    """Assemble the complete markdown report for one comparison run.
+
+    Layout: heading, optional FTL-mismatch warning, a verdict line picked by
+    exit_code (0 = pass, 1 = regression, anything else = improvement), the two
+    file paths, then one sub-table each for 'gl', 'vk' and untagged keys.
+    """
+    if exit_code == 0:
+        verdict = "> ✅ All gated metrics within tolerance."
+    elif exit_code == 1:
+        verdict = "> ❌ **Regression detected** — fix the performance issue before merging."
+    else:
+        verdict = ("> ⚠️ **Improvement detected** — run `scripts/aggregate-traces.py` locally "
+                   "and commit the updated `baseline-<gpu>.json` before merging.")
+
+    # Bucket rows by renderer tag in one pass; None holds keys with no gl/vk segment.
+    buckets: dict[str | None, list[dict]] = {"gl": [], "vk": [], None: []}
+    for row in rows:
+        buckets[_split_renderer(row["key"])[0]].append(row)
+
+    report = ["## Frame-latency benchmark results", ""]
+    if ftl_mismatch:
+        report += [f"> ⚠️ {ftl_mismatch}", ""]
+    report += [verdict, "",
+               f"Baseline: `{baseline_path}` | Results: `{results_path}`", ""]
+
+    # An empty bucket yields no lines, so absent renderers leave no heading behind.
+    for title, tag in (("OpenGL ES", "gl"), ("Vulkan", "vk"), ("Other", None)):
+        report += _render_subtable(title, buckets[tag])
+    return "\n".join(report) + "\n"
+
+
+def proposed_baseline_json(results: dict) -> str:
+    """Pretty-print results as JSON, leaving out every key that starts with '_'."""
+    public = {name: results[name] for name in results if not name.startswith("_")}
+    return json.dumps(public, indent=2)
+
+
+# ── Main ──────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    """Compare a results JSON against a baseline JSON and report the outcome.
+
+    Prints a console table, writes the markdown report to --output-md and to
+    $GITHUB_STEP_SUMMARY (stdout when unset), then exits with compare()'s code.
+    Exits 3 early, before comparing, when the files name different FTL models.
+    """
+    repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    default_gates = os.path.join(repo_root, "benchmark", "gates.yaml")
+
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("baseline_json")
+    parser.add_argument("results_json")
+    parser.add_argument("--gates", default=default_gates)
+    parser.add_argument("--output-md", default=None,
+                        help="Also write the markdown comparison table to this file.")
+    args = parser.parse_args()
+
+    # Explicit UTF-8 everywhere: the report contains '—', 'Δ' and emoji, which a
+    # non-UTF-8 locale default would reject with UnicodeEncodeError on write.
+    with open(args.baseline_json, encoding="utf-8") as f:
+        baseline = json.load(f)
+    with open(args.results_json, encoding="utf-8") as f:
+        results = json.load(f)
+
+    # Placeholder baselines always "improve" — that's intentional on first run.
+    if baseline.get("_placeholder"):
+        print("Baseline is a placeholder — treating all metrics as improvements.", file=sys.stderr)
+        print("Copy the proposed JSON below into the baseline file and re-push.", file=sys.stderr)
+
+    # Guard against silent FTL pool swaps.
+    ftl_mismatch: str | None = None
+    b_model = baseline.get("ftl_model_id")
+    r_model = results.get("ftl_model_id")
+    if b_model and r_model and b_model != "unknown" and r_model != "unknown" and b_model != r_model:
+        ftl_mismatch = (f"FTL model mismatch: baseline captured on `{b_model}`, "
+                        f"this run used `{r_model}`. Results are not comparable.")
+        print(f"error: {ftl_mismatch}", file=sys.stderr)
+        if args.output_md:
+            with open(args.output_md, "w", encoding="utf-8") as f:
+                f.write(f"> ❌ {ftl_mismatch}\n")
+        sys.exit(3)
+
+    gates = load_gates(args.gates)
+    exit_code, rows = compare(baseline, results, gates)
+
+    # Console table.
+    print(f"\n{'metric':<40} {'tier':<6} {'baseline':>10} {'observed':>10} {'Δ%':>8}  status")
+    print("-" * 90)
+    for r in rows:
+        print(f"{r['key']:<40} {r['tier']:<6} "
+              f"{fmt_ms(r['baseline']):>10} {fmt_ms(r['observed']):>10} "
+              f"{fmt_delta(r['delta_pct']):>8}  {r['status']}")
+
+    md = render_markdown(rows, exit_code, args.baseline_json, args.results_json, ftl_mismatch)
+    if args.output_md:
+        with open(args.output_md, "w", encoding="utf-8") as f:
+            f.write(md)
+
+    step_summary = os.environ.get("GITHUB_STEP_SUMMARY")
+    if step_summary:
+        with open(step_summary, "a", encoding="utf-8") as f:
+            f.write(md)
+            if exit_code == 2:
+                f.write("\n### Proposed updated baseline\n\n```json\n"
+                        + proposed_baseline_json(results) + "\n```\n")
+    else:
+        print("\n" + md)
+        if exit_code == 2:
+            print("### Proposed updated baseline\n")
+            print(proposed_baseline_json(results))
+    sys.exit(exit_code)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script
+    main()