diff --git a/.github/workflows/baselines.yml b/.github/workflows/baselines.yml
new file mode 100644
index 0000000..78b5442
--- /dev/null
+++ b/.github/workflows/baselines.yml
@@ -0,0 +1,102 @@
+name: Baselines
+
+# Manually-triggered: reseeds benchmark/baselines/baseline-{adreno,mali}.json
+# from the latest Benchmark workflow run on this branch (or a specific run id),
+# then commits the updated files. The next Benchmark run on this branch will
+# compare against the new baselines — turning a previously-red "improvement"
+# or first-real-data PR green.
+on:
+  workflow_dispatch:
+    inputs:
+      run_id:
+        description: "Benchmark run ID to source from (blank = latest on this branch)"
+        required: false
+        type: string
+
+concurrency:
+  group: baselines-${{ github.ref }}
+  cancel-in-progress: false
+
+jobs:
+  update-baselines:
+    name: update-baselines
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      actions: read
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.ref_name }}
+          # Need write token so we can push back.
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Resolve source Benchmark run
+        id: run
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # Free-form values go through env instead of being interpolated
+          # into the script text, so they can never be parsed as shell syntax.
+          RUN_ID_INPUT: ${{ inputs.run_id }}
+          BRANCH: ${{ github.ref_name }}
+        run: |
+          if [ -n "$RUN_ID_INPUT" ]; then
+            id="$RUN_ID_INPUT"
+          else
+            id=$(gh run list \
+              --workflow=benchmark.yml \
+              --branch="$BRANCH" \
+              --limit=1 \
+              --json databaseId \
+              --jq '.[0].databaseId')
+            if [ -z "$id" ] || [ "$id" = "null" ]; then
+              echo "::error::No Benchmark run found on branch $BRANCH. Run Benchmark first."
+              exit 1
+            fi
+          fi
+          # Run IDs are numeric; reject anything else before the value is
+          # exported as a step output for the download and commit steps.
+          case "$id" in
+            ''|*[!0-9]*)
+              echo "::error::Invalid Benchmark run ID '$id' (expected digits only)."
+              exit 1
+              ;;
+          esac
+          echo "Sourcing baselines from Benchmark run $id"
+          echo "id=$id" >> "$GITHUB_OUTPUT"
+
+      - name: Download adreno results
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          RUN_ID: ${{ steps.run.outputs.id }}
+        run: gh run download "$RUN_ID" -n benchmark-results-adreno -D adreno/
+
+      - name: Download mali results
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          RUN_ID: ${{ steps.run.outputs.id }}
+        run: gh run download "$RUN_ID" -n benchmark-results-mali -D mali/
+
+      - name: Overwrite baseline files
+        run: |
+          cp adreno/results-adreno.json benchmark/baselines/baseline-adreno.json
+          cp mali/results-mali.json benchmark/baselines/baseline-mali.json
+          echo "--- adreno baseline ---"
+          head -20 benchmark/baselines/baseline-adreno.json
+          echo "--- mali baseline ---"
+          head -20 benchmark/baselines/baseline-mali.json
+
+      - name: Commit and push
+        env:
+          BRANCH: ${{ github.ref_name }}
+          RUN_ID: ${{ steps.run.outputs.id }}
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          if git diff --quiet benchmark/baselines/; then
+            echo "Baselines already match Benchmark run $RUN_ID — nothing to commit."
+            exit 0
+          fi
+          git add benchmark/baselines/baseline-adreno.json benchmark/baselines/baseline-mali.json
+          git commit -m "ci: refresh baselines from Benchmark run $RUN_ID"
+          git push origin "HEAD:$BRANCH"
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..f60572c
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,300 @@
+name: Benchmark
+
+on:
+  pull_request:
+
+# Only one benchmark run per PR branch at a time; cancel the stale one.
+concurrency:
+  group: benchmark-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # ── Job 1: build (same as build.yml but runs inside this workflow so the
+  # benchmark jobs can download the artifacts without a cross-workflow lookup) ──
+  build:
+    name: build
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: 17
+
+      - uses: gradle/actions/setup-gradle@v3
+        with:
+          cache-read-only: true
+
+      - name: Build app + androidTest APKs (arm64-v8a only)
+        run: |
+          ./gradlew \
+            :app:assembleRelease \
+            :app:assembleReleaseAndroidTest \
+            -Pandroid.injected.build.abi=arm64-v8a \
+            --stacktrace
+
+      - name: Stage APKs for upload
+        run: |
+          mkdir -p staged-apks
+          find app/build -name "app-release.apk" -exec cp {} staged-apks/app-release.apk \;
+          find app/build -name "app-release-androidTest.apk" -exec cp {} staged-apks/app-release-androidTest.apk \;
+          ls -lh staged-apks/app-release.apk staged-apks/app-release-androidTest.apk  # fail here, not downstream, if find matched nothing
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: app-release-apk
+          path: staged-apks/app-release.apk
+          retention-days: 1
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: app-release-androidTest-apk
+          path: staged-apks/app-release-androidTest.apk
+          retention-days: 1
+
+  # ── Reusable FTL runner ───────────────────────────────────────────────────────
+  # Two parallel jobs — one per GPU family. Both download the same APKs from
+  # the build job, run the benchmark on a different FTL device, then compare
+  # against the matching baseline file.
+
+  benchmark-adreno:
+    name: benchmark-adreno
+    needs: build  # consumes the APK artifacts uploaded by the build job
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/download-artifact@v4
+        with:
+          name: app-release-apk
+          path: apks/
+
+      - uses: actions/download-artifact@v4
+        with:
+          name: app-release-androidTest-apk
+          path: apks/
+
+      - uses: google-github-actions/auth@v2
+        with:
+          credentials_json: ${{ secrets.GCP_SA_KEY }}
+
+      - uses: google-github-actions/setup-gcloud@v2
+
+      - name: Run frame-latency capture on FTL (Galaxy A52s — Adreno 642L)
+        run: |
+          # Spark free tier: 5 physical device-runs/day.
+          # a52sxq = Galaxy A52s 5G, Snapdragon 778G, Adreno 642L.
+          # Picked over redfin (Pixel 5 / Adreno 620) because redfin is locked
+          # to Android 11 on FTL and perfetto's short-form CLI requires API 31+.
+          # --timeout is generous; the test itself runs 5×10 s = 50 s of actual
+          # capture, plus app warm-up and FTL setup overhead (~2 min total).
+          set -o pipefail  # keep gcloud's exit status despite the tee pipe below
+          gcloud firebase test android run \
+            --type instrumentation \
+            --app apks/app-release.apk \
+            --test apks/app-release-androidTest.apk \
+            --device model=a52sxq,version=34,locale=en,orientation=portrait \
+            --timeout 10m \
+            --directories-to-pull /sdcard/Android/media/com.dz.camerafast/additional_test_output \
+            --results-bucket ${{ secrets.GCP_RESULTS_BUCKET }} \
+            --results-dir benchmark-adreno-${{ github.run_id }}-${{ github.run_attempt }} \
+            --test-runner-class androidx.test.runner.AndroidJUnitRunner \
+            --test-targets "class com.dz.camerafast.perf.FrameLatencyCapture" \
+            --environment-variables "additionalTestOutputDir=/sdcard/Android/media/com.dz.camerafast/additional_test_output,dz.iterations=5,dz.duration.ms=10000" \
+            2>&1 | tee ftl-adreno.log
+
+      - name: Pull trace output from GCS
+        run: |
+          # FTL preserves the full on-device path under artifacts/, so our
+          # /sdcard/Android/media/com.dz.camerafast/additional_test_output/
+          # ends up at artifacts/sdcard/Android/media/com.dz.camerafast/additional_test_output/.
+          # gsutil cp -r requires the destination dir to exist when source
+          # resolves to multiple files.
+          mkdir -p trace-output-adreno
+          gsutil -m cp -r \
+            "gs://${{ secrets.GCP_RESULTS_BUCKET }}/benchmark-adreno-${{ github.run_id }}-${{ github.run_attempt }}/a52sxq-34-en-portrait/artifacts/sdcard/Android/media/com.dz.camerafast/additional_test_output" \
+            trace-output-adreno/
+
+      - name: Aggregate traces → results.json
+        run: |
+          python3 scripts/aggregate-traces.py \
+            trace-output-adreno \
+            results-adreno.json \
+            --device-model "Galaxy A52s 5G" \
+            --gpu "Adreno 642L" \
+            --ftl-model-id "a52sxq" \
+            --android-sdk 34 \
+            --duration-s 10
+
+      - name: Compare against baseline
+        run: |
+          python3 scripts/compare-baseline.py \
+            benchmark/baselines/baseline-adreno.json \
+            results-adreno.json \
+            --output-md comparison-adreno.md
+
+      - uses: actions/upload-artifact@v4
+        if: always()  # still upload logs and partial results when an earlier step failed
+        with:
+          name: benchmark-results-adreno
+          path: |
+            results-adreno.json
+            comparison-adreno.md
+            trace-output-adreno/
+            ftl-adreno.log
+          retention-days: 14
+
+  benchmark-mali:
+    name: benchmark-mali
+    needs: build  # consumes the APK artifacts uploaded by the build job
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/download-artifact@v4
+        with:
+          name: app-release-apk
+          path: apks/
+
+      - uses: actions/download-artifact@v4
+        with:
+          name: app-release-androidTest-apk
+          path: apks/
+
+      - uses: google-github-actions/auth@v2
+        with:
+          credentials_json: ${{ secrets.GCP_SA_KEY }}
+
+      - uses: google-github-actions/setup-gcloud@v2
+
+      - name: Run frame-latency capture on FTL (Pixel 6 — Mali-G78)
+        run: |
+          # oriole = Pixel 6, Google Tensor, Mali-G78. Android 13 (API 33) so
+          # perfetto's short-form CLI is available.
+          set -o pipefail  # keep gcloud's exit status despite the tee pipe below
+          gcloud firebase test android run \
+            --type instrumentation \
+            --app apks/app-release.apk \
+            --test apks/app-release-androidTest.apk \
+            --device model=oriole,version=33,locale=en,orientation=portrait \
+            --timeout 10m \
+            --directories-to-pull /sdcard/Android/media/com.dz.camerafast/additional_test_output \
+            --results-bucket ${{ secrets.GCP_RESULTS_BUCKET }} \
+            --results-dir benchmark-mali-${{ github.run_id }}-${{ github.run_attempt }} \
+            --test-runner-class androidx.test.runner.AndroidJUnitRunner \
+            --test-targets "class com.dz.camerafast.perf.FrameLatencyCapture" \
+            --environment-variables "additionalTestOutputDir=/sdcard/Android/media/com.dz.camerafast/additional_test_output,dz.iterations=5,dz.duration.ms=10000" \
+            2>&1 | tee ftl-mali.log
+
+      - name: Pull trace output from GCS
+        run: |
+          mkdir -p trace-output-mali  # gsutil cp -r needs the destination dir to exist
+          gsutil -m cp -r \
+            "gs://${{ secrets.GCP_RESULTS_BUCKET }}/benchmark-mali-${{ github.run_id }}-${{ github.run_attempt }}/oriole-33-en-portrait/artifacts/sdcard/Android/media/com.dz.camerafast/additional_test_output" \
+            trace-output-mali/
+
+      - name: Aggregate traces → results.json
+        run: |
+          python3 scripts/aggregate-traces.py \
+            trace-output-mali \
+            results-mali.json \
+            --device-model "Pixel 6" \
+            --gpu "Mali-G78" \
+            --ftl-model-id "oriole" \
+            --android-sdk 33 \
+            --duration-s 10
+
+      - name: Compare against baseline
+        run: |
+          python3 scripts/compare-baseline.py \
+            benchmark/baselines/baseline-mali.json \
+            results-mali.json \
+            --output-md comparison-mali.md
+
+      - uses: actions/upload-artifact@v4
+        if: always()  # still upload logs and partial results when an earlier step failed
+        with:
+          name: benchmark-results-mali
+          path: |
+            results-mali.json
+            comparison-mali.md
+            trace-output-mali/
+            ftl-mali.log
+          retention-days: 14
+
+  # ── PR comment with the consolidated p50/p90/p99 delta table ──────────────
+  # Runs after both benchmark jobs regardless of their pass/fail status so a
+  # regression still produces a visible comment (showing which metric tripped).
+  # PR-merge gating remains on the individual benchmark-{adreno,mali} jobs.
+  comment:
+    name: comment
+    needs: [benchmark-adreno, benchmark-mali]
+    if: always() && github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    steps:
+      - uses: actions/download-artifact@v4
+        if: always()
+        continue-on-error: true  # a failed benchmark job may not have uploaded this artifact
+        with:
+          name: benchmark-results-adreno
+          path: adreno/
+
+      - uses: actions/download-artifact@v4
+        if: always()
+        continue-on-error: true  # a failed benchmark job may not have uploaded this artifact
+        with:
+          name: benchmark-results-mali
+          path: mali/
+
+      - name: Build comment body
+        run: |
+          {
+            echo '<!-- frame-latency-benchmark -->'  # hidden marker the upsert step below searches for
+            echo '## Frame-latency benchmark'
+            echo
+            echo '### Adreno (Galaxy A52s 5G, Adreno 642L)'
+            if [ -f adreno/comparison-adreno.md ]; then
+              cat adreno/comparison-adreno.md
+            else
+              echo '> ❌ benchmark-adreno did not produce a comparison — see the workflow run for details.'
+            fi
+            echo
+            echo '### Mali (Pixel 6, Mali-G78)'
+            if [ -f mali/comparison-mali.md ]; then
+              cat mali/comparison-mali.md
+            else
+              echo '> ❌ benchmark-mali did not produce a comparison — see the workflow run for details.'
+            fi
+            echo
+            echo '---'
+            echo
+            echo 'To re-seed baselines from this run, manually trigger the **Baselines** workflow under [Actions → Baselines](../../actions/workflows/baselines.yml) and pick this branch as the ref. (Only visible after the workflow file lands on the default branch — GitHub limitation for `workflow_dispatch`.)'
+          } > comment.md
+          echo "--- preview ---"
+          cat comment.md
+
+      - uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            const body = fs.readFileSync('comment.md', 'utf8');
+            const marker = '<!-- frame-latency-benchmark -->'; // must equal the first line written to comment.md
+            const pr = context.issue.number;
+            const comments = await github.paginate(github.rest.issues.listComments, {
+              ...context.repo, issue_number: pr, per_page: 100, // read every page, not only the first one
+            });
+            const existing = comments.find(c => (c.body || '').includes(marker));
+            if (existing) {
+              await github.rest.issues.updateComment({
+                ...context.repo, comment_id: existing.id, body,
+              });
+            } else {
+              await github.rest.issues.createComment({
+                ...context.repo, issue_number: pr, body,
+              });
+            }
diff --git a/.gitignore b/.gitignore
index 214cb5d..c735c7f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,4 +13,5 @@
 .cxx
 **/build
 /.idea
-/.cache
\ No newline at end of file
+/.cache
+.java-version
\ No newline at end of file
diff --git a/CLAUDE.md b/CLAUDE.md
index 9773e77..df4f107 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -75,11 +75,14 @@ Design decisions worth remembering:
 | What you want | Skill / Script |
 |---|---|
 | One-shot frame-latency measurement (single N-second capture) | `scripts/measure-frame-latency.sh [seconds]` |
-| Establish a baseline with dispersion (5 × 10s by default, JSON output) | `scripts/baseline-frame-latency.sh` — invokable via `/frame-latency-baseline` |
+| Establish a local baseline with dispersion (5 × 10s by default, JSON output) | `scripts/baseline-frame-latency.sh` — invokable via `/frame-latency-baseline` |
+| Run the CI capture instrumentation test locally on a tethered device | `./gradlew :app:connectedReleaseAndroidTest -Pandroid.injected.build.abi=arm64-v8a -Pandroid.testInstrumentationRunnerArguments.additionalTestOutputDir=/sdcard/Android/media/com.dz.camerafast/additional_test_output` |
+| Aggregate perfetto traces (from FTL or local connected test) into 
results.json | `scripts/aggregate-traces.py ` | +| Compare results.json against a per-GPU baseline | `scripts/compare-baseline.py benchmark/baselines/baseline-.json results.json` | | Build, install, launch, screenshot for visual verification of UI changes | `/verify-on-device` | | Discover Android-platform skills (camera, performance, perfetto-sql, etc.) | `vendor/android-skills/` submodule | -All three scripts/skills assume a single ADB device. Set `ANDROID_SERIAL=` if multiple are attached. They auto-download Perfetto's `trace_processor` to `.cache/frame-latency/` (gitignored, ~25 MB) on first run. +The bash scripts and the `:app/androidTest` capture (`FrameLatencyCapture`) both emit `.pftrace` files that `scripts/aggregate-traces.py` consumes, so there is one place for stats math. All tools assume a single ADB device locally; set `ANDROID_SERIAL=` if multiple are attached. `trace_processor` is auto-downloaded to `.cache/frame-latency/` (gitignored, ~25 MB) on first use. ## Build / install gotchas @@ -118,17 +121,33 @@ OpenGL is ~1.7 ms faster end-to-end on average (`frame_e2e` avg 13.25 vs 14.91), **Most of `frame_to_screen` is vsync wait.** `frame_to_screen.gl.avg ≈ 10.9 ms` but only ~0.57 ms of that is actual GL command submission (`frame_render.gl.avg`); the remaining ~10.3 ms is Choreographer/vsync wait. Same shape for Vulkan: ~11.7 ms total vs ~1.76 ms of work. Optimizations that shave µs off GL/VK commands won't move the e2e needle until the vsync wait is what we're trying to displace (e.g. higher refresh rate, lower-latency presentation extensions). **Which metrics to gate PRs on:** -- **Tight (±5%)**: `frame_e2e.{gl,vk}.{avg, p90, p99}`, `frame_to_screen.{gl,vk}.p90`. All sub-3% CV. -- **Looser (±10%)**: `frame_render.{gl,vk}.avg`, `frame_native_proc.{gl,vk}.avg`. CV 2–7%. -- **Watch only, no gate**: `frame_native_proc.{gl,vk}.{p90, p99}` and `frame_render.{gl,vk}.{p90, p99}` (5–25% CV — single-tail-sample noise). 
+- **Tight (±5% AND ±1.5 ms)**: `frame_e2e.{gl,vk}.p90`, `frame_to_screen.{gl,vk}.p90`. All sub-3% CV locally; the 1.5 ms floor is calibrated for FTL Pixel 6 / Galaxy A52s, which drift ~1 ms run-to-run on identical commits (vs ~0.25 ms on local SM-F936B). +- **Looser (±10% AND ±0.5 ms, or ±5 frames for counters)**: `frame_e2e.{gl,vk}.p99`, `dropped_frames.{gl,vk}`. CV 2–7%. p99 is the worst 1% of frames per iteration — inherently outlier-sensitive and observed to be the noisiest tight-tier metric on FTL, so it lives in loose despite frame_e2e.p90 staying tight. +- **Watch only, no gate**: every `avg` (means are skewed by one slow frame and not representative — p90 captures steady state, p99 the tail), plus `frame_native_proc.{gl,vk}.{p90, p99}` and `frame_render.{gl,vk}.{p90, p99}` (5–25% CV — single-tail-sample noise). - **Skip entirely**: every `max` (single-outlier sensitive, 15–40% CV), and `p50` on screen-facing stages (bimodal — submit-to-vsync alignment). +**Dual gate (relative + absolute floor).** Each gated tier has *both* a percentage tolerance and an absolute floor. A metric **passes** when *either* threshold is satisfied — `|Δ%| ≤ tolerance_pct` **OR** `|Δabs| ≤ abs_floor`. The absolute floor exists because sub-ms metrics like `frame_native_proc.avg` (~0.7 ms baseline) blow up to +14% on a 0.1 ms shift that is below any frame-budget significance. Real regressions exceed both thresholds; pure relative noise on tiny absolutes is filtered out. + Slice counts are deterministic to within ±1 per 10 s window: ~298 frames per renderer (~30 fps from camera). A meaningful deviation in count is itself a regression signal. 
-## Planned next steps (not yet implemented) +## CI pipeline + +Three required GitHub Actions checks gate every PR: + +| Check | File | What it does | +|---|---|---| +| `build` | `.github/workflows/build.yml` | `assembleRelease` + `assembleReleaseAndroidTest` (arm64-v8a), uploads APK artifacts | +| `benchmark-adreno` | `.github/workflows/benchmark.yml` | Runs `com.dz.camerafast.perf.FrameLatencyCapture` (an `:app/androidTest` instrumentation test that drives N×Ds Perfetto captures) on FTL Galaxy A52s 5G (Adreno 642L, API 34), compares against `benchmark/baselines/baseline-adreno.json` | +| `benchmark-mali` | `.github/workflows/benchmark.yml` | Same on FTL Pixel 6 (Mali-G78, API 33), compares against `benchmark/baselines/baseline-mali.json` | + +The compare step uses **two-sided tolerance gates** from `benchmark/gates.yaml` (tight ±5%/±1.5 ms, loose ±10%/±0.5 ms — pass if EITHER bound holds): +- **Exit 1 (regression)** — blocks merge; fix the performance issue. +- **Exit 2 (improvement)** — also blocks merge; copy the proposed JSON from the step summary into `benchmark/baselines/baseline-.json` and commit. +- **Exit 0** — all gated metrics within tolerance; green. + +**Device source:** Firebase Test Lab **Spark free tier** (5 physical runs/day, $0). See `docs/ci-setup.md` for one-time GCP setup (~15 min) and the swap path to BrowserStack Open Source Program (unlimited, apply separately). -- **Macrobenchmark module** wrapping the same capture flow with `TraceSectionMetric`, so the run produces the JSON straight from a Gradle task rather than a bash wrapper. The `testing/testing-setup` skill in `vendor/android-skills/` is the entry point for scaffolding. -- **CI gate via GitHub Actions** running the macrobenchmark on either Firebase Test Lab (real hardware, paid per device-minute) or Gradle Managed Devices (emulator on the GHA runner, free but GPU≠real). Likely GMD for speed, with periodic FTL runs for trend tracking. 
The PR check diffs against a `baseline.json` checked into the repo and fails on regressions outside the gates listed above. +**Per-GPU baseline files** live under `benchmark/baselines/`. They are placeholders until the first FTL CI run seeds them — see `benchmark/baselines/README.md`. ## Other tooling worth knowing about diff --git a/app/build.gradle b/app/build.gradle index 99e2a83..ea9a778 100644 --- a/app/build.gradle +++ b/app/build.gradle @@ -6,6 +6,11 @@ plugins { android { compileSdk 35 + // Frame-latency capture (androidTest/.../FrameLatencyCapture.kt) must attach + // to a profileable, non-debug APK. The release variant declares + // in AndroidManifest.xml. + testBuildType = "release" + defaultConfig { applicationId "com.dz.camerafast" minSdk 29 @@ -84,4 +89,9 @@ dependencies { debugImplementation "androidx.compose.ui:ui-tooling:$compose_version" implementation "androidx.compose.ui:ui-tooling-preview:$compose_version" + + // Frame-latency capture (com.dz.camerafast.perf.FrameLatencyCapture). + androidTestImplementation "junit:junit:4.13.2" + androidTestImplementation "androidx.test.ext:junit:1.3.0" + androidTestImplementation "androidx.test:runner:1.7.0" } \ No newline at end of file diff --git a/app/src/androidTest/java/com/dz/camerafast/ExampleInstrumentedTest.kt b/app/src/androidTest/java/com/dz/camerafast/ExampleInstrumentedTest.kt deleted file mode 100644 index 2a2d347..0000000 --- a/app/src/androidTest/java/com/dz/camerafast/ExampleInstrumentedTest.kt +++ /dev/null @@ -1,24 +0,0 @@ -package com.dz.camerafast - -import androidx.test.platform.app.InstrumentationRegistry -import androidx.test.ext.junit.runners.AndroidJUnit4 - -import org.junit.Test -import org.junit.runner.RunWith - -import org.junit.Assert.* - -/** - * Instrumented test, which will execute on an Android device. - * - * See [testing documentation](http://d.android.com/tools/testing). 
- */
-@RunWith(AndroidJUnit4::class)
-class ExampleInstrumentedTest {
-    @Test
-    fun useAppContext() {
-        // Context of the app under test.
-        val appContext = InstrumentationRegistry.getInstrumentation().targetContext
-        assertEquals("com.dz.camerafast", appContext.packageName)
-    }
-}
\ No newline at end of file
diff --git a/app/src/androidTest/java/com/dz/camerafast/perf/FrameLatencyCapture.kt b/app/src/androidTest/java/com/dz/camerafast/perf/FrameLatencyCapture.kt
new file mode 100644
index 0000000..faf8ecc
--- /dev/null
+++ b/app/src/androidTest/java/com/dz/camerafast/perf/FrameLatencyCapture.kt
@@ -0,0 +1,95 @@
+package com.dz.camerafast.perf
+
+import android.app.UiAutomation
+import android.content.Intent
+import android.os.ParcelFileDescriptor
+import androidx.test.ext.junit.runners.AndroidJUnit4
+import androidx.test.platform.app.InstrumentationRegistry
+import org.junit.Test
+import org.junit.runner.RunWith
+
+// Launches CameraActivity once and records N consecutive Perfetto traces into
+// additionalTestOutputDir, which FTL exports via --directories-to-pull. The
+// *.pftrace files are then aggregated by scripts/aggregate-traces.py.
+//
+// Instrumentation arguments (-e locally, --environment-variables on FTL):
+//   dz.iterations            How many traces to record (default 5).
+//   dz.duration.ms           Length of each trace in ms (default 10000); the
+//                            value is truncated to whole seconds.
+//   additionalTestOutputDir  Destination for the .pftrace files. Must be
+//                            writable by the shell user and listed in FTL's
+//                            --directories-to-pull. AGP injects a value for
+//                            local runs; on FTL it comes from
+//                            --environment-variables.
+//
+// Note: the short-form perfetto CLI used here (-t, -a, positional categories)
+// needs Android 12+, which is why .github/workflows/benchmark.yml pins both
+// FTL devices to API 31+.
+@RunWith(AndroidJUnit4::class)
+class FrameLatencyCapture {
+
+    @Test
+    fun captureFrameLatencyTraces() {
+        val runnerArgs = InstrumentationRegistry.getArguments()
+        val iterationCount = runnerArgs.getString("dz.iterations", "5").toInt()
+        val windowSeconds = runnerArgs.getString("dz.duration.ms", "10000").toLong() / 1000L
+        val pullDir = runnerArgs.getString("additionalTestOutputDir")
+            ?: error("Missing instrumentation arg 'additionalTestOutputDir'")
+
+        val instr = InstrumentationRegistry.getInstrumentation()
+        val appContext = instr.targetContext
+        val automation = instr.uiAutomation
+
+        automation.shell("mkdir -p $pullDir")
+        automation.shell("pm grant $TARGET_PKG android.permission.CAMERA")
+
+        // androidTest lives in :app, so by default this instrumentation shares
+        // the target app's process and `am force-stop com.dz.camerafast` would
+        // SIGKILL the test itself. Instead, start CameraActivity a single time
+        // and record N back-to-back steady-state windows; the preview pipeline
+        // emits dz.frame_* slices continuously.
+        val launchCamera = Intent()
+            .setClassName(TARGET_PKG, "$TARGET_PKG.CameraActivity")
+            .addFlags(Intent.FLAG_ACTIVITY_NEW_TASK)
+        appContext.startActivity(launchCamera)
+        Thread.sleep(2_000L) // let the camera and GPU contexts come up
+
+        for (index in 0 until iterationCount) {
+            val traceName = "dz-frame-latency-$index.pftrace"
+            val scratchTrace = "/data/misc/perfetto-traces/$traceName"
+            val pulledTrace = "$pullDir/$traceName"
+
+            // perfetto blocks for the -t duration. -a is required: without it
+            // the app-tag atrace sections carrying dz.frame_* are filtered out.
+            automation.shell(
+                "perfetto -o $scratchTrace -t ${windowSeconds}s -b 32mb -a $TARGET_PKG gfx view app sched"
+            )
+
+            // /data/misc/perfetto-traces is shell:shell, so copy the trace into
+            // the FTL-collected dir (shell can write /sdcard/Android/media/).
+            automation.shell("cp $scratchTrace $pulledTrace")
+            automation.shell("rm $scratchTrace")
+
+            // executeShellCommand does not expose an exit code, so a rejected
+            // perfetto command line (e.g. short-form unsupported on this
+            // Android version) only shows up as a missing trace file. Detect
+            // that here and fail fast with the directory listing as context.
+            val listing = automation.shell("ls -l $pulledTrace")
+            check(listing.isNotBlank()) {
+                "perfetto did not produce $pulledTrace on iteration $index. " +
+                    "Output dir contents: ${automation.shell("ls -la $pullDir")}"
+            }
+        }
+    }
+
+    // Runs cmd through UiAutomation.executeShellCommand and returns everything
+    // read from the returned descriptor, decoded as a string.
+    private fun UiAutomation.shell(cmd: String): String =
+        ParcelFileDescriptor.AutoCloseInputStream(executeShellCommand(cmd)).use { output ->
+            output.readBytes().decodeToString()
+        }
+
+    private companion object {
+        const val TARGET_PKG = "com.dz.camerafast"
+    }
+}
diff --git a/app/src/main/native/cpp/core_engine.cpp b/app/src/main/native/cpp/core_engine.cpp
index 53242a7..12dd3d3 100644
--- a/app/src/main/native/cpp/core_engine.cpp
+++ b/app/src/main/native/cpp/core_engine.cpp
@@ -90,7 +90,12 @@ void CoreEngine::nativeSendCameraFrame(JNIEnv &env, const jni::ObjectprocessCameraFrame(localGpuBuffer, rotationDegrees, backCamera, frameId);
     }
     AHardwareBuffer_release(localGpuBuffer);
diff --git a/benchmark/baselines/README.md b/benchmark/baselines/README.md
new file mode 100644
index 0000000..a4e1f57
--- /dev/null
+++ b/benchmark/baselines/README.md
@@ -0,0 +1,26 @@
+# Per-GPU baselines
+
+`baseline-adreno.json` and `baseline-mali.json` are generated by running the
+macrobenchmark on the matching FTL device, then post-processed by
+`scripts/aggregate-traces.py`.
+
+## How to regenerate
+
+1. Open a throwaway PR (the initial baselines are all-zeros placeholders).
+2. CI runs the benchmark on the real FTL device and fails with exit-2 (improvement).
+3. The step summary prints the exact JSON block to paste here.
+4. Commit the updated file and re-push.
+ +After the first real run, subsequent regen follows the same flow: when CI exits 2 +(metric improved beyond tolerance), copy the proposed JSON from the step summary, +paste it into the relevant baseline file, commit, and push. + +## Schema + +See `scripts/aggregate-traces.py` for the canonical output schema. +Key fields: + +- `device_model` / `ftl_model_id` — the compare script refuses to run if the FTL + model used doesn't match `ftl_model_id` here (guards against silent pool swaps). +- `stages...mean` — baseline value the gate compares against. +- `counters` — `dz.dropped_frames.{gl,vk}` total across all iterations. diff --git a/benchmark/baselines/baseline-adreno.json b/benchmark/baselines/baseline-adreno.json new file mode 100644 index 0000000..dadbfa7 --- /dev/null +++ b/benchmark/baselines/baseline-adreno.json @@ -0,0 +1,102 @@ +{ + "device_model": "Galaxy A52s 5G", + "gpu": "Adreno 642L", + "ftl_model_id": "a52sxq", + "android_sdk": 34, + "captured_at": "2026-05-28T10:20:31.199977Z", + "runs": 5, + "duration_s": 10, + "stages": { + "dz.frame_e2e.gl": { + "n": 470, + "avg": 13.929, + "p50": 13.894, + "p90": 21.087, + "p99": 23.8, + "max": 28.995, + "stdev": 5.253 + }, + "dz.frame_e2e.vk": { + "n": 470, + "avg": 14.857, + "p50": 15.006, + "p90": 21.597, + "p99": 25.001, + "max": 28.336, + "stdev": 5.198 + }, + "dz.frame_native_proc.gl": { + "n": 470, + "avg": 0.856, + "p50": 0.835, + "p90": 1.322, + "p99": 2.037, + "max": 3.785, + "stdev": 0.396 + }, + "dz.frame_native_proc.vk": { + "n": 470, + "avg": 1.307, + "p50": 1.158, + "p90": 2.201, + "p99": 2.863, + "max": 3.919, + "stdev": 0.643 + }, + "dz.frame_render.gl": { + "n": 472, + "avg": 0.531, + "p50": 0.503, + "p90": 0.847, + "p99": 1.555, + "max": 1.919, + "stdev": 0.273 + }, + "dz.frame_render.vk": { + "n": 471, + "avg": 1.464, + "p50": 1.409, + "p90": 1.915, + "p99": 2.421, + "max": 9.044, + "stdev": 0.508 + }, + "dz.frame_to_native.gl": { + "n": 470, + "avg": 2.238, + "p50": 2.016, + "p90": 3.278, + 
"p99": 5.339, + "max": 12.714, + "stdev": 0.992 + }, + "dz.frame_to_native.vk": { + "n": 470, + "avg": 2.126, + "p50": 1.896, + "p90": 3.156, + "p99": 5.085, + "max": 6.117, + "stdev": 0.81 + }, + "dz.frame_to_screen.gl": { + "n": 470, + "avg": 10.791, + "p50": 10.839, + "p90": 17.132, + "p99": 19.397, + "max": 19.833, + "stdev": 4.877 + }, + "dz.frame_to_screen.vk": { + "n": 471, + "avg": 11.387, + "p50": 11.438, + "p90": 18.048, + "p99": 20.157, + "max": 21.093, + "stdev": 4.99 + } + }, + "counters": {} +} \ No newline at end of file diff --git a/benchmark/baselines/baseline-mali.json b/benchmark/baselines/baseline-mali.json new file mode 100644 index 0000000..33548b8 --- /dev/null +++ b/benchmark/baselines/baseline-mali.json @@ -0,0 +1,102 @@ +{ + "device_model": "Pixel 6", + "gpu": "Mali-G78", + "ftl_model_id": "oriole", + "android_sdk": 33, + "captured_at": "2026-05-28T10:20:22.998637Z", + "runs": 5, + "duration_s": 10, + "stages": { + "dz.frame_e2e.gl": { + "n": 396, + "avg": 12.685, + "p50": 13.188, + "p90": 18.973, + "p99": 21.117, + "max": 22.772, + "stdev": 4.788 + }, + "dz.frame_e2e.vk": { + "n": 396, + "avg": 13.786, + "p50": 14.443, + "p90": 20.135, + "p99": 22.355, + "max": 24.183, + "stdev": 4.867 + }, + "dz.frame_native_proc.gl": { + "n": 396, + "avg": 0.772, + "p50": 0.636, + "p90": 1.095, + "p99": 2.844, + "max": 12.449, + "stdev": 0.77 + }, + "dz.frame_native_proc.vk": { + "n": 397, + "avg": 1.064, + "p50": 0.9, + "p90": 1.463, + "p99": 2.462, + "max": 11.943, + "stdev": 0.725 + }, + "dz.frame_render.gl": { + "n": 401, + "avg": 0.861, + "p50": 0.681, + "p90": 1.552, + "p99": 2.296, + "max": 3.752, + "stdev": 0.493 + }, + "dz.frame_render.vk": { + "n": 400, + "avg": 1.861, + "p50": 1.796, + "p90": 2.58, + "p99": 4.06, + "max": 5.021, + "stdev": 0.588 + }, + "dz.frame_to_native.gl": { + "n": 396, + "avg": 0.853, + "p50": 0.78, + "p90": 1.275, + "p99": 1.943, + "max": 2.425, + "stdev": 0.334 + }, + "dz.frame_to_native.vk": { + "n": 397, + "avg": 
0.886, + "p50": 0.811, + "p90": 1.309, + "p99": 2.179, + "max": 4.33, + "stdev": 0.379 + }, + "dz.frame_to_screen.gl": { + "n": 396, + "avg": 11.029, + "p50": 11.372, + "p90": 17.472, + "p99": 19.233, + "max": 19.906, + "stdev": 4.741 + }, + "dz.frame_to_screen.vk": { + "n": 398, + "avg": 11.79, + "p50": 12.269, + "p90": 18.29, + "p99": 19.701, + "max": 21.536, + "stdev": 4.742 + } + }, + "counters": {} +} \ No newline at end of file diff --git a/benchmark/gates.yaml b/benchmark/gates.yaml new file mode 100644 index 0000000..6c5b37c --- /dev/null +++ b/benchmark/gates.yaml @@ -0,0 +1,80 @@ +# Tolerance gates for scripts/compare-baseline.py. +# Derived from CLAUDE.md baseline findings (SM-F936B, 5×10s, arm64-v8a release). +# +# Each gated tier has both a relative (tolerance_pct) and an absolute +# (abs_floor_ms, or abs_floor_count for counter metrics) threshold. A metric +# PASSES if EITHER threshold is satisfied: +# |Δ%| ≤ tolerance_pct OR |Δabs| ≤ abs_floor +# This protects sub-ms metrics from spurious red on small absolute jitter +# (e.g. a 0.1ms shift on a 0.7ms baseline is +14% but below any frame-budget +# meaningful threshold). +# +# tight — fail on >5% AND >1.5 ms deviation in either direction +# loose — fail on >10% AND >0.5 ms (or >5 frames for counters) +# watch — logged in step summary, never fails the check +# skip — not evaluated at all (high CV, single-outlier sensitive) +# +# Why 1.5 ms on tight: FTL Pixel 6 / Galaxy A52s show ~1 ms run-to-run drift +# on frame_e2e even between identical commits (vs ~0.25 ms on local SM-F936B, +# the source of CLAUDE.md's baseline calibration). 1.5 ms absorbs that +# headroom while still catching meaningful >1.5 ms regressions on a 13-25 ms +# baseline (which is what we actually want to detect). 
+ +tight: + tolerance_pct: 5 + abs_floor_ms: 1.5 + metrics: + - dz.frame_e2e.gl.p90 + - dz.frame_e2e.vk.p90 + - dz.frame_to_screen.gl.p90 + - dz.frame_to_screen.vk.p90 + +loose: + tolerance_pct: 10 + abs_floor_ms: 0.5 + abs_floor_count: 5 + metrics: + # p99 is the worst 1% of frames per iteration — inherently outlier-sensitive + # and observably the noisiest tight-tier metric on FTL devices. Loose + # tolerance (±10% / ±0.5 ms) still catches real worst-case regressions. + - dz.frame_e2e.gl.p99 + - dz.frame_e2e.vk.p99 + - dz.dropped_frames.gl + - dz.dropped_frames.vk + +watch: + # avg is not representative — one slow frame skews the mean across hundreds + # of frames per iteration. p90 captures the steady-state, p99 the tail. + # Keep avg visible (under the collapsible "Watch-only" section) but never + # gate on it. + metrics: + - dz.frame_e2e.gl.avg + - dz.frame_e2e.vk.avg + - dz.frame_render.gl.avg + - dz.frame_render.vk.avg + - dz.frame_native_proc.gl.avg + - dz.frame_native_proc.vk.avg + - dz.frame_native_proc.gl.p90 + - dz.frame_native_proc.gl.p99 + - dz.frame_native_proc.vk.p90 + - dz.frame_native_proc.vk.p99 + - dz.frame_render.gl.p90 + - dz.frame_render.gl.p99 + - dz.frame_render.vk.p90 + - dz.frame_render.vk.p99 + +skip: + # max values — single-outlier sensitive, 15-40% CV + - dz.frame_e2e.gl.max + - dz.frame_e2e.vk.max + - dz.frame_to_screen.gl.max + - dz.frame_to_screen.vk.max + - dz.frame_render.gl.max + - dz.frame_render.vk.max + - dz.frame_native_proc.gl.max + - dz.frame_native_proc.vk.max + # p50 on screen-facing stages — bimodal (submit-to-vsync alignment) + - dz.frame_e2e.gl.p50 + - dz.frame_e2e.vk.p50 + - dz.frame_to_screen.gl.p50 + - dz.frame_to_screen.vk.p50 diff --git a/docs/ci-setup.md b/docs/ci-setup.md new file mode 100644 index 0000000..c241959 --- /dev/null +++ b/docs/ci-setup.md @@ -0,0 +1,187 @@ +# CI setup + +## Prerequisites + +This project uses two GitHub Actions workflows: + +| Workflow | File | Trigger | Required | +|---|---|---|---| 
+| **Build** | `.github/workflows/build.yml` | push to `main`, all PRs | always | +| **Benchmark** | `.github/workflows/benchmark.yml` | PRs only | always (required check) | + +The benchmark workflow runs on **Firebase Test Lab Spark** (free tier): +- 5 physical device-runs per day — enough for 2 full PR validations (2 devices each) + before the daily cap resets. Force-pushes and re-runs consume the same quota. +- If the cap becomes tight, apply for the + [BrowserStack Open Source Program](https://www.browserstack.com/open-source) — + see the swap instructions at the end of this doc. + +--- + +## One-time GCP / FTL setup (~15 min) + +### 1. Create a GCP project on the Spark (free) plan + +1. Go to [console.cloud.google.com](https://console.cloud.google.com) and create a new project. + **Do not** add billing — the Spark plan is no-cost. +2. Enable these APIs (Console → APIs & Services → Library): + - **Firebase Test Lab API** (`testing.googleapis.com`) + - **Cloud Storage API** (`storage.googleapis.com`) + - **Cloud Tool Results API** (`toolresults.googleapis.com`) + +### 2. Link a Firebase project + +1. Go to [console.firebase.google.com](https://console.firebase.google.com). +2. Click **Add project** → select **"Use existing Google Cloud project"** → pick the project you created above. +3. Accept the Spark plan. + +### 3. Create a service account + +In the GCP Console → IAM & Admin → Service Accounts: + +1. Create a service account (e.g. `github-ftl-runner`). +2. Grant these roles: + - `roles/firebase.testLab.admin` + - `roles/storage.admin` + - `roles/cloudtoolresults.viewer` +3. Create a JSON key and download it. + +### 4. Create a GCS results bucket + +In the GCP Console → Cloud Storage → Create bucket. +Pick any name (`camerafast-ftl-results` works) in a single region nearest to you. +Leave all other settings as default. 
+ +Grant the service account `roles/storage.objectAdmin` on the bucket specifically +(or the `roles/storage.admin` you granted in step 3 already covers it project-wide). + +### 5. Add GitHub Secrets + +Repo → Settings → Secrets and variables → Actions → New repository secret: + +| Secret name | Value | +|---|---| +| `GCP_SA_KEY` | Full contents of the JSON key file downloaded in step 3 | +| `GCP_RESULTS_BUCKET` | Bucket name from step 4 (no `gs://` prefix) | + +--- + +## Seeding the per-GPU baselines (first run) + +The checked-in `benchmark/baselines/baseline-{adreno,mali}.json` are placeholders. +On the first PR run CI will fail with exit-2 ("improvement") because the placeholders +have no real values to compare against. + +Steps to seed them: + +1. Open the failing PR's GitHub Actions run. +2. Open the `benchmark-adreno` (or `benchmark-mali`) job. +3. In the **"Compare against baseline"** step summary, copy the JSON block under + **"Proposed updated baseline"**. +4. Paste it into `benchmark/baselines/baseline-adreno.json` (or `-mali.json`). +5. Commit and push — the benchmark jobs will now compare against the real FTL values. + +After seeding, the baselines reflect FTL device performance. The existing +`.cache/frame-latency/baseline.json` (from the local SM-F936B) is a separate +reference and will diverge — that's expected. + +--- + +## Regenerating a baseline after a real improvement + +When CI exits 2 (improvement beyond tolerance): + +1. The step summary shows a **"Proposed updated baseline"** JSON block. +2. Copy-paste it into the relevant `benchmark/baselines/baseline-{adreno,mali}.json`. +3. Commit the file and push — the check will go green. + +You can alternatively re-run the capture locally with a tethered device. 
+The same instrumented test that CI runs on FTL also runs via Gradle: + +```bash +./gradlew :app:installRelease :app:connectedReleaseAndroidTest \ + -Pandroid.injected.build.abi=$(adb shell getprop ro.product.cpu.abi | tr -d '\r') \ + -Pandroid.testInstrumentationRunnerArguments.additionalTestOutputDir=/sdcard/Android/media/com.dz.camerafast/additional_test_output \ + -Pandroid.testInstrumentationRunnerArguments.dz.iterations=5 \ + -Pandroid.testInstrumentationRunnerArguments.dz.duration.ms=10000 + +# AGP's UTP auto-pulls traces from the device into: +python3 scripts/aggregate-traces.py \ + "app/build/outputs/connected_android_test_additional_output/releaseAndroidTest/connected/" \ + benchmark/baselines/baseline-adreno.json \ + --device-model "My Device" --gpu "Adreno 642L" --ftl-model-id "a52sxq" --android-sdk 34 +``` + +For ad-hoc local measurement without going through Gradle, the Bash +equivalents `scripts/measure-frame-latency.sh` and +`scripts/baseline-frame-latency.sh` capture the same `dz.frame_*` slices. + +Note: locally-captured values differ from FTL — if CI already seeded the +baseline from FTL, prefer the FTL numbers (copy from step summary). + +--- + +## Branch protection (manual, one-time) + +Repo → Settings → Branches → Add rule for `main`: + +- [x] Require a pull request before merging +- [x] Require status checks to pass: + - `build` + - `benchmark-adreno` + - `benchmark-mali` +- [x] Require branches to be up to date before merging +- [ ] Allow force pushes (leave unchecked) + +--- + +## FTL device catalogue + +The workflow pins specific `model,version` pairs so a Spark catalogue change +surfaces as a CI break rather than silent baseline drift. 
+ +| Job | Model | Device | GPU | API | +|---|---|---|---|---| +| `benchmark-adreno` | `a52sxq` | Galaxy A52s 5G | Adreno 642L (Snapdragon 778G) | 34 | +| `benchmark-mali` | `oriole` | Pixel 6 | Mali-G78 (Google Tensor) | 33 | + +Adreno coverage on FTL Spark is awkward: the natural choice is `redfin` (Pixel 5 / Adreno 620), but FTL only offers it on Android 11 and perfetto's short-form CLI we depend on requires API 31+. `a52sxq` is the next-closest tier (mid-range Snapdragon) on a modern enough OS. + +To check current Spark availability: +```bash +gcloud firebase test android models list --filter=manufacturer=google --format=table +``` + +If a model is no longer in the Spark catalogue, update `benchmark.yml` and +regenerate both baselines. + +--- + +## Swap path: BrowserStack Open Source Program + +BrowserStack's OSS Program offers unlimited real-device automation for public +open-source repos — apply at [browserstack.com/open-source](https://www.browserstack.com/open-source). +Requirements: public repo, OSS licence, BrowserStack logo in README. + +If accepted: + +1. Add secrets: `BROWSERSTACK_USERNAME`, `BROWSERSTACK_ACCESS_KEY`. +2. Replace the FTL steps in each benchmark job with BrowserStack App Automate + (`curl -u "$BS_USER:$BS_KEY" -X POST ... `) targeting equivalent Adreno and + Mali devices. +3. The `aggregate-traces.py` / `compare-baseline.py` / baseline files are + provider-agnostic — no changes needed there. + +This lifts the 5-runs/day cap entirely. 
+ +--- + +## Cost estimate + +| Scenario | Cost | +|---|---| +| FTL Spark, ≤5 physical runs/day | **$0/month** | +| FTL Spark cap exceeded (runs over 5/day) | Requires upgrading to Blaze; ~$1/device-min (~$10 for a 10-min run) | +| BrowserStack OSS Program (if approved) | **$0/month** | +| BrowserStack paid | ~$249/mo base | +| AWS Device Farm after free trial | ~$0.17/device-min (~$1.02 for a 6-min run, per device) | diff --git a/scripts/aggregate-traces.py b/scripts/aggregate-traces.py new file mode 100755 index 0000000..9ee29c6 --- /dev/null +++ b/scripts/aggregate-traces.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python3 +""" +Aggregate per-iteration perfetto traces from a Macrobenchmark run into a +results.json that matches the benchmark/baselines/baseline-{adreno,mali}.json schema. + +Usage: + scripts/aggregate-traces.py TRACES_DIR OUTPUT_JSON [options] + + TRACES_DIR Directory containing *.perfetto-trace files (one per iteration). + Macrobenchmark writes them to: + app/build/outputs/connected_android_test_additional_output/ + releaseAndroidTest/connected/DEVICE/ + OUTPUT_JSON Where to write the aggregated results. + +Options: + --trace-processor PATH Path to trace_processor binary. + Defaults to .cache/frame-latency/trace_processor; + auto-downloaded if missing. + --device-model NAME Human-readable device name (e.g. "Pixel 5"). + --gpu NAME GPU name (e.g. "Adreno 620"). + --ftl-model-id ID FTL model ID (e.g. "redfin"). + --android-sdk INT Android API level (e.g. 33). + --duration-s INT Capture window in seconds (default 10). 
+""" + +import argparse +import csv +import datetime +import glob +import io +import json +import os +import platform +import stat +import subprocess +import sys +import urllib.request +from collections import defaultdict +from statistics import mean, stdev + +SLICE_SQL = ( + "SELECT name, dur FROM slice WHERE name LIKE 'dz.frame_%' AND dur >= 0" +) +COUNTER_SQL = ( + "SELECT c.name, SUM(cs.value) AS total " + "FROM counter cs " + "JOIN counter_track c ON cs.track_id = c.id " + "WHERE c.name LIKE 'dz.dropped_frames.%' " + "GROUP BY c.name" +) + +METRICS = ["avg", "p50", "p90", "p99", "max"] + + +def download_trace_processor(dest: str) -> None: + system = platform.system().lower() + machine = platform.machine().lower() + if system == "linux": + url = "https://get.perfetto.dev/trace_processor" + elif system == "darwin": + url = "https://get.perfetto.dev/trace_processor" + else: + print(f"error: unsupported OS '{system}' for trace_processor auto-download", file=sys.stderr) + sys.exit(2) + print(f"Downloading trace_processor -> {dest} ...", file=sys.stderr) + urllib.request.urlretrieve(url, dest) + os.chmod(dest, os.stat(dest).st_mode | stat.S_IEXEC) + + +def run_sql(tp: str, trace: str, sql: str) -> list[dict]: + result = subprocess.run( + [tp, "query", trace, sql], + capture_output=True, text=True, check=True + ) + rows = [] + reader = csv.DictReader(io.StringIO(result.stdout)) + for row in reader: + rows.append(row) + return rows + + +def percentile(values: list[float], p: float) -> float: + s = sorted(values) + k = (len(s) - 1) * p / 100 + lo = int(k) + hi = min(lo + 1, len(s) - 1) + return s[lo] + (s[hi] - s[lo]) * (k - lo) + + +def aggregate_slices(all_durations_ns: list[int]) -> dict: + if not all_durations_ns: + return {m: 0.0 for m in ["n"] + METRICS} + ms = [d / 1e6 for d in all_durations_ns] + return { + "n": len(ms), + "avg": round(mean(ms), 3), + "p50": round(percentile(ms, 50), 3), + "p90": round(percentile(ms, 90), 3), + "p99": round(percentile(ms, 99), 
3), + "max": round(max(ms), 3), + "stdev": round(stdev(ms) if len(ms) > 1 else 0.0, 3), + } + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("traces_dir") + parser.add_argument("output_json") + parser.add_argument("--trace-processor", default=None) + parser.add_argument("--device-model", default="unknown") + parser.add_argument("--gpu", default="unknown") + parser.add_argument("--ftl-model-id", default="unknown") + parser.add_argument("--android-sdk", type=int, default=0) + parser.add_argument("--duration-s", type=int, default=10) + args = parser.parse_args() + + root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + default_tp = os.path.join(root, ".cache", "frame-latency", "trace_processor") + tp = args.trace_processor or default_tp + + if not os.path.isfile(tp): + os.makedirs(os.path.dirname(tp), exist_ok=True) + download_trace_processor(tp) + + traces = sorted(glob.glob(os.path.join(args.traces_dir, "**", "*.perfetto-trace"), recursive=True)) + if not traces: + # FTL may name them .pftrace + traces = sorted(glob.glob(os.path.join(args.traces_dir, "**", "*.pftrace"), recursive=True)) + if not traces: + print(f"error: no .perfetto-trace / .pftrace files found under {args.traces_dir}", file=sys.stderr) + sys.exit(2) + print(f"Found {len(traces)} trace(s) under {args.traces_dir}", file=sys.stderr) + + slice_buckets: dict[str, list[int]] = defaultdict(list) + counter_totals: dict[str, float] = defaultdict(float) + + for trace in traces: + try: + for row in run_sql(tp, trace, SLICE_SQL): + slice_buckets[row["name"]].append(int(row["dur"])) + for row in run_sql(tp, trace, COUNTER_SQL): + counter_totals[row["name"]] += float(row["total"]) + except subprocess.CalledProcessError as e: + print(f"warning: trace_processor failed on {trace}: {e.stderr.strip()}", file=sys.stderr) + + if not slice_buckets: + print("error: no dz.frame_* slices found in any 
trace", file=sys.stderr) + print(" - Ensure the release APK is instrumented and -a com.dz.camerafast was passed to perfetto", file=sys.stderr) + sys.exit(1) + + stages: dict[str, dict] = {} + for name in sorted(slice_buckets): + stages[name] = aggregate_slices(slice_buckets[name]) + + counters: dict[str, float] = {k: round(v, 3) for k, v in sorted(counter_totals.items())} + + output = { + "device_model": args.device_model, + "gpu": args.gpu, + "ftl_model_id": args.ftl_model_id, + "android_sdk": args.android_sdk, + "captured_at": datetime.datetime.utcnow().isoformat() + "Z", + "runs": len(traces), + "duration_s": args.duration_s, + "stages": stages, + "counters": counters, + } + + with open(args.output_json, "w") as f: + json.dump(output, f, indent=2) + print(f"Wrote {args.output_json}", file=sys.stderr) + + # Print a summary table to stdout for humans / GHA step logs. + print(f"\n{'stage':<32} {'n':>5} {'avg':>7} {'p50':>7} {'p90':>7} {'p99':>7} {'max':>7} (ms)") + print("-" * 80) + for name, s in sorted(stages.items()): + print(f"{name:<32} {s['n']:>5} {s['avg']:>7.2f} {s['p50']:>7.2f} {s['p90']:>7.2f} {s['p99']:>7.2f} {s['max']:>7.2f}") + if counters: + print() + for name, total in sorted(counters.items()): + print(f"{name:<32} total={total:.0f}") + + +if __name__ == "__main__": + main() diff --git a/scripts/compare-baseline.py b/scripts/compare-baseline.py new file mode 100755 index 0000000..591e66e --- /dev/null +++ b/scripts/compare-baseline.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python3 +""" +Compare a benchmark results.json against a per-GPU baseline, enforce the +tolerance gates defined in benchmark/gates.yaml, and write a GitHub Actions +step summary (when $GITHUB_STEP_SUMMARY is set). + +Usage: + scripts/compare-baseline.py BASELINE.json RESULTS.json [--gates GATES.yaml] + +Exit codes: + 0 All gated metrics within tolerance. + 1 At least one regression beyond tolerance. + 2 At least one improvement beyond tolerance — regenerate the baseline. 
+ (The proposed new baseline JSON is printed to stdout for easy copy-paste.) + +A metric is a (stage, stat) pair such as "dz.frame_e2e.gl.p90". +Counter metrics are keyed as "dz.dropped_frames.gl". +""" + +import argparse +import json +import os +import sys + +try: + import yaml +except ImportError: + yaml = None + + +# ── Simple YAML loader (avoids adding PyYAML as a hard dep in CI) ───────────── + +def _parse_yaml(text: str) -> dict: + """Minimal YAML parser — handles only the scalar/list/dict subset used in + gates.yaml. Falls back to PyYAML when available.""" + if yaml is not None: + return yaml.safe_load(text) + # Hand-rolled: good enough for our controlled file. + root: dict = {} + current_key: str | None = None + current_list: list | None = None + for raw in text.splitlines(): + line = raw.rstrip() + if not line or line.lstrip().startswith("#"): + continue + if not line.startswith(" "): + if ":" in line: + k, _, v = line.partition(":") + v = v.strip() + if v: + try: + root[k.strip()] = int(v) + except ValueError: + root[k.strip()] = v + else: + root[k.strip()] = {} + current_key = k.strip() + current_list = None + else: + stripped = line.lstrip() + indent = len(line) - len(stripped) + if current_key is None: + continue + if stripped.startswith("- "): + val = stripped[2:].strip() + if not isinstance(root.get(current_key), list): + root[current_key] = [] + root[current_key].append(val) + elif ":" in stripped: + k2, _, v2 = stripped.partition(":") + v2 = v2.strip() + if isinstance(root.get(current_key), dict): + try: + root[current_key][k2.strip()] = int(v2) + except ValueError: + root[current_key][k2.strip()] = v2 + return root + + +def load_gates(path: str) -> dict: + with open(path) as f: + raw = _parse_yaml(f.read()) + + # metric_key -> (tier, tolerance_pct, abs_floor_ms, abs_floor_count) + gates: dict[str, tuple[str, float | None, float, float]] = {} + + for tier in ("tight", "loose"): + block = raw.get(tier, {}) + tol = float(block.get("tolerance_pct", 5 
if tier == "tight" else 10)) + floor_ms = float(block.get("abs_floor_ms", 0.0)) + floor_ct = float(block.get("abs_floor_count", 0.0)) + for m in block.get("metrics", []): + gates[m] = (tier, tol, floor_ms, floor_ct) + + for m in raw.get("watch", {}).get("metrics", []) if isinstance(raw.get("watch"), dict) else []: + gates[m] = ("watch", None, 0.0, 0.0) + + for m in (raw.get("skip") or []): + gates[m] = ("skip", None, 0.0, 0.0) + + return gates + + +# ── Value extraction ────────────────────────────────────────────────────────── + +def extract_values(data: dict) -> dict[str, float]: + """Flatten stages + counters into metric_key -> mean.""" + out: dict[str, float] = {} + for stage, stats in data.get("stages", {}).items(): + for stat, v in stats.items(): + if stat in ("n", "stdev", "cv_pct", "values"): + continue + if isinstance(v, (int, float)): + out[f"{stage}.{stat}"] = float(v) + elif isinstance(v, dict) and "mean" in v: + out[f"{stage}.{stat}"] = float(v["mean"]) + for name, v in data.get("counters", {}).items(): + if isinstance(v, (int, float)): + out[name] = float(v) + elif isinstance(v, dict) and "mean" in v: + out[name] = float(v["mean"]) + return out + + +# ── Comparison ──────────────────────────────────────────────────────────────── + +STATUS_PASS = "✅ pass" +STATUS_REGRESSION = "❌ REGRESSION" +STATUS_IMPROVED = "⚠️ IMPROVED — regen baseline" +STATUS_WATCH = "👁 watch" +STATUS_SKIP = "—" +STATUS_MISSING = "❓ missing" + + +def compare(baseline: dict, results: dict, gates: dict) -> tuple[int, list[dict]]: + """Returns (exit_code, rows) where rows drive the markdown table.""" + b_vals = extract_values(baseline) + r_vals = extract_values(results) + + all_keys = sorted(set(b_vals) | set(r_vals)) + rows = [] + has_regression = False + has_improvement = False + + for key in all_keys: + gate = gates.get(key, ("watch", None, 0.0, 0.0)) + tier, tol, floor_ms, floor_ct = gate + if tier == "skip": + continue + + b = b_vals.get(key) + r = r_vals.get(key) + + if b is 
None or r is None: + rows.append({"key": key, "baseline": b, "observed": r, + "delta_abs": None, "delta_pct": None, + "tier": tier, "status": STATUS_MISSING}) + continue + + delta_abs = r - b + if b == 0.0: + delta_pct = 0.0 if r == 0.0 else float("inf") + else: + delta_pct = (r - b) / b * 100.0 + + # Counters (dropped_frames) are in frame counts, everything else in ms. + is_counter = key.startswith("dz.dropped_frames") + abs_floor = floor_ct if is_counter else floor_ms + within_pct = tol is not None and abs(delta_pct) <= tol + within_abs = abs_floor > 0 and abs(delta_abs) <= abs_floor + + if tier == "watch" or tol is None: + status = STATUS_WATCH + elif within_pct or within_abs: + status = STATUS_PASS + elif delta_pct > 0: + status = STATUS_REGRESSION + has_regression = True + else: + status = STATUS_IMPROVED + has_improvement = True + + rows.append({ + "key": key, "baseline": b, "observed": r, + "delta_abs": delta_abs, "delta_pct": delta_pct, + "tier": tier, "status": status, + }) + + exit_code = 0 + if has_regression: + exit_code = 1 + elif has_improvement: + exit_code = 2 + return exit_code, rows + + +# ── Output ──────────────────────────────────────────────────────────────────── + +def fmt_ms(v: float | None) -> str: + return f"{v:.3f}" if v is not None else "—" + + +def fmt_delta(v: float | None) -> str: + if v is None: + return "—" + if v == float("inf"): + return "+∞%" + return f"{v:+.1f}%" + + +def _split_renderer(key: str) -> tuple[str | None, str]: + """Pull the 'gl'/'vk' segment out of a metric key. 
+ + 'dz.frame_e2e.gl.avg' -> ('gl', 'dz.frame_e2e.avg') + 'dz.dropped_frames.vk' -> ('vk', 'dz.dropped_frames') + 'something.else' -> (None, 'something.else') + """ + parts = key.split(".") + for i, p in enumerate(parts): + if p in ("gl", "vk"): + return p, ".".join(parts[:i] + parts[i + 1:]) + return None, key + + +def _fmt_abs(v: float | None) -> str: + if v is None: + return "—" + return f"{v:+.3f}" + + +_TABLE_HEADER = ( + "| metric | tier | baseline | observed | Δabs | Δ% | status |\n" + "|--------|------|----------|----------|------|-----|--------|" +) + + +def _format_row(r: dict) -> str: + _, display = _split_renderer(r["key"]) + return ( + f"| `{display}` | {r['tier']} " + f"| {fmt_ms(r['baseline'])} | {fmt_ms(r['observed'])} " + f"| {_fmt_abs(r.get('delta_abs'))} | {fmt_delta(r['delta_pct'])} | {r['status']} |" + ) + + +def _render_subtable(title: str, rows: list[dict]) -> list[str]: + # Drop SKIP rows entirely; collapse WATCH rows under a
block + # so the headline gated metrics stay visible by default. + gated = [r for r in rows if r["status"] not in (STATUS_SKIP, STATUS_WATCH)] + watched = [r for r in rows if r["status"] == STATUS_WATCH] + if not gated and not watched: + return [] + + out = [f"#### {title}", ""] + + if gated: + out.append(_TABLE_HEADER) + out.extend(_format_row(r) for r in gated) + out.append("") + + if watched: + out += [ + f"
", + f"Watch-only metrics ({len(watched)}) — informational, never fail the build", + "", + _TABLE_HEADER, + ] + out.extend(_format_row(r) for r in watched) + out += ["", "
", ""] + + return out + + +def render_markdown(rows: list[dict], exit_code: int, baseline_path: str, + results_path: str, ftl_mismatch: str | None) -> str: + lines = ["## Frame-latency benchmark results", ""] + + if ftl_mismatch: + lines += [f"> ⚠️ {ftl_mismatch}", ""] + + if exit_code == 0: + lines.append("> ✅ All gated metrics within tolerance.") + elif exit_code == 1: + lines.append("> ❌ **Regression detected** — fix the performance issue before merging.") + else: + lines.append("> ⚠️ **Improvement detected** — run `scripts/aggregate-traces.py` locally " + "and commit the updated `baseline-.json` before merging.") + + lines += [ + "", + f"Baseline: `{baseline_path}` | Results: `{results_path}`", + "", + ] + + gl_rows = [r for r in rows if _split_renderer(r["key"])[0] == "gl"] + vk_rows = [r for r in rows if _split_renderer(r["key"])[0] == "vk"] + other_rows = [r for r in rows if _split_renderer(r["key"])[0] is None] + + lines += _render_subtable("OpenGL ES", gl_rows) + lines += _render_subtable("Vulkan", vk_rows) + if other_rows: + lines += _render_subtable("Other", other_rows) + + return "\n".join(lines) + "\n" + + +def proposed_baseline_json(results: dict) -> str: + """Strip _placeholder fields and pretty-print for the step summary.""" + clean = {k: v for k, v in results.items() if not k.startswith("_")} + return json.dumps(clean, indent=2) + + +# ── Main ────────────────────────────────────────────────────────────────────── + +def main() -> None: + repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + default_gates = os.path.join(repo_root, "benchmark", "gates.yaml") + + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("baseline_json") + parser.add_argument("results_json") + parser.add_argument("--gates", default=default_gates) + parser.add_argument("--output-md", default=None, + help="Also write the markdown comparison table to this file.") + args = 
parser.parse_args() + + with open(args.baseline_json) as f: + baseline = json.load(f) + with open(args.results_json) as f: + results = json.load(f) + + # Placeholder baselines always "improve" — that's intentional on first run. + if baseline.get("_placeholder"): + print("Baseline is a placeholder — treating all metrics as improvements.", file=sys.stderr) + print("Copy the proposed JSON below into the baseline file and re-push.", file=sys.stderr) + + # Guard against silent FTL pool swaps. + ftl_mismatch: str | None = None + b_model = baseline.get("ftl_model_id") + r_model = results.get("ftl_model_id") + if b_model and r_model and b_model != "unknown" and r_model != "unknown" and b_model != r_model: + ftl_mismatch = ( + f"FTL model mismatch: baseline captured on `{b_model}`, " + f"this run used `{r_model}`. Results are not comparable." + ) + print(f"error: {ftl_mismatch}", file=sys.stderr) + if args.output_md: + with open(args.output_md, "w") as f: + f.write(f"> ❌ {ftl_mismatch}\n") + sys.exit(3) + + gates = load_gates(args.gates) + exit_code, rows = compare(baseline, results, gates) + + # Console table. 
+ print(f"\n{'metric':<40} {'tier':<6} {'baseline':>10} {'observed':>10} {'Δ%':>8} status") + print("-" * 90) + for r in rows: + print( + f"{r['key']:<40} {r['tier']:<6} " + f"{fmt_ms(r['baseline']):>10} {fmt_ms(r['observed']):>10} " + f"{fmt_delta(r['delta_pct']):>8} {r['status']}" + ) + + md = render_markdown(rows, exit_code, args.baseline_json, args.results_json, ftl_mismatch) + + if args.output_md: + with open(args.output_md, "w") as f: + f.write(md) + + step_summary = os.environ.get("GITHUB_STEP_SUMMARY") + if step_summary: + with open(step_summary, "a") as f: + f.write(md) + if exit_code == 2: + with open(step_summary, "a") as f: + f.write("\n### Proposed updated baseline\n\n```json\n") + f.write(proposed_baseline_json(results)) + f.write("\n```\n") + else: + print("\n" + md) + if exit_code == 2: + print("### Proposed updated baseline\n") + print(proposed_baseline_json(results)) + + sys.exit(exit_code) + + +if __name__ == "__main__": + main()