diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 059f88ce0..58a0aee1f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,10 +1,6 @@ # Generated file - DO NOT EDIT # Source: ci.yml.genie.ts -concurrency: - group: '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}' - cancel-in-progress: true - name: CI on: @@ -14,6 +10,16 @@ on: branches: [main] workflow_dispatch: inputs: + measurement_baseline_ref: + description: Optional ref/SHA to checkout before running CI measurement jobs. Used to backfill comparable baseline artifacts. + required: false + default: '' + type: string + measurement_baseline_label: + description: Optional human label for a measurement baseline backfill run, for example PR number. + required: false + default: '' + type: string debug_force_nix_diagnostics_failure: description: 'Temporary debug switch (#272): force post-validation failure to verify diagnostics artifact + summary' required: false @@ -22,6 +28,7 @@ on: jobs: typecheck: + if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }} runs-on: [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}'] timeout-minutes: 30 @@ -34,6 +41,11 @@ jobs: GITHUB_TOKEN: ${{ github.token }} steps: - uses: actions/checkout@v6 + - name: Checkout CI measurement baseline ref + if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }} + uses: actions/checkout@v6 + with: + ref: ${{ inputs.measurement_baseline_ref }} - name: Install Nix uses: DeterminateSystems/determinate-nix-action@v3 with: @@ -208,7 +220,7 @@ jobs: local max="${NIX_GC_RACE_MAX_RETRIES:-10}" local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}" local attempt=1 - local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit + local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature 
saw_fetch_signature had_errexit start="$(date +%s)" @@ -256,7 +268,7 @@ jobs: if [ "$rc" -eq 0 ]; then echo "::notice::[ci] completed $task in $elapsed s" if [ "$attempt" -gt 1 ]; then - write_summary success "Recovered from Nix GC race after retry" + write_summary success "Recovered from transient Nix failure after retry" else write_summary success fi @@ -272,18 +284,22 @@ jobs: tr -d '[:space:]' || true) saw_invalid_path=false saw_cachix_signature=false + saw_fetch_signature=false [ -n "$path" ] && saw_invalid_path=true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true + printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true rm -f "$log" - if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then - echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race" - write_summary failure "No Nix GC race signature detected" + if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then + echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure" + write_summary failure "No transient Nix failure signature detected" return "$rc" fi - if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then + if [ "$saw_fetch_signature" = true ]; then + echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache" + elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path" elif [ "$saw_cachix_signature" = 
true ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)" @@ -298,8 +314,8 @@ jobs: now=$(date +%s) elapsed=$((now - start)) - echo "::error::Nix GC race retry exhausted for $task ($max attempts)" - write_summary failure "Nix GC race retry exhausted" + echo "::error::Transient Nix retry exhausted for $task ($max attempts)" + write_summary failure "Transient Nix retry exhausted" return 1 } EOF @@ -329,7 +345,7 @@ jobs: local max="${NIX_GC_RACE_MAX_RETRIES:-10}" local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}" local attempt=1 - local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit + local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit start="$(date +%s)" @@ -377,7 +393,7 @@ jobs: if [ "$rc" -eq 0 ]; then echo "::notice::[ci] completed $task in $elapsed s" if [ "$attempt" -gt 1 ]; then - write_summary success "Recovered from Nix GC race after retry" + write_summary success "Recovered from transient Nix failure after retry" else write_summary success fi @@ -393,18 +409,22 @@ jobs: tr -d '[:space:]' || true) saw_invalid_path=false saw_cachix_signature=false + saw_fetch_signature=false [ -n "$path" ] && saw_invalid_path=true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true + printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true rm -f "$log" - if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then - echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store 
validity race" - write_summary failure "No Nix GC race signature detected" + if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then + echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure" + write_summary failure "No transient Nix failure signature detected" return "$rc" fi - if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then + if [ "$saw_fetch_signature" = true ]; then + echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache" + elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path" elif [ "$saw_cachix_signature" = true ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)" @@ -419,8 +439,8 @@ jobs: now=$(date +%s) elapsed=$((now - start)) - echo "::error::Nix GC race retry exhausted for $task ($max attempts)" - write_summary failure "Nix GC race retry exhausted" + echo "::error::Transient Nix retry exhausted for $task ($max attempts)" + write_summary failure "Transient Nix retry exhausted" return 1 } EOF @@ -485,7 +505,11 @@ jobs: run: | echo "If this looks like Namespace runner Nix store corruption (e.g. \"... 
is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:" echo " https://github.com/overengineeringstudio/effect-utils/issues/201" + concurrency: + group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-typecheck" + cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }} lint: + if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }} runs-on: [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}'] timeout-minutes: 30 @@ -498,6 +522,11 @@ jobs: GITHUB_TOKEN: ${{ github.token }} steps: - uses: actions/checkout@v6 + - name: Checkout CI measurement baseline ref + if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }} + uses: actions/checkout@v6 + with: + ref: ${{ inputs.measurement_baseline_ref }} - name: Install Nix uses: DeterminateSystems/determinate-nix-action@v3 with: @@ -671,7 +700,7 @@ jobs: local max="${NIX_GC_RACE_MAX_RETRIES:-10}" local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}" local attempt=1 - local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit + local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit start="$(date +%s)" @@ -719,7 +748,7 @@ jobs: if [ "$rc" -eq 0 ]; then echo 
"::notice::[ci] completed $task in $elapsed s" if [ "$attempt" -gt 1 ]; then - write_summary success "Recovered from Nix GC race after retry" + write_summary success "Recovered from transient Nix failure after retry" else write_summary success fi @@ -735,18 +764,22 @@ jobs: tr -d '[:space:]' || true) saw_invalid_path=false saw_cachix_signature=false + saw_fetch_signature=false [ -n "$path" ] && saw_invalid_path=true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true + printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true rm -f "$log" - if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then - echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race" - write_summary failure "No Nix GC race signature detected" + if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then + echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure" + write_summary failure "No transient Nix failure signature detected" return "$rc" fi - if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then + if [ "$saw_fetch_signature" = true ]; then + echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache" + elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path" elif [ "$saw_cachix_signature" = true ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without 
extracted store path (attempt $attempt/$max)" @@ -761,8 +794,8 @@ jobs: now=$(date +%s) elapsed=$((now - start)) - echo "::error::Nix GC race retry exhausted for $task ($max attempts)" - write_summary failure "Nix GC race retry exhausted" + echo "::error::Transient Nix retry exhausted for $task ($max attempts)" + write_summary failure "Transient Nix retry exhausted" return 1 } EOF @@ -827,7 +860,11 @@ jobs: run: | echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:" echo " https://github.com/overengineeringstudio/effect-utils/issues/201" + concurrency: + group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-lint" + cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }} test: + if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }} strategy: fail-fast: false matrix: @@ -843,6 +880,11 @@ jobs: GITHUB_TOKEN: ${{ github.token }} steps: - uses: actions/checkout@v6 + - name: Checkout CI measurement baseline ref + if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }} + uses: actions/checkout@v6 + with: + ref: ${{ inputs.measurement_baseline_ref }} - name: Install Nix uses: DeterminateSystems/determinate-nix-action@v3 with: @@ -1016,7 +1058,7 @@ jobs: local 
max="${NIX_GC_RACE_MAX_RETRIES:-10}" local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}" local attempt=1 - local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit + local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit start="$(date +%s)" @@ -1064,7 +1106,7 @@ jobs: if [ "$rc" -eq 0 ]; then echo "::notice::[ci] completed $task in $elapsed s" if [ "$attempt" -gt 1 ]; then - write_summary success "Recovered from Nix GC race after retry" + write_summary success "Recovered from transient Nix failure after retry" else write_summary success fi @@ -1080,18 +1122,22 @@ jobs: tr -d '[:space:]' || true) saw_invalid_path=false saw_cachix_signature=false + saw_fetch_signature=false [ -n "$path" ] && saw_invalid_path=true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true + printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true rm -f "$log" - if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then - echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race" - write_summary failure "No Nix GC race signature detected" + if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then + echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure" + write_summary failure "No transient Nix failure signature detected" return "$rc" fi - if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then + if [ "$saw_fetch_signature" = true ]; then + echo "::warning::Nix source fetch 
corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache" + elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path" elif [ "$saw_cachix_signature" = true ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)" @@ -1106,8 +1152,8 @@ jobs: now=$(date +%s) elapsed=$((now - start)) - echo "::error::Nix GC race retry exhausted for $task ($max attempts)" - write_summary failure "Nix GC race retry exhausted" + echo "::error::Transient Nix retry exhausted for $task ($max attempts)" + write_summary failure "Transient Nix retry exhausted" return 1 } EOF @@ -1172,7 +1218,11 @@ jobs: run: | echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:" echo " https://github.com/overengineeringstudio/effect-utils/issues/201" + concurrency: + group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-test-${{ strategy.job-index }}" + cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }} nix-check: + if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }} strategy: fail-fast: false 
matrix: @@ -1188,6 +1238,11 @@ jobs: GITHUB_TOKEN: ${{ github.token }} steps: - uses: actions/checkout@v6 + - name: Checkout CI measurement baseline ref + if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }} + uses: actions/checkout@v6 + with: + ref: ${{ inputs.measurement_baseline_ref }} - name: Install Nix uses: DeterminateSystems/determinate-nix-action@v3 with: @@ -1361,7 +1416,7 @@ jobs: local max="${NIX_GC_RACE_MAX_RETRIES:-10}" local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}" local attempt=1 - local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit + local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit start="$(date +%s)" @@ -1409,7 +1464,7 @@ jobs: if [ "$rc" -eq 0 ]; then echo "::notice::[ci] completed $task in $elapsed s" if [ "$attempt" -gt 1 ]; then - write_summary success "Recovered from Nix GC race after retry" + write_summary success "Recovered from transient Nix failure after retry" else write_summary success fi @@ -1425,18 +1480,22 @@ jobs: tr -d '[:space:]' || true) saw_invalid_path=false saw_cachix_signature=false + saw_fetch_signature=false [ -n "$path" ] && saw_invalid_path=true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true + printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true rm -f "$log" - if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then - echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race" - write_summary failure "No Nix GC race signature detected" + if 
[ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then + echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure" + write_summary failure "No transient Nix failure signature detected" return "$rc" fi - if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then + if [ "$saw_fetch_signature" = true ]; then + echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache" + elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path" elif [ "$saw_cachix_signature" = true ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)" @@ -1451,8 +1510,8 @@ jobs: now=$(date +%s) elapsed=$((now - start)) - echo "::error::Nix GC race retry exhausted for $task ($max attempts)" - write_summary failure "Nix GC race retry exhausted" + echo "::error::Transient Nix retry exhausted for $task ($max attempts)" + write_summary failure "Transient Nix retry exhausted" return 1 } EOF @@ -1517,7 +1576,11 @@ jobs: run: | echo "If this looks like Namespace runner Nix store corruption (e.g. \"... 
is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:" echo " https://github.com/overengineeringstudio/effect-utils/issues/201" + concurrency: + group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-nix-check-${{ strategy.job-index }}" + cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }} nix-fod-check: + if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }} strategy: fail-fast: false matrix: @@ -1533,6 +1596,11 @@ jobs: GITHUB_TOKEN: ${{ github.token }} steps: - uses: actions/checkout@v6 + - name: Checkout CI measurement baseline ref + if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }} + uses: actions/checkout@v6 + with: + ref: ${{ inputs.measurement_baseline_ref }} - name: Install Nix uses: DeterminateSystems/determinate-nix-action@v3 with: @@ -1614,16 +1682,127 @@ jobs: - name: Cold pnpm deps validation shell: bash run: | - set -euo pipefail - for attr in '.#genie-pnpm-deps' '.#megarepo-pnpm-deps' '.#oxc-config-plugin-pnpm-deps'; do + __nix_gc_retry_helper=$(mktemp) + cat > "$__nix_gc_retry_helper" <<'EOF' + #!/usr/bin/env bash + + run_nix_gc_race_retry() { + local task="$1" + local command="$2" + local max="${NIX_GC_RACE_MAX_RETRIES:-10}" + local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}" + local attempt=1 + local log 
rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit + + start="$(date +%s)" + + write_summary() { + [ -n "${GITHUB_STEP_SUMMARY:-}" ] || return 0 + { + echo "### CI Task" + echo "- Task: $task" + echo "- Status: $1" + echo "- Duration: $elapsed s" + echo "- Attempts: $attempt/$max" + [ -z "${2:-}" ] || echo "- Note: $2" + } >> "$GITHUB_STEP_SUMMARY" + } + + while [ "$attempt" -le "$max" ]; do + echo "::notice::[ci] starting $task (attempt $attempt/$max)" + ( + while sleep "$heartbeat"; do + now=$(date +%s) + elapsed=$((now - start)) + echo "::notice::[ci] $task still running after $elapsed s (attempt $attempt/$max)" + done + ) & + hb_pid=$! + + log=$(mktemp) + had_errexit=false + case $- in + *e*) had_errexit=true ;; + esac + set +e + eval "$command" > >(tee -a "$log") 2> >(tee -a "$log" >&2) + rc=$? + if [ "$had_errexit" = true ]; then + set -e + fi + + kill "$hb_pid" 2>/dev/null || true + wait "$hb_pid" 2>/dev/null || true + + now=$(date +%s) + elapsed=$((now - start)) + + if [ "$rc" -eq 0 ]; then + echo "::notice::[ci] completed $task in $elapsed s" + if [ "$attempt" -gt 1 ]; then + write_summary success "Recovered from transient Nix failure after retry" + else + write_summary success + fi + rm -f "$log" + return 0 + fi + + flattened=$(tr '\r\n' ' ' < "$log" | sed -E $'s/\x1B\[[0-9;]*m//g') + path=$(printf '%s' "$flattened" | + grep -o "error:[[:space:]]*path '/nix/store/[^']*'[[:space:]]*is not valid" | + head -1 | + grep -o "/nix/store/[^']*" | + tr -d '[:space:]' || true) + saw_invalid_path=false + saw_cachix_signature=false + saw_fetch_signature=false + [ -n "$path" ] && saw_invalid_path=true + printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true + printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true + printf '%s' "$flattened" | 
grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true + rm -f "$log" + + if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then + echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure" + write_summary failure "No transient Nix failure signature detected" + return "$rc" + fi + + if [ "$saw_fetch_signature" = true ]; then + echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache" + elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then + echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path" + elif [ "$saw_cachix_signature" = true ]; then + echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)" + else + echo "::warning::Nix store validity race detected for $task (attempt $attempt/$max): $path" + fi + + [ -z "$path" ] || nix-store --realise "$path" 2>/dev/null || true + rm -rf ~/.cache/nix/eval-cache-* + attempt=$((attempt + 1)) + done + + now=$(date +%s) + elapsed=$((now - start)) + echo "::error::Transient Nix retry exhausted for $task ($max attempts)" + write_summary failure "Transient Nix retry exhausted" + return 1 + } + EOF + . "$__nix_gc_retry_helper" + rm -f "$__nix_gc_retry_helper" + run_nix_gc_race_retry 'Cold pnpm deps validation' 'set -euo pipefail + for attr in '"'"'.#genie-pnpm-deps'"'"' '"'"'.#megarepo-pnpm-deps'"'"' '"'"'.#oxc-config-plugin-pnpm-deps'"'"'; do echo "::group::rebuild-check $attr" # Step 1: Realize once (may substitute) so rebuild has a trusted output to compare against. 
- nix build --no-link "$attr" --option substituters 'https://cache.nixos.org' + nix build --no-link "$attr" --option substituters '"'"'https://cache.nixos.org'"'"' # Step 2: Rebuild and compare locally. This fails on stale fixed-output hashes without # relying on whether a shared daemon store made the prior out path disappear. - nix build --no-link --rebuild "$attr" --option substituters 'https://cache.nixos.org' + nix build --no-link --rebuild "$attr" --option substituters '"'"'https://cache.nixos.org'"'"' echo "::endgroup::" - done + done' - name: Nix diagnostics summary if: failure() shell: bash @@ -1674,7 +1853,11 @@ jobs: run: | echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:" echo " https://github.com/overengineeringstudio/effect-utils/issues/201" + concurrency: + group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-nix-fod-check-${{ strategy.job-index }}" + cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }} pnpm-builder-contract: + if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }} runs-on: [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}'] timeout-minutes: 30 @@ -1687,6 +1870,11 @@ jobs: GITHUB_TOKEN: ${{ github.token }} steps: - uses: 
actions/checkout@v6 + - name: Checkout CI measurement baseline ref + if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }} + uses: actions/checkout@v6 + with: + ref: ${{ inputs.measurement_baseline_ref }} - name: Install Nix uses: DeterminateSystems/determinate-nix-action@v3 with: @@ -1933,7 +2121,11 @@ jobs: run: | echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:" echo " https://github.com/overengineeringstudio/effect-utils/issues/201" + concurrency: + group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-pnpm-builder-contract" + cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }} pnpm-regression: + if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }} runs-on: [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}'] timeout-minutes: 30 @@ -1946,6 +2138,11 @@ jobs: GITHUB_TOKEN: ${{ github.token }} steps: - uses: actions/checkout@v6 + - name: Checkout CI measurement baseline ref + if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }} + uses: actions/checkout@v6 + with: + ref: ${{ inputs.measurement_baseline_ref }} - name: Install Nix uses: 
DeterminateSystems/determinate-nix-action@v3 with: @@ -2110,6 +2307,7 @@ jobs: - name: pnpm regression suite run: | bash genie/ci-scripts/nix-gc-race-retry.test.sh + bash genie/ci-scripts/ci-measurement-comparison.test.sh bash nix/workspace-tools/lib/mk-pnpm-cli/tests/run.sh --skip-genie --skip-megarepo --skip-devenv-shell --skip-downstream-megarepo - name: Save pnpm state if: ${{ success() && steps.restore-pnpm-state.outputs.cache-hit != 'true' }} @@ -2169,10 +2367,14 @@ jobs: run: | echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:" echo " https://github.com/overengineeringstudio/effect-utils/issues/201" + concurrency: + group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-pnpm-regression" + cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }} devenv-perf: runs-on: [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}'] permissions: + actions: read contents: write issues: write pull-requests: write @@ -2185,10 +2387,18 @@ jobs: GITHUB_TOKEN: ${{ github.token }} ARTIFACT_DIR: tmp/devenv-perf-ci OTEL_SERVICE_NAME: devenv-perf-ci - DEVENV_PERF_REGRESSION_MODE: warn RUNNER_CLASS: 'namespace-profile-linux-x86-64,namespace-features:github.run-id=${{ github.run_id }}' + CI_MEASUREMENT_SUBJECT_REF: ${{ 
inputs.measurement_baseline_ref || github.event.pull_request.head.ref || github.ref }} + CI_MEASUREMENT_SUBJECT_SHA: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.sha || github.sha }} + CI_MEASUREMENT_SUBJECT_LABEL: ${{ inputs.measurement_baseline_label }} + CI_MEASUREMENT_ALLOW_PROBE_FAILURES: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }} steps: - uses: actions/checkout@v6 + - name: Checkout CI measurement baseline ref + if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }} + uses: actions/checkout@v6 + with: + ref: ${{ inputs.measurement_baseline_ref }} - name: Install Nix uses: DeterminateSystems/determinate-nix-action@v3 with: @@ -2350,138 +2560,85 @@ jobs: EOF echo "::warning::Intentional failure for diagnostics validation (#272)" exit 1 - - name: 'Download previous artifact: devenv-perf' + - name: Benchmark devenv surfaces shell: bash - env: - GH_TOKEN: ${{ github.token }} - BASELINE_ARTIFACT_NAME: devenv-perf - BASELINE_OUTPUT_DIR: tmp/devenv-perf-ci/baseline - BASELINE_WORKFLOW_NAME: ${{ github.workflow }} - BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }} - BASELINE_SEED_RUN_IDS: '25710204667' - BASELINE_MAX_RUNS: '5' run: | set -euo pipefail - mkdir -p "$BASELINE_OUTPUT_DIR" - - if ! command -v gh >/dev/null 2>&1; then - echo "::notice::gh is not available; skipping previous artifact download" - exit 0 - fi - - repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}" - workflow="${BASELINE_WORKFLOW_NAME:-CI}" - branch="${BASELINE_BRANCH:-${GITHUB_BASE_REF:-${GITHUB_REF_NAME:-main}}}" + ensure_ci_measurement_tool() { + tool_name="$1" + nix_attr="$2" + if command -v "$tool_name" >/dev/null 2>&1; then + return 0 + fi + if ! 
command -v nix >/dev/null 2>&1; then + return 1 + fi + if tool_out="$(nix build --no-link --print-out-paths "nixpkgs#$nix_attr" 2>/dev/null)"; then + while IFS= read -r tool_path; do + [ -n "$tool_path" ] || continue + [ -d "$tool_path/bin" ] || continue + export PATH="$tool_path/bin:$PATH" + if command -v "$tool_name" >/dev/null 2>&1; then + return 0 + fi + done </dev/null 2>&1 + } - candidate_runs="$( - gh run list \ - --repo "$repo" \ - --workflow "$workflow" \ - --branch "$branch" \ - --event push \ - --status success \ - --json databaseId,headSha \ - --limit 20 \ - --jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]' - )" + require_ci_measurement_tool() { + tool_name="$1" + nix_attr="$2" + if ensure_ci_measurement_tool "$tool_name" "$nix_attr"; then + return 0 + fi + echo "::error::$tool_name is not available; unable to produce CI measurement artifact" + exit 1 + } - candidate_runs="$candidate_runs - $BASELINE_SEED_RUN_IDS" + require_ci_measurement_tool awk gawk.out + require_ci_measurement_tool jq jq.bin - max_runs="${BASELINE_MAX_RUNS:-5}" - if ! 
[[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then - max_runs=1 - fi + ARTIFACT_DIR="$(mkdir -p "$ARTIFACT_DIR" && cd "$ARTIFACT_DIR" && pwd -P)" + CI_MEASUREMENT_HEAD_DIR="${CI_MEASUREMENT_HEAD_DIR:-$PWD}" + CI_MEASUREMENT_BASE_DIR="${CI_MEASUREMENT_BASE_DIR:-${RUNNER_TEMP:-/tmp}/ci-measurement-base}" + CI_MEASUREMENT_PAIRED_ENABLED=0 + CI_MEASUREMENT_ORDER_SEED="${CI_MEASUREMENT_ORDER_SEED:-${GITHUB_RUN_ID:-local}-${GITHUB_RUN_ATTEMPT:-0}-${GITHUB_SHA:-unknown}}" - run_id="" - artifact_name="" - artifact_id="" - downloaded_runs_file="$BASELINE_OUTPUT_DIR/baseline-runs.jsonl" - seen_runs_file="$BASELINE_OUTPUT_DIR/baseline-seen-runs.txt" - : >"$downloaded_runs_file" - : >"$seen_runs_file" - for candidate_run in $candidate_runs; do - if [ -z "$candidate_run" ]; then - continue + prepare_paired_base_worktree() { + if [ "${GITHUB_EVENT_NAME:-}" != "pull_request" ]; then + return 0 fi - if grep -qxF "$candidate_run" "$seen_runs_file"; then - continue + if [ -n "${CI_MEASUREMENT_ALLOW_PROBE_FAILURES:-}" ]; then + return 0 fi - printf '%s\n' "$candidate_run" >>"$seen_runs_file" - if [ "$(wc -l <"$downloaded_runs_file" | tr -d ' ')" -ge "$max_runs" ]; then - break + if [ ! 
-f "${GITHUB_EVENT_PATH:-}" ]; then + return 0 fi - artifact_json="$( - gh api "repos/$repo/actions/runs/$candidate_run/artifacts" \ - --jq '.artifacts - | map(select(.expired == false)) - | map(select(.name == env.BASELINE_ARTIFACT_NAME or (.name | startswith(env.BASELINE_ARTIFACT_NAME + "-")))) - | sort_by(.created_at // "") - | reverse - | .[0] // empty' - )" - - if [ -n "$artifact_json" ]; then - current_artifact_name="$(printf '%s' "$artifact_json" | jq -r '.name')" - current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')" - current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run" - mkdir -p "$current_output_dir" - if gh run download "$candidate_run" \ - --repo "$repo" \ - --name "$current_artifact_name" \ - --dir "$current_output_dir"; then - if [ -z "$run_id" ]; then - run_id="$candidate_run" - artifact_name="$current_artifact_name" - artifact_id="$current_artifact_id" - fi - jq -cn \ - --arg runId "$candidate_run" \ - --arg artifactName "$current_artifact_name" \ - --arg artifactId "$current_artifact_id" \ - --arg path "run-$candidate_run" \ - '{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \ - >>"$downloaded_runs_file" - else - echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run" - fi + local base_sha + base_sha="$(jq -r '.pull_request.base.sha // empty' "$GITHUB_EVENT_PATH")" + if [ -z "$base_sha" ]; then + echo "::notice::paired wall-clock baseline unavailable: pull_request.base.sha missing" + return 0 fi - done - - if [ -z "$run_id" ] || [ -z "$artifact_name" ]; then - echo "::notice::no successful baseline run found for $repo workflow=$workflow branch=$branch" - exit 0 - fi - - jq -n \ - --slurpfile runs "$downloaded_runs_file" \ - --argjson schemaVersion 1 \ - --arg repository "$repo" \ - --arg workflow "$workflow" \ - --arg branch "$branch" \ - --arg runId "$run_id" \ - --arg artifactName "$artifact_name" \ - --arg artifactId "$artifact_id" \ - '{ - 
schemaVersion: $schemaVersion, - source: "github-actions-artifact", - repository: $repository, - workflow: $workflow, - branch: $branch, - runId: $runId, - artifactName: $artifactName, - artifactId: $artifactId, - runs: $runs - }' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json" - echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR" + rm -rf "$CI_MEASUREMENT_BASE_DIR" + git worktree prune >/dev/null 2>&1 || true + if git fetch --no-tags --depth=1 origin "$base_sha" \ + && git worktree add --detach "$CI_MEASUREMENT_BASE_DIR" "$base_sha" >/dev/null; then + CI_MEASUREMENT_PAIRED_ENABLED=1 + echo "::notice::paired wall-clock baseline prepared at $CI_MEASUREMENT_BASE_DIR ($base_sha)" + else + echo "::warning::paired wall-clock baseline unavailable: failed to prepare base worktree $base_sha" + CI_MEASUREMENT_PAIRED_ENABLED=0 + fi + } - - name: Benchmark devenv surfaces - shell: bash - run: | - set -euo pipefail + prepare_paired_base_worktree mkdir -p "$ARTIFACT_DIR/traces" @@ -2521,6 +2678,8 @@ jobs: local stdout="$7" local stderr="$8" local trace="$9" + local gate_policy="${10}" + local metadata_json="${11}" local samples_file="$ARTIFACT_DIR/$id.samples.json" if [ "$first" -eq 0 ]; then @@ -2535,13 +2694,44 @@ jobs: --arg group "$group" \ --arg description "$description" \ --argjson status "$status" \ - --argjson durationMs "$duration_ms" \ - --arg stdout "$stdout" \ - --arg stderr "$stderr" \ - --arg trace "$trace" \ - '($samples[0] // []) as $sampleList - | ($sampleList | map(select(.status == 0) | .durationMs)) as $successfulDurations - | { + --argjson durationMs "$duration_ms" \ + --arg stdout "$stdout" \ + --arg stderr "$stderr" \ + --arg trace "$trace" \ + --argjson gatePolicy "$gate_policy" \ + --argjson metadata "$metadata_json" \ + 'def median: + sort as $sorted + | ($sorted | length) as $count + | if $count == 0 then null + elif ($count % 2) == 1 then 
$sorted[($count / 2 | floor)] + else (($sorted[($count / 2 - 1)] + $sorted[($count / 2)]) / 2) + end; + def percentile($p): + sort as $sorted + | ($sorted | length) as $count + | if $count == 0 then null + else $sorted[(($p * ($count - 1)) | floor)] + end; + ($samples[0] // []) as $sampleList + | ($sampleList | map(select((.subject // "head") == "head" and .phase != "warmup" and .status == 0) | .durationMs)) as $successfulDurations + | ($sampleList | map(select((.subject // "head") == "head" and .phase == "warmup"))) as $warmupSamples + | ($sampleList | map(select((.subject // "head") == "head" and .phase == "measured" and .status == 0 and .pairIndex != null))) as $headSamples + | ($sampleList | map(select(.subject == "base" and .phase == "measured" and .status == 0 and .pairIndex != null))) as $baseSamples + | ( + $headSamples + | map(. as $head | $baseSamples[]? | select(.pairIndex == $head.pairIndex) | { + pairIndex: $head.pairIndex, + currentDurationMs: $head.durationMs, + baselineDurationMs: .durationMs, + deltaMs: ($head.durationMs - .durationMs) + }) + ) as $pairedSamples + | ($pairedSamples | map(.currentDurationMs)) as $pairedCurrentDurations + | ($pairedSamples | map(.baselineDurationMs)) as $pairedBaselineDurations + | ($pairedSamples | map(.deltaMs)) as $pairedDeltaDurations + | ($pairedDeltaDurations | median) as $pairedDeltaMedian + | { id:$id, name:$id, label:$label, @@ -2552,12 +2742,34 @@ jobs: stdout:$stdout, stderr:$stderr, trace:(if $trace == "" then null else $trace end), - statistics: { + metadata:$metadata, + gatePolicy:$gatePolicy, + statistics: { sampleCount: ($sampleList | length), + warmupCount: ($warmupSamples | length), + measuredSampleCount: ( + $sampleList + | map(select((.subject // "head") == "head" and .phase != "warmup")) + | length + ), successfulSampleCount: ($successfulDurations | length), minDurationMs: ($successfulDurations | min), maxDurationMs: ($successfulDurations | max), - medianDurationMs: $durationMs + 
medianDurationMs: $durationMs, + pairedSampleCount: ($pairedSamples | length), + pairedCurrentMedianDurationMs: ($pairedCurrentDurations | median), + pairedBaselineMedianDurationMs: ($pairedBaselineDurations | median), + pairedDeltaMedianDurationMs: $pairedDeltaMedian, + pairedDeltaMinDurationMs: ($pairedDeltaDurations | min), + pairedDeltaMaxDurationMs: ($pairedDeltaDurations | max), + pairedDeltaP25DurationMs: ($pairedDeltaDurations | percentile(0.25)), + pairedDeltaP75DurationMs: ($pairedDeltaDurations | percentile(0.75)), + pairedDeltaMadDurationMs: ( + if $pairedDeltaMedian == null then null + else ($pairedDeltaDurations | map(. - $pairedDeltaMedian | if . < 0 then -. else . end) | median) + end + ), + pairedDeltaSampleDurationMs: $pairedDeltaDurations }, samples:$sampleList }' \ @@ -2568,10 +2780,13 @@ jobs: local id="$1" local label="$2" local group="$3" - local description="$4" - local trace_file="$5" - local repetitions="$6" - shift 6 + local description="$4" + local trace_file="$5" + local warmup_repetitions="$6" + local repetitions="$7" + local gate_policy="$8" + local metadata_json="$9" + shift 9 case "$trace_file" in '$ARTIFACT_DIR'*) trace_file="${ARTIFACT_DIR}${trace_file#'$ARTIFACT_DIR'}" ;; esac @@ -2584,11 +2799,24 @@ jobs: if ! [[ "$repetitions" =~ ^[0-9]+$ ]] || [ "$repetitions" -lt 1 ]; then repetitions=1 fi + if ! 
[[ "$warmup_repetitions" =~ ^[0-9]+$ ]] || [ "$warmup_repetitions" -lt 0 ]; then + warmup_repetitions=0 + fi printf '[' >"$samples_file" local sample_first=1 - local sample_index sample_stdout sample_stderr sample_trace expanded - for sample_index in $(seq 1 "$repetitions"); do + local sample_index measured_index total_repetitions phase sample_stdout sample_stderr sample_trace expanded + local order_offset + order_offset="$(printf '%s' "$CI_MEASUREMENT_ORDER_SEED:$id" | cksum | awk '{ print $1 % 2 }')" + total_repetitions=$((warmup_repetitions + repetitions)) + for sample_index in $(seq 1 "$total_repetitions"); do + if [ "$sample_index" -le "$warmup_repetitions" ]; then + phase="warmup" + measured_index="" + else + phase="measured" + measured_index=$((sample_index - warmup_repetitions)) + fi sample_stdout="$ARTIFACT_DIR/$id.$sample_index.stdout" sample_stderr="$ARTIFACT_DIR/$id.$sample_index.stderr" sample_trace="" @@ -2599,19 +2827,65 @@ jobs: fi fi - started="$(date +%s%3N)" - set +e expanded=() for arg in "$@"; do case "$arg" in '$DEVENV_BIN') expanded+=("${DEVENV_BIN:?DEVENV_BIN not set}") ;; + '$DEVENV_SHELL_TRACE_COMMAND') + if "${DEVENV_BIN:?DEVENV_BIN not set}" --help 2>&1 | grep -q -- '--trace-to'; then + expanded+=("${DEVENV_BIN:?DEVENV_BIN not set}" "--trace-to" "json:file:$sample_trace" "shell" "--no-reload" "--" "true") + elif "${DEVENV_BIN:?DEVENV_BIN not set}" --help 2>&1 | grep -q -- '--trace-format'; then + expanded+=("${DEVENV_BIN:?DEVENV_BIN not set}" "--trace-format" "json" "shell" "--no-reload" "--" "true") + sample_trace="" + else + expanded+=("${DEVENV_BIN:?DEVENV_BIN not set}" "shell" "--no-reload" "--" "true") + sample_trace="" + fi + ;; '$ARTIFACT_DIR'*) expanded+=("${ARTIFACT_DIR}${arg#'$ARTIFACT_DIR'}") ;; 'json:file:$trace_file') expanded+=("json:file:$sample_trace") ;; '$trace_file') expanded+=("file:$sample_trace") ;; *) expanded+=("$arg") ;; esac done - "${expanded[@]}" >"$sample_stdout" 2>"$sample_stderr" + + local 
base_ran_before_head=0 base_stdout base_stderr base_started base_ended base_status base_duration_ms + if [ "$phase" = "measured" ] && [ "$CI_MEASUREMENT_PAIRED_ENABLED" -eq 1 ] && [ $(((measured_index + order_offset) % 2)) -eq 0 ]; then + base_ran_before_head=1 + base_stdout="$ARTIFACT_DIR/$id.$sample_index.base.stdout" + base_stderr="$ARTIFACT_DIR/$id.$sample_index.base.stderr" + base_started="$(date +%s%3N)" + set +e + (cd "$CI_MEASUREMENT_BASE_DIR" && "${expanded[@]}") >"$base_stdout" 2>"$base_stderr" + base_status=$? + set -e + base_ended="$(date +%s%3N)" + base_duration_ms=$((base_ended - base_started)) + + if [ "$sample_first" -eq 0 ]; then + printf ',' >>"$samples_file" + fi + sample_first=0 + jq -cn \ + --argjson index "$sample_index" \ + --arg measuredIndex "$measured_index" \ + --argjson status "$base_status" \ + --argjson durationMs "$base_duration_ms" \ + --arg stdout "$base_stdout" \ + --arg stderr "$base_stderr" \ + --arg orderSeed "$CI_MEASUREMENT_ORDER_SEED" \ + '{index:$index,measuredIndex:($measuredIndex | tonumber),pairIndex:($measuredIndex | tonumber),subject:"base",phase:"measured",status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:null,order:"base-head",orderSeed:$orderSeed}' \ + >>"$samples_file" + + if [ "$base_status" -ne 0 ]; then + echo "::warning::$id paired baseline sample $measured_index failed after ${base_duration_ms}ms; this pair is excluded from wall-clock gating" + tail -40 "$base_stderr" || true + fi + fi + + started="$(date +%s%3N)" + set +e + (cd "$CI_MEASUREMENT_HEAD_DIR" && "${expanded[@]}") >"$sample_stdout" 2>"$sample_stderr" status=$? 
set -e ended="$(date +%s%3N)" @@ -2623,16 +2897,49 @@ jobs: sample_first=0 jq -cn \ --argjson index "$sample_index" \ + --arg measuredIndex "$measured_index" \ + --arg phase "$phase" \ --argjson status "$status" \ --argjson durationMs "$duration_ms" \ --arg stdout "$sample_stdout" \ --arg stderr "$sample_stderr" \ --arg trace "$sample_trace" \ - '{index:$index,status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:(if $trace == "" then null else $trace end)}' \ + --arg order "$(if [ "$phase" = "measured" ] && [ "$base_ran_before_head" -eq 1 ]; then printf base-head; else printf head-base; fi)" \ + --arg orderSeed "$CI_MEASUREMENT_ORDER_SEED" \ + '{index:$index,measuredIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),pairIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),subject:"head",phase:$phase,status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:(if $trace == "" then null else $trace end),order:(if $phase == "measured" then $order else null end),orderSeed:(if $phase == "measured" then $orderSeed else null end)}' \ >>"$samples_file" - stdout="$sample_stdout" - stderr="$sample_stderr" + if [ "$phase" = "measured" ] && [ "$status" -eq 0 ] && [ "$CI_MEASUREMENT_PAIRED_ENABLED" -eq 1 ] && [ "$base_ran_before_head" -eq 0 ]; then + base_stdout="$ARTIFACT_DIR/$id.$sample_index.base.stdout" + base_stderr="$ARTIFACT_DIR/$id.$sample_index.base.stderr" + base_started="$(date +%s%3N)" + set +e + (cd "$CI_MEASUREMENT_BASE_DIR" && "${expanded[@]}") >"$base_stdout" 2>"$base_stderr" + base_status=$? 
+ set -e + base_ended="$(date +%s%3N)" + base_duration_ms=$((base_ended - base_started)) + + printf ',' >>"$samples_file" + jq -cn \ + --argjson index "$sample_index" \ + --arg measuredIndex "$measured_index" \ + --argjson status "$base_status" \ + --argjson durationMs "$base_duration_ms" \ + --arg stdout "$base_stdout" \ + --arg stderr "$base_stderr" \ + --arg orderSeed "$CI_MEASUREMENT_ORDER_SEED" \ + '{index:$index,measuredIndex:($measuredIndex | tonumber),pairIndex:($measuredIndex | tonumber),subject:"base",phase:"measured",status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:null,order:"head-base",orderSeed:$orderSeed}' \ + >>"$samples_file" + + if [ "$base_status" -ne 0 ]; then + echo "::warning::$id paired baseline sample $measured_index failed after ${base_duration_ms}ms; this pair is excluded from wall-clock gating" + tail -40 "$base_stderr" || true + fi + fi + + stdout="$sample_stdout" + stderr="$sample_stderr" trace_file="$sample_trace" if [ "$status" -ne 0 ]; then @@ -2641,29 +2948,36 @@ jobs: done printf ']\n' >>"$samples_file" - status="$(jq -r 'map(.status) | max // 0' "$samples_file")" - duration_ms="$(jq -r 'map(select(.status == 0) | .durationMs) as $values | if ($values | length) == 0 then (map(.durationMs) | max // 0) else ($values | sort | .[(length - 1) / 2 | floor]) end' "$samples_file")" + status="$(jq -r 'map(select((.subject // "head") == "head") | .status) | max // 0' "$samples_file")" + duration_ms="$(jq -r 'map(select((.subject // "head") == "head" and .phase != "warmup" and .status == 0) | .durationMs) as $values | if ($values | length) == 0 then (map(select((.subject // "head") == "head") | .durationMs) | max // 0) else ($values | sort | .[(length - 1) / 2 | floor]) end' "$samples_file")" cp "$stdout" "$ARTIFACT_DIR/$id.stdout" 2>/dev/null || true cp "$stderr" "$ARTIFACT_DIR/$id.stderr" 2>/dev/null || true - json_append_timing "$id" "$label" "$group" "$description" "$status" "$duration_ms" 
"$ARTIFACT_DIR/$id.stdout" "$ARTIFACT_DIR/$id.stderr" "$trace_file" + json_append_timing "$id" "$label" "$group" "$description" "$status" "$duration_ms" "$ARTIFACT_DIR/$id.stdout" "$ARTIFACT_DIR/$id.stderr" "$trace_file" "$gate_policy" "$metadata_json" if [ "$status" -ne 0 ]; then - echo "::error::$id failed after ${duration_ms}ms; stderr tail follows" + if [ "${CI_MEASUREMENT_ALLOW_PROBE_FAILURES:-}" = "1" ]; then + echo "::warning::$id failed after ${duration_ms}ms; keeping earlier successful baseline probes and excluding this failed probe from numeric observations" + else + echo "::error::$id failed after ${duration_ms}ms; stderr tail follows" + fi tail -80 "$stderr" || true - return "$status" + if [ "${CI_MEASUREMENT_ALLOW_PROBE_FAILURES:-}" != "1" ]; then + return "$status" + fi fi } - measure 'shell_eval_traced' 'Shell eval with OTEL trace' 'devenv shell' 'Evaluates the dev shell with native devenv JSON tracing enabled.' '$ARTIFACT_DIR/traces/shell_eval_traced.json' '1' '$DEVENV_BIN' '--trace-to' 'json:file:$trace_file' 'shell' '--no-reload' '--' 'true' - measure 'shell_eval_warm' 'Warm shell eval' 'devenv shell' 'Evaluates a warm dev shell without reloading direnv state.' '' '3' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'true' - measure 'tasks_list' 'devenv tasks list' 'devenv cli' 'Lists devenv tasks to measure task graph loading overhead.' '' '5' '$DEVENV_BIN' 'tasks' 'list' - measure 'processes_help' 'devenv processes --help' 'devenv cli' 'Loads the devenv processes command help path.' '' '5' '$DEVENV_BIN' 'processes' '--help' - measure 'task_pnpm_install' 'pnpm install task' 'workspace setup' 'Runs the cached pnpm install devenv task.' '' '1' '$DEVENV_BIN' 'tasks' 'run' 'pnpm:install' '--mode' 'before' '--no-tui' '--show-output' - measure 'task_genie_run' 'Genie run task' 'genie' 'Runs the normal devenv genie:run task including its declared dependencies.' 
'' '1' '$DEVENV_BIN' 'tasks' 'run' 'genie:run' '--mode' 'before' '--no-tui' '--show-output' - measure 'task_check_quick' 'Quick check task' 'quality gates' 'Runs the fast local quality gate through devenv.' '' '1' '$DEVENV_BIN' 'tasks' 'run' 'check:quick' '--mode' 'before' '--no-tui' '--show-output' - measure 'genie_check_direct' 'Genie check direct' 'genie' 'Runs Genie directly in check mode to isolate generator runtime from devenv task dependency overhead.' '' '3' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'bun' 'packages/@overeng/genie/bin/genie.tsx' '--output' 'ci-plain' '--check' + measure 'shell_eval_traced' 'Shell eval with OTEL trace' 'devenv shell' 'Evaluates the dev shell with native devenv JSON tracing enabled.' '$ARTIFACT_DIR/traces/shell_eval_traced.json' '0' '1' '{"enabled":false,"minBaselineSources":10,"minCurrentSamples":3,"warnRatio":1.25,"failRatio":1.5,"warnAbs":1.5,"failAbs":3,"noiseFloor":0.5,"statisticalToleranceRatio":0.2,"statisticalToleranceAbs":1}' '{"path":[],"dimensions":{}}' '$DEVENV_SHELL_TRACE_COMMAND' + measure 'shell_eval_warm' 'Warm shell eval' 'devenv shell' 'Evaluates a warm dev shell without reloading direnv state.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '{"path":[],"dimensions":{}}' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'true' + measure 'tasks_list' 'devenv tasks list' 'devenv cli' 'Lists devenv tasks to measure task graph loading overhead.' 
'' '1' '9' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":7,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.25,"failRatio":1.5,"warnAbs":0.05,"failAbs":0.15,"noiseFloor":0.03,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.03}' '{"path":[],"dimensions":{}}' '$DEVENV_BIN' 'tasks' 'list' + measure 'processes_help' 'devenv processes --help' 'devenv cli' 'Loads the devenv processes command help path.' '' '1' '9' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":7,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.25,"failRatio":1.5,"warnAbs":0.05,"failAbs":0.15,"noiseFloor":0.03,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.03}' '{"path":[],"dimensions":{}}' '$DEVENV_BIN' 'processes' '--help' + measure 'task_pnpm_install' 'pnpm install task' 'workspace setup' 'Runs the cached pnpm install devenv task.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '{"path":[],"dimensions":{}}' '$DEVENV_BIN' 'tasks' 'run' 'pnpm:install' '--mode' 'before' '--no-tui' '--show-output' + measure 'task_genie_run' 'Genie run task' 'genie' 'Runs the normal devenv genie:run task including its declared dependencies.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '{"path":[],"dimensions":{}}' '$DEVENV_BIN' 'tasks' 'run' 'genie:run' '--mode' 'before' '--no-tui' '--show-output' + measure 'task_check_quick_warm' 'Warm cached check:quick' 'quality gates' 'Runs the fast local quality gate through devenv after a warmup. This measures the cached no-op path and task/status orchestration overhead.' 
'' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '{"path":["quality gates","check:quick"],"dimensions":{"workload":"cached-no-op","taskCacheMode":"warm"}}' '$DEVENV_BIN' 'tasks' 'run' 'check:quick' '--mode' 'before' '--no-tui' '--show-output' + measure 'task_check_quick_forced' 'Forced check:quick' 'quality gates' 'Runs the fast local quality gate through devenv with task-cache refresh. This measures the developer-facing quick-check workload rather than the cached no-op path.' '' '0' '3' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":3,"minBaselineSources":10,"minCurrentSamples":3,"warnRatio":1.15,"failRatio":1.3,"warnAbs":1.5,"failAbs":4,"noiseFloor":0.75,"statisticalToleranceRatio":0.15,"statisticalToleranceAbs":1}' '{"path":["quality gates","check:quick"],"dimensions":{"workload":"forced-task-cache","taskCacheMode":"refresh"}}' '$DEVENV_BIN' 'tasks' 'run' 'check:quick' '--mode' 'before' '--no-tui' '--show-output' '--refresh-task-cache' + measure 'genie_check_direct' 'Genie check direct' 'genie' 'Runs Genie directly in check mode to isolate generator runtime from devenv task dependency overhead.' 
'' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '{"path":[],"dimensions":{}}' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'bun' 'packages/@overeng/genie/bin/genie.tsx' '--output' 'ci-plain' '--check' printf ']\n' >>"$ARTIFACT_DIR/timings.json" @@ -2698,8 +3012,8 @@ jobs: --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ --arg repository "${GITHUB_REPOSITORY:-unknown}" \ --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \ - --arg ref "${GITHUB_REF:-unknown}" \ - --arg headSha "${GITHUB_SHA:-unknown}" \ + --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \ + --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \ --arg baseSha "${GITHUB_BASE_SHA:-}" \ --arg runnerName "${RUNNER_NAME:-unknown}" \ --arg runnerOs "${RUNNER_OS:-unknown}" \ @@ -2713,11 +3027,16 @@ jobs: --arg traceId "${TRACE_ID:-}" \ --arg devenvRev "${DEVENV_REV:-unknown}" \ --arg otelServiceName "${OTEL_SERVICE_NAME:-unknown}" \ + --arg orderSeed "$CI_MEASUREMENT_ORDER_SEED" \ --arg targetSystem "${DEVENV_SYSTEM:-${RUNNER_OS:-unknown}}" \ '{ schemaVersion: $schemaVersion, generatedAt: $generatedAt, - producer: { name: "effect-utils-ci-measurement", version: 1 }, + producer: { + name: "effect-utils-ci-measurement", + version: 2, + measurementProtocol: "devenv-perf-warm-median-v2" + }, subject: { repo: $repository, branchKind: (if $branchKind == "" then "unknown" else $branchKind end), @@ -2739,28 +3058,113 @@ jobs: target: { kind: "devenv", id: "dev-shell", name: "dev-shell", label: "Dev shell", group: "devenv", system: $targetSystem }, observations: ( $timings[0] + | map(select(.status == 0)) | map({ id: ("devenv." 
+ .id + ".duration"), label: .label, group: .group, + path: (.metadata.path // []), + description: .description, + measurementKind: (if (.gatePolicy.enabled == false) then "diagnostic" else "wall-clock" end), name: ("devenv." + .id + ".duration"), unit: "seconds", value: (.durationMs / 1000), + policy: .gatePolicy, + comparison: { + mode: (.gatePolicy.comparisonMode // "historical"), + pairedSampleCount: (.statistics.pairedSampleCount // 0), + baseline: ( + if (.statistics.pairedBaselineMedianDurationMs // null) == null + then null + else (.statistics.pairedBaselineMedianDurationMs / 1000) + end + ) + }, statistics: { sampleCount: (.statistics.sampleCount // 1), + warmupCount: (.statistics.warmupCount // 0), + measuredSampleCount: (.statistics.measuredSampleCount // (.statistics.sampleCount // 1)), successfulSampleCount: (.statistics.successfulSampleCount // (if .status == 0 then 1 else 0 end)), min: ((.statistics.minDurationMs // .durationMs) / 1000), max: ((.statistics.maxDurationMs // .durationMs) / 1000), - median: ((.statistics.medianDurationMs // .durationMs) / 1000) + median: ((.statistics.medianDurationMs // .durationMs) / 1000), + pairedSampleCount: (.statistics.pairedSampleCount // 0), + pairedCurrentMedian: ( + if (.statistics.pairedCurrentMedianDurationMs // null) == null + then null + else (.statistics.pairedCurrentMedianDurationMs / 1000) + end + ), + pairedBaselineMedian: ( + if (.statistics.pairedBaselineMedianDurationMs // null) == null + then null + else (.statistics.pairedBaselineMedianDurationMs / 1000) + end + ), + pairedDeltaMedian: ( + if (.statistics.pairedDeltaMedianDurationMs // null) == null + then null + else (.statistics.pairedDeltaMedianDurationMs / 1000) + end + ), + pairedDeltaMin: ( + if (.statistics.pairedDeltaMinDurationMs // null) == null + then null + else (.statistics.pairedDeltaMinDurationMs / 1000) + end + ), + pairedDeltaMax: ( + if (.statistics.pairedDeltaMaxDurationMs // null) == null + then null + else 
(.statistics.pairedDeltaMaxDurationMs / 1000) + end + ), + pairedDeltaP25: ( + if (.statistics.pairedDeltaP25DurationMs // null) == null + then null + else (.statistics.pairedDeltaP25DurationMs / 1000) + end + ), + pairedDeltaP75: ( + if (.statistics.pairedDeltaP75DurationMs // null) == null + then null + else (.statistics.pairedDeltaP75DurationMs / 1000) + end + ), + pairedDeltaMad: ( + if (.statistics.pairedDeltaMadDurationMs // null) == null + then null + else (.statistics.pairedDeltaMadDurationMs / 1000) + end + ), + pairedDeltaSamples: ((.statistics.pairedDeltaSampleDurationMs // []) | map(. / 1000)) }, - dimensions: { + dimensions: ((.metadata.dimensions // {}) + { probe: .id, probeLabel: .label, status: .status, sampleCount: (.statistics.sampleCount // 1), + warmupCount: (.statistics.warmupCount // 0), + measuredSampleCount: (.statistics.measuredSampleCount // (.statistics.sampleCount // 1)), + pairedSampleCount: (.statistics.pairedSampleCount // 0), + pairedOrderProtocol: ( + if (.statistics.pairedSampleCount // 0) > 0 + then "balanced-seeded-alternating-v1" + else null + end + ), + pairedOrderSeed: ( + if (.statistics.pairedSampleCount // 0) > 0 + then $orderSeed + else null + end + ), + measurementProtocol: "devenv-perf-warm-median-v2", + aggregation: "median", + phase: "warm", devenvRev: $devenvRev, otelServiceName: $otelServiceName - } + }) }) ), artifacts: [ @@ -2778,142 +3182,1696 @@ jobs: } }' >"$ARTIFACT_DIR/measurements.json" - compare_baseline() { - local baseline_path="${DEVENV_PERF_BASELINE_SUMMARY:-$ARTIFACT_DIR/baseline/summary.json}" - local mode="${DEVENV_PERF_REGRESSION_MODE:-warn}" + if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then + { + echo "### Devenv perf" + echo "" + echo "| Probe | Status | Duration |" + echo "| --- | ---: | ---: |" + jq -r '.[] | "| \(.label // .id) | \(.status) | \(.durationMs) ms |"' "$ARTIFACT_DIR/timings.json" + echo "" + echo "- Artifact directory: \`$ARTIFACT_DIR\`" + echo "- OTEL service: 
\`${OTEL_SERVICE_NAME:-unknown}\`" + } >>"$GITHUB_STEP_SUMMARY" + fi + + cat "$ARTIFACT_DIR/timings.pretty.json" - if [ "$mode" = "off" ]; then - jq -n --argjson schemaVersion 1 --arg status skipped --arg mode "$mode" '{schemaVersion:$schemaVersion, status:$status, mode:$mode, checks:{}}' >"$ARTIFACT_DIR/perf-comparison.json" - return 0 + - name: Upload devenv perf artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: devenv-perf + path: | + tmp/devenv-perf-ci + !tmp/devenv-perf-ci/baseline/** + if-no-files-found: error + retention-days: 30 + timeout-minutes: 30 + concurrency: + group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-devenv-perf" + cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }} + nix-closure-sizes: + if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }} + runs-on: + [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}'] + timeout-minutes: 30 + defaults: + run: + shell: bash + permissions: + actions: read + contents: write + issues: write + pull-requests: write + env: + CI_MEASUREMENT_SUBJECT_REF: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.ref || github.ref }} + CI_MEASUREMENT_SUBJECT_SHA: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.sha || github.sha }} + CI_MEASUREMENT_SUBJECT_LABEL: ${{ 
inputs.measurement_baseline_label }} + CI_MEASUREMENT_ALLOW_PROBE_FAILURES: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }} + steps: + - uses: actions/checkout@v6 + - name: Checkout CI measurement baseline ref + if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }} + uses: actions/checkout@v6 + with: + ref: ${{ inputs.measurement_baseline_ref }} + - name: Install Nix + uses: DeterminateSystems/determinate-nix-action@v3 + with: + extra-conf: | + experimental-features = nix-command flakes + accept-flake-config = true + extra-substituters = https://devenv.cachix.org + extra-trusted-public-keys = devenv.cachix.org-1:w1cLUi8dv3hnoSPGAuibQv+f9TZLr6cv/Hm9XgU50cw= + access-tokens = github.com=${{ github.token }} + summarize: true + - name: Provide cachix CLI from nixpkgs + shell: bash + run: | + set -euo pipefail + out=$(nix build --no-link --print-out-paths nixpkgs#cachix) + echo "$out/bin" >> "$GITHUB_PATH" + - name: Enable Cachix cache + uses: cachix/cachix-action@v17 + with: + name: overeng-effect-utils + authToken: ${{ secrets.CACHIX_AUTH_TOKEN }} + - name: Use pinned devenv from lock + run: | + DEVENV_REV=$(jq -r .nodes.devenv.locked.rev devenv.lock) + if [ -z "$DEVENV_REV" ] || [ "$DEVENV_REV" = "null" ]; then + echo '::error::devenv.lock missing .nodes.devenv.locked.rev' + exit 1 + fi + echo "DEVENV_REV=$DEVENV_REV" >> "$GITHUB_ENV" + echo "Pinned devenv rev: $DEVENV_REV" + shell: bash + - name: Isolate pnpm state + shell: bash + run: | + echo "PNPM_STORE_DIR=${{ runner.temp }}/pnpm-store/${{ github.job }}" >> "$GITHUB_ENV" + echo "PNPM_HOME=${{ github.workspace }}/.pnpm-home" >> "$GITHUB_ENV" + - id: restore-pnpm-state + name: Restore pnpm state + uses: actions/cache/restore@v4 + with: + path: | + ${{ github.workspace }}/.pnpm-home + ${{ runner.temp }}/pnpm-store/${{ github.job }} + key: "pnpm-state-v1-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/pnpm-lock.yaml') 
}}" + - name: Resolve devenv + run: | + DEVENV_REV=$(jq -r .nodes.devenv.locked.rev devenv.lock) + if [ -z "$DEVENV_REV" ] || [ "$DEVENV_REV" = "null" ]; then + echo '::error::devenv.lock missing .nodes.devenv.locked.rev' + exit 1 + fi + + resolve_devenv() { + nix build \ + --accept-flake-config \ + --option extra-substituters https://devenv.cachix.org \ + --option extra-trusted-public-keys devenv.cachix.org-1:w1cLUi8dv3hnoSPGAuibQv+f9TZLr6cv/Hm9XgU50cw= \ + --no-link \ + --print-out-paths \ + "github:cachix/devenv/$DEVENV_REV#devenv" + } + + # Temporary: capture diagnostics dir for #272 root-cause analysis. + DIAG_ROOT="${RUNNER_TEMP:-/tmp}/nix-store-diagnostics-${GITHUB_JOB:-job}-${RUNNER_OS:-unknown}-${GITHUB_RUN_ATTEMPT:-0}" + mkdir -p "$DIAG_ROOT" + echo "NIX_STORE_DIAGNOSTICS_DIR=$DIAG_ROOT" >> "$GITHUB_ENV" + + { + echo "timestamp_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo "runner_name=${RUNNER_NAME:-unknown}" + echo "runner_os=${RUNNER_OS:-unknown}" + echo "runner_arch=${RUNNER_ARCH:-unknown}" + echo "github_job=${GITHUB_JOB:-unknown}" + echo "github_run_id=${GITHUB_RUN_ID:-unknown}" + echo "nix_user_conf_files=${NIX_USER_CONF_FILES:-}" + nix --version || true + } > "$DIAG_ROOT/environment.txt" 2>&1 + + if ! DEVENV_OUT=$(resolve_devenv 2> >(tee "$DIAG_ROOT/resolve-devenv.log" >&2)); then + echo "::error::resolve_devenv failed. Last 30 lines of log:" + tail -30 "$DIAG_ROOT/resolve-devenv.log" || true + exit 1 + fi + DEVENV_BIN="$DEVENV_OUT/bin/devenv" + + # Fast validity check on the devenv store path (~1-2s vs ~25s for devenv info). + if ! nix-store --check-validity "$DEVENV_OUT" 2>/dev/null; then + echo "::warning::devenv store path invalid, repairing targeted path..." + nix-store --repair-path "$DEVENV_OUT" > "$DIAG_ROOT/nix-store-verify-repair.log" 2>&1 || true + rm -rf "${XDG_CACHE_HOME:-$HOME/.cache}"/nix/eval-cache-* ~/.cache/nix/eval-cache-* + if ! 
DEVENV_OUT=$(resolve_devenv 2> >(tee "$DIAG_ROOT/resolve-devenv-post-repair.log" >&2)); then + echo "::error::resolve_devenv failed after repair. Last 30 lines of log:" + tail -30 "$DIAG_ROOT/resolve-devenv-post-repair.log" || true + exit 1 fi + DEVENV_BIN="$DEVENV_OUT/bin/devenv" + fi - if [ ! -f "$baseline_path" ]; then - jq -n \ - --argjson schemaVersion 1 \ - --arg status baseline_missing \ - --arg mode "$mode" \ - --arg baseline "$baseline_path" \ - '{schemaVersion:$schemaVersion, status:$status, mode:$mode, baseline:$baseline, checks:{}}' \ - >"$ARTIFACT_DIR/perf-comparison.json" - echo "::notice::devenv perf baseline not found at $baseline_path; recorded current measurements only" - return 0 + echo "DEVENV_BIN=$DEVENV_BIN" >> "$GITHUB_ENV" + "$DEVENV_BIN" version | tee "$DIAG_ROOT/devenv-version.txt" + shell: bash + - name: Evict cached pnpm deps for oxlint-npm + shell: bash + run: | + targetRef='.#oxlint-npm' + entriesJson=$(mktemp) + if nix eval --json "$targetRef.passthru.depsBuildEntries" >"$entriesJson" 2>/dev/null; then + while IFS=$'\t' read -r attrName drv; do + [ -n "$drv" ] || continue + while IFS= read -r outPath; do + [ -n "$outPath" ] || continue + if nix path-info "$outPath" >/dev/null 2>&1; then + echo "evicting cached: $(basename "$outPath")" + if ! 
nix store delete --ignore-liveness "$outPath" >/dev/null 2>&1; then + echo "::error::failed to evict cached pnpm-deps output: $outPath" + exit 1 + fi + if nix path-info "$outPath" >/dev/null 2>&1; then + echo "::error::cached pnpm-deps output still present after eviction: $outPath" + exit 1 + fi + fi + done < <(nix-store -q --outputs "$drv" 2>/dev/null || true) + done < <(jq -r '.[] | [.attrName, (.drvPath // "")] | @tsv' "$entriesJson") + else + topDrv=$(nix path-info --derivation "$targetRef" 2>/dev/null || true) + if [ -n "$topDrv" ]; then + while IFS= read -r drv; do + [ -n "$drv" ] || continue + attrName="" + while IFS= read -r outPath; do + [ -n "$outPath" ] || continue + if nix path-info "$outPath" >/dev/null 2>&1; then + echo "evicting cached: $(basename "$outPath")" + if ! nix store delete --ignore-liveness "$outPath" >/dev/null 2>&1; then + echo "::error::failed to evict cached pnpm-deps output: $outPath" + exit 1 + fi + if nix path-info "$outPath" >/dev/null 2>&1; then + echo "::error::cached pnpm-deps output still present after eviction: $outPath" + exit 1 + fi + fi + done < <(nix-store -q --outputs "$drv" 2>/dev/null || true) + done < <(nix-store -qR "$topDrv" 2>/dev/null | grep "pnpm-deps-[a-z0-9-]*-v[0-9].*\.drv$" || true) fi + fi + rm -f "$entriesJson" + - name: Force diagnostics failure (debug) + if: ${{ github.event_name == 'workflow_dispatch' && (inputs.debug_force_nix_diagnostics_failure == true || inputs.debug_force_nix_diagnostics_failure == 'true') }} + shell: bash + run: | + diag_dir="${NIX_STORE_DIAGNOSTICS_DIR:-${RUNNER_TEMP:-/tmp}/nix-store-diagnostics-missing}" + mkdir -p "$diag_dir" + cat > "$diag_dir/synthetic-signature.log" <<'EOF' + Failed to convert config.cachix to JSON + ... 
while evaluating the option `cachix.package` + error: path '/nix/store/synthetic-invalid-path' is not valid + EOF + echo "::warning::Intentional failure for diagnostics validation (#272)" + exit 1 + - name: 'Measure Nix closure: genie' + shell: bash + env: + ARTIFACT_DIR: tmp/nix-closure-ci/current/genie_package + RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}' + run: | + set -euo pipefail - jq -n \ - --slurpfile current "$ARTIFACT_DIR/summary.json" \ - --slurpfile baseline "$baseline_path" \ - --argjson schemaVersion 1 \ - --arg mode "$mode" \ - --arg baselinePath "$baseline_path" \ - ' - def budget($name): - if $name == "shell_eval_traced" then - {warnRatio:1.25, failRatio:1.5, warnMs:1500, failMs:3000} - elif $name == "shell_eval_warm" then - {warnRatio:1.5, failRatio:2.0, warnMs:500, failMs:1000} - elif $name == "tasks_list" or $name == "processes_help" then - {warnRatio:2.0, failRatio:3.0, warnMs:250, failMs:1000} - else - {warnRatio:1.5, failRatio:2.0, warnMs:1000, failMs:3000} - end; - def classify($name; $current; $baseline): - budget($name) as $b - | ($current - $baseline) as $delta - | (if $baseline > 0 then ($current / $baseline) else null end) as $ratio - | ( - if $baseline <= 0 then "unknown" - elif ($delta > $b.failMs and $current > ($baseline * $b.failRatio)) then "fail" - elif ($delta > $b.warnMs and $current > ($baseline * $b.warnRatio)) then "warn" - else "pass" - end - ) as $status - | {status:$status, currentMs:$current, baselineMs:$baseline, deltaMs:$delta, ratio:$ratio, budget:$b}; - ($current[0].checks // {}) as $currentChecks - | ($baseline[0].checks // {}) as $baselineChecks - | ( - $currentChecks - | to_entries - | map( - .key as $name - | .value as $current - | ($baselineChecks[$name] // null) as $base - | { - key: $name, - value: ( - if $base == null then - {status:"missing_baseline", currentMs:$current.durationMs} - elif ($current.status != 0) then - {status:"current_failed", currentMs:$current.durationMs, 
baselineMs:$base.durationMs} - elif ($base.status != 0) then - {status:"baseline_failed", currentMs:$current.durationMs, baselineMs:$base.durationMs} - else - classify($name; $current.durationMs; $base.durationMs) - end - ) - } - ) - | from_entries - ) as $checks - | ( - if any($checks[]; .status == "fail") then "fail" - elif any($checks[]; .status == "warn") then "warn" - elif any($checks[]; .status == "missing_baseline") then "partial" - else "pass" - end - ) as $status - | {schemaVersion:$schemaVersion, status:$status, mode:$mode, baseline:$baselinePath, checks:$checks} - ' >"$ARTIFACT_DIR/perf-comparison.json" - - local status - status="$(jq -r '.status' "$ARTIFACT_DIR/perf-comparison.json")" - case "$status:$mode" in - fail:fail) - echo "::error::devenv perf regression detected" - jq . "$ARTIFACT_DIR/perf-comparison.json" - return 1 - ;; - fail:*|warn:*) - echo "::warning::devenv perf regression threshold exceeded" - jq . "$ARTIFACT_DIR/perf-comparison.json" - ;; - esac - } + mkdir -p "$ARTIFACT_DIR" + installable='.#genie' + target_id='genie_package' + target_name='genie' + target_label='Genie package' + target_group='packages' + target_description='the packaged Genie CLI closure' + artifact_file="$ARTIFACT_DIR/measurements.json" + target_system='x86_64-linux' + + out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")" + path_info="$ARTIFACT_DIR/nix-closure-path-info.json" + paths_file="$ARTIFACT_DIR/nix-closure-paths.json" + + nix path-info --recursive --json "$out_path" >"$path_info" + jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file" + + jq -n \ + --slurpfile paths "$paths_file" \ + --argjson schemaVersion 1 \ + --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + --arg repository "${GITHUB_REPOSITORY:-unknown}" \ + --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \ + --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \ + --arg headSha 
"${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \ + --arg baseSha "${GITHUB_BASE_SHA:-}" \ + --arg runnerName "${RUNNER_NAME:-unknown}" \ + --arg runnerOs "${RUNNER_OS:-unknown}" \ + --arg runnerArch "${RUNNER_ARCH:-unknown}" \ + --arg runnerClass "${RUNNER_CLASS:-unknown}" \ + --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \ + --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \ + --arg githubJob "${GITHUB_JOB:-unknown}" \ + --arg taskId "${CROSSTASK_TASK_ID:-}" \ + --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \ + --arg traceId "${TRACE_ID:-}" \ + --arg targetName "$target_name" \ + --arg targetId "$target_id" \ + --arg targetLabel "$target_label" \ + --arg targetGroup "$target_group" \ + --arg targetDescription "$target_description" \ + --arg targetSystem "$target_system" \ + --arg outPath "$out_path" \ + --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \ + --argjson targetPath '["nix","closures","packages","genie"]' \ + --argjson gatePolicy '{}' \ + ' + ($paths[0] // []) as $closurePaths + | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize + | ($closurePaths | length) as $pathCount + | ($buckets | map( + . 
as $bucket + | { + name: "nix.closure.bucket.nar_size", + id: "nix.closure.bucket.nar_size", + label: (($bucket.label // $bucket.name) + " closure size"), + group: "nix closure buckets", + path: ($targetPath + ["buckets", $bucket.name]), + description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex), + measurementKind: "deterministic", + unit: "bytes", + value: ( + $closurePaths + | map(select(.path | test($bucket.pathRegex)) | .narSize) + | add // 0 + ), + policy: $gatePolicy, + dimensions: { bucket: $bucket.name } + } + )) as $bucketObservations + | { + schemaVersion: $schemaVersion, + generatedAt: $generatedAt, + producer: { name: "effect-utils-ci-measurement", version: 1 }, + subject: { + repo: $repository, + branchKind: (if $branchKind == "" then "unknown" else $branchKind end), + ref: $ref, + headSha: $headSha, + baseSha: $baseSha + }, + execution: { + provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end), + workflow: "CI", + job: $githubJob, + runId: $githubRunId, + runAttempt: $githubRunAttempt, + taskId: $taskId, + attemptId: $taskAttemptId, + traceId: $traceId, + runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass } + }, + target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem }, + observations: ([ + { + id: "nix.closure.nar_size", + label: "Total closure size", + group: "nix closure", + path: ($targetPath + ["total", "nar-size"]), + description: ("Total NAR size for all paths in " + $targetDescription), + name: "nix.closure.nar_size", + measurementKind: "deterministic", + unit: "bytes", + value: $totalNarSize, + policy: $gatePolicy, + dimensions: { bucket: "total" } + }, + { + id: "nix.closure.path_count", + label: "Total closure path count", + group: "nix closure", + path: ($targetPath + ["total", "path-count"]), + description: ("Number of store paths in " + 
$targetDescription), + name: "nix.closure.path_count", + measurementKind: "deterministic", + unit: "count", + value: $pathCount, + policy: $gatePolicy, + dimensions: { bucket: "total" } + } + ] + $bucketObservations), + artifacts: [ + { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" }, + { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" } + ], + details: { + outPath: $outPath, + topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30]) + } + } + ' >"$artifact_file" + + cat "$artifact_file" + + - name: 'Measure Nix closure: megarepo' + shell: bash + env: + ARTIFACT_DIR: tmp/nix-closure-ci/current/megarepo_package + RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}' + run: | + set -euo pipefail + + mkdir -p "$ARTIFACT_DIR" + installable='.#megarepo' + target_id='megarepo_package' + target_name='megarepo' + target_label='Megarepo package' + target_group='packages' + target_description='the packaged megarepo CLI closure' + artifact_file="$ARTIFACT_DIR/measurements.json" + target_system='x86_64-linux' + + out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")" + path_info="$ARTIFACT_DIR/nix-closure-path-info.json" + paths_file="$ARTIFACT_DIR/nix-closure-paths.json" + + nix path-info --recursive --json "$out_path" >"$path_info" + jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file" + + jq -n \ + --slurpfile paths "$paths_file" \ + --argjson schemaVersion 1 \ + --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + --arg repository "${GITHUB_REPOSITORY:-unknown}" \ + --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \ + --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \ + --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \ + --arg baseSha "${GITHUB_BASE_SHA:-}" \ + --arg runnerName "${RUNNER_NAME:-unknown}" \ + --arg runnerOs "${RUNNER_OS:-unknown}" \ + --arg 
runnerArch "${RUNNER_ARCH:-unknown}" \ + --arg runnerClass "${RUNNER_CLASS:-unknown}" \ + --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \ + --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \ + --arg githubJob "${GITHUB_JOB:-unknown}" \ + --arg taskId "${CROSSTASK_TASK_ID:-}" \ + --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \ + --arg traceId "${TRACE_ID:-}" \ + --arg targetName "$target_name" \ + --arg targetId "$target_id" \ + --arg targetLabel "$target_label" \ + --arg targetGroup "$target_group" \ + --arg targetDescription "$target_description" \ + --arg targetSystem "$target_system" \ + --arg outPath "$out_path" \ + --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \ + --argjson targetPath '["nix","closures","packages","megarepo"]' \ + --argjson gatePolicy '{}' \ + ' + ($paths[0] // []) as $closurePaths + | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize + | ($closurePaths | length) as $pathCount + | ($buckets | map( + . 
as $bucket + | { + name: "nix.closure.bucket.nar_size", + id: "nix.closure.bucket.nar_size", + label: (($bucket.label // $bucket.name) + " closure size"), + group: "nix closure buckets", + path: ($targetPath + ["buckets", $bucket.name]), + description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex), + measurementKind: "deterministic", + unit: "bytes", + value: ( + $closurePaths + | map(select(.path | test($bucket.pathRegex)) | .narSize) + | add // 0 + ), + policy: $gatePolicy, + dimensions: { bucket: $bucket.name } + } + )) as $bucketObservations + | { + schemaVersion: $schemaVersion, + generatedAt: $generatedAt, + producer: { name: "effect-utils-ci-measurement", version: 1 }, + subject: { + repo: $repository, + branchKind: (if $branchKind == "" then "unknown" else $branchKind end), + ref: $ref, + headSha: $headSha, + baseSha: $baseSha + }, + execution: { + provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end), + workflow: "CI", + job: $githubJob, + runId: $githubRunId, + runAttempt: $githubRunAttempt, + taskId: $taskId, + attemptId: $taskAttemptId, + traceId: $traceId, + runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass } + }, + target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem }, + observations: ([ + { + id: "nix.closure.nar_size", + label: "Total closure size", + group: "nix closure", + path: ($targetPath + ["total", "nar-size"]), + description: ("Total NAR size for all paths in " + $targetDescription), + name: "nix.closure.nar_size", + measurementKind: "deterministic", + unit: "bytes", + value: $totalNarSize, + policy: $gatePolicy, + dimensions: { bucket: "total" } + }, + { + id: "nix.closure.path_count", + label: "Total closure path count", + group: "nix closure", + path: ($targetPath + ["total", "path-count"]), + description: ("Number of store paths in " + 
$targetDescription), + name: "nix.closure.path_count", + measurementKind: "deterministic", + unit: "count", + value: $pathCount, + policy: $gatePolicy, + dimensions: { bucket: "total" } + } + ] + $bucketObservations), + artifacts: [ + { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" }, + { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" } + ], + details: { + outPath: $outPath, + topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30]) + } + } + ' >"$artifact_file" + + cat "$artifact_file" + + - name: 'Measure Nix closure: oxlint-npm' + shell: bash + env: + ARTIFACT_DIR: tmp/nix-closure-ci/current/oxlint_npm_package + RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}' + run: | + set -euo pipefail + + mkdir -p "$ARTIFACT_DIR" + installable='.#oxlint-npm' + target_id='oxlint_npm_package' + target_name='oxlint-npm' + target_label='oxlint npm package' + target_group='packages' + target_description='the packaged oxlint npm compatibility wrapper closure' + artifact_file="$ARTIFACT_DIR/measurements.json" + target_system='x86_64-linux' + + out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")" + path_info="$ARTIFACT_DIR/nix-closure-path-info.json" + paths_file="$ARTIFACT_DIR/nix-closure-paths.json" + + nix path-info --recursive --json "$out_path" >"$path_info" + jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file" + + jq -n \ + --slurpfile paths "$paths_file" \ + --argjson schemaVersion 1 \ + --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + --arg repository "${GITHUB_REPOSITORY:-unknown}" \ + --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \ + --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \ + --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \ + --arg baseSha "${GITHUB_BASE_SHA:-}" \ + --arg runnerName "${RUNNER_NAME:-unknown}" \ + --arg runnerOs 
"${RUNNER_OS:-unknown}" \ + --arg runnerArch "${RUNNER_ARCH:-unknown}" \ + --arg runnerClass "${RUNNER_CLASS:-unknown}" \ + --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \ + --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \ + --arg githubJob "${GITHUB_JOB:-unknown}" \ + --arg taskId "${CROSSTASK_TASK_ID:-}" \ + --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \ + --arg traceId "${TRACE_ID:-}" \ + --arg targetName "$target_name" \ + --arg targetId "$target_id" \ + --arg targetLabel "$target_label" \ + --arg targetGroup "$target_group" \ + --arg targetDescription "$target_description" \ + --arg targetSystem "$target_system" \ + --arg outPath "$out_path" \ + --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \ + --argjson targetPath '["nix","closures","packages","oxlint-npm"]' \ + --argjson gatePolicy '{}' \ + ' + ($paths[0] // []) as $closurePaths + | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize + | ($closurePaths | length) as $pathCount + | ($buckets | map( + . 
as $bucket + | { + name: "nix.closure.bucket.nar_size", + id: "nix.closure.bucket.nar_size", + label: (($bucket.label // $bucket.name) + " closure size"), + group: "nix closure buckets", + path: ($targetPath + ["buckets", $bucket.name]), + description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex), + measurementKind: "deterministic", + unit: "bytes", + value: ( + $closurePaths + | map(select(.path | test($bucket.pathRegex)) | .narSize) + | add // 0 + ), + policy: $gatePolicy, + dimensions: { bucket: $bucket.name } + } + )) as $bucketObservations + | { + schemaVersion: $schemaVersion, + generatedAt: $generatedAt, + producer: { name: "effect-utils-ci-measurement", version: 1 }, + subject: { + repo: $repository, + branchKind: (if $branchKind == "" then "unknown" else $branchKind end), + ref: $ref, + headSha: $headSha, + baseSha: $baseSha + }, + execution: { + provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end), + workflow: "CI", + job: $githubJob, + runId: $githubRunId, + runAttempt: $githubRunAttempt, + taskId: $taskId, + attemptId: $taskAttemptId, + traceId: $traceId, + runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass } + }, + target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem }, + observations: ([ + { + id: "nix.closure.nar_size", + label: "Total closure size", + group: "nix closure", + path: ($targetPath + ["total", "nar-size"]), + description: ("Total NAR size for all paths in " + $targetDescription), + name: "nix.closure.nar_size", + measurementKind: "deterministic", + unit: "bytes", + value: $totalNarSize, + policy: $gatePolicy, + dimensions: { bucket: "total" } + }, + { + id: "nix.closure.path_count", + label: "Total closure path count", + group: "nix closure", + path: ($targetPath + ["total", "path-count"]), + description: ("Number of store paths in " + 
$targetDescription), + name: "nix.closure.path_count", + measurementKind: "deterministic", + unit: "count", + value: $pathCount, + policy: $gatePolicy, + dimensions: { bucket: "total" } + } + ] + $bucketObservations), + artifacts: [ + { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" }, + { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" } + ], + details: { + outPath: $outPath, + topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30]) + } + } + ' >"$artifact_file" + + cat "$artifact_file" + + - name: 'Upload CI measurements: nix-closure-measurements' + if: always() + uses: actions/upload-artifact@v4 + with: + name: nix-closure-measurements + path: | + tmp/nix-closure-ci + !tmp/nix-closure-ci/baseline/** + if-no-files-found: error + retention-days: 30 + - name: Save pnpm state + if: ${{ success() && steps.restore-pnpm-state.outputs.cache-hit != 'true' }} + uses: actions/cache/save@v4 + with: + path: | + ${{ github.workspace }}/.pnpm-home + ${{ runner.temp }}/pnpm-store/${{ github.job }} + key: "pnpm-state-v1-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/pnpm-lock.yaml') }}" + - name: Nix diagnostics summary + if: failure() + shell: bash + run: | + diag_dir="${NIX_STORE_DIAGNOSTICS_DIR:-}" + if [ -z "$diag_dir" ] || [ ! -d "$diag_dir" ]; then + echo "## Nix Store Diagnostics" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + echo "No diagnostics directory found (validation may have failed before capture)." >> "$GITHUB_STEP_SUMMARY" + exit 0 + fi + + { + echo "## Nix Store Diagnostics" + echo "" + echo "Temporary instrumentation for #272; remove after root cause is confirmed and CI is stable." 
+ echo "" + echo "- Diagnostics directory: \`$diag_dir\`" + echo "- Tracking issue: https://github.com/overengineeringstudio/effect-utils/issues/272" + } >> "$GITHUB_STEP_SUMMARY" + + markers_file="${RUNNER_TEMP:-/tmp}/nix-store-signature-markers.txt" + grep -R -n -E "config\\.cachix|cachix\\.package|error: path '/nix/store/.+ is not valid" --exclude="$(basename "$markers_file")" "$diag_dir" > "$markers_file" || true + + if [ -s "$markers_file" ]; then + { + echo "" + echo "### Signature markers" + echo '```text' + head -n 120 "$markers_file" + echo '```' + } >> "$GITHUB_STEP_SUMMARY" + else + echo "" >> "$GITHUB_STEP_SUMMARY" + echo "- No signature markers found in captured diagnostics." >> "$GITHUB_STEP_SUMMARY" + fi + - name: Upload Nix diagnostics artifact + if: failure() && env.NIX_STORE_DIAGNOSTICS_DIR != '' + uses: actions/upload-artifact@v4 + with: + name: 'nix-store-diagnostics-${{ github.job }}-${{ runner.os }}-run-${{ github.run_id }}-attempt-${{ github.run_attempt }}' + path: ${{ env.NIX_STORE_DIAGNOSTICS_DIR }} + if-no-files-found: ignore + retention-days: 14 + - name: Failure note + if: failure() + shell: bash + run: | + echo "If this looks like Namespace runner Nix store corruption (e.g. \"... 
is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:" + echo " https://github.com/overengineeringstudio/effect-utils/issues/201" + concurrency: + group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-nix-closure-sizes" + cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }} + source-shape: + runs-on: + [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}'] + timeout-minutes: 30 + defaults: + run: + shell: bash + permissions: + actions: read + contents: write + issues: write + pull-requests: write + env: + CI_MEASUREMENT_SUBJECT_REF: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.ref || github.ref }} + CI_MEASUREMENT_SUBJECT_SHA: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.sha || github.sha }} + CI_MEASUREMENT_SUBJECT_LABEL: ${{ inputs.measurement_baseline_label }} + CI_MEASUREMENT_ALLOW_PROBE_FAILURES: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }} + steps: + - uses: actions/checkout@v6 + - name: Checkout CI measurement baseline ref + if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }} + uses: actions/checkout@v6 + with: + ref: ${{ inputs.measurement_baseline_ref }} + - name: 'Measure source shape: effect-utils' + shell: bash + env: + 
ARTIFACT_DIR: tmp/source-shape-ci/current/effect-utils + RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}' + run: | + set -euo pipefail + + # ensure_ci_measurement_tool TOOL NIX_ATTR + # Succeeds immediately when TOOL is already on PATH. Otherwise, if nix is available, + # builds nixpkgs#NIX_ATTR and prepends each output's bin/ directory to PATH until TOOL resolves. + # Returns non-zero when TOOL still cannot be found. + ensure_ci_measurement_tool() { + tool_name="$1" + nix_attr="$2" + if command -v "$tool_name" >/dev/null 2>&1; then + return 0 + fi + if ! command -v nix >/dev/null 2>&1; then + return 1 + fi + if tool_out="$(nix build --no-link --print-out-paths "nixpkgs#$nix_attr" 2>/dev/null)"; then + # Walk the printed output paths one per line; the here-string keeps the loop in the + # current shell so the PATH export survives the function call. + # NOTE(review): the loop input, the closing fi and the final lookup below were reconstructed + # from a garbled "done </dev/null 2>&1" with no closing fi; confirm against ci.yml.genie.ts. + while IFS= read -r tool_path; do + [ -n "$tool_path" ] || continue + [ -d "$tool_path/bin" ] || continue + export PATH="$tool_path/bin:$PATH" + if command -v "$tool_name" >/dev/null 2>&1; then + return 0 + fi + done <<<"$tool_out" + fi + # Final verdict: non-zero when the build failed or no output provided the tool. + command -v "$tool_name" >/dev/null 2>&1 + } + + # require_ci_measurement_tool TOOL NIX_ATTR + # Same lookup as ensure_ci_measurement_tool, but a missing tool is fatal: + # emits a ::error:: annotation and exits 1. + require_ci_measurement_tool() { + tool_name="$1" + nix_attr="$2" + if ensure_ci_measurement_tool "$tool_name" "$nix_attr"; then + return 0 + fi + echo "::error::$tool_name is not available; unable to produce CI measurement artifact" + exit 1 + } + + require_ci_measurement_tool node nodejs + + mkdir -p "$ARTIFACT_DIR" + target_id='effect_utils' + target_name='effect-utils' + target_label='effect-utils repository' + target_group='source' + artifact_file="$ARTIFACT_DIR/measurements.json" + target_system="${DEVENV_SYSTEM:-${RUNNER_OS:-unknown}}" + + SCOPES_JSON='[{"id":"genie_ci_workflow","label":"Genie CI workflow helpers","group":"source / ci","path":["source","effect-utils","genie","ci-workflow"],"includePaths":["genie/ci-workflow",".github/workflows/ci.yml.genie.ts"],"includeExtensions":[".ts"]},{"id":"genie_runtime","label":"Genie runtime","group":"source / genie","path":["source","effect-utils","packages","genie"],"includePaths":["packages/@overeng/genie/src"],"includeExtensions":[".ts",".tsx"]},{"id":"nix_workspace_tools","label":"Nix workspace tools","group":"source / nix","path":["source","effect-utils","nix","workspace-tools"],"includePaths":["nix/workspace-tools"],"includeExtensions":[".nix"]}]' \ + TARGET_PATH_JSON='["source","effect-utils"]' \ + TARGET_ID="$target_id" \ + TARGET_NAME="$target_name" 
\ + TARGET_LABEL="$target_label" \ + TARGET_GROUP="$target_group" \ + TARGET_SYSTEM="$target_system" \ + node <<'NODE' >"$artifact_file" + const cp = require('node:child_process') + const fs = require('node:fs') + const path = require('node:path') + + // Canonicalize a repo-relative path: backslashes become '/', a leading './' and any + // trailing '/' are dropped, and a bare '.' becomes the empty string. + const normalize = (value) => { + const normalized = value.replace(/\\/g, '/').replace(/^\.\//, '').replace(/\/+$/, '') + return normalized === '.' ? '' : normalized + } + // Scope definitions and the target path arrive as JSON in environment variables. + const scopes = JSON.parse(process.env.SCOPES_JSON || '[]') + const targetPath = JSON.parse(process.env.TARGET_PATH_JSON || '["source"]') + // Every path printed by git ls-files -z (NUL-delimited), normalized. + const gitFiles = cp + .execFileSync('git', ['ls-files', '-z'], { encoding: 'buffer' }) + .toString('utf8') + .split('\0') + .filter(Boolean) + .map(normalize) + + // True when no candidates are configured, or when file equals or lives under + // any candidate; an empty candidate matches every file. + const includesPath = (file, candidates) => { + if (!Array.isArray(candidates) || candidates.length === 0) return true + return candidates.map(normalize).some((candidate) => candidate === '' || file === candidate || file.startsWith(candidate + '/')) + } + + // True when file equals or lives under any non-empty candidate; a missing + // candidate list excludes nothing. + const excludesPath = (file, candidates) => + Array.isArray(candidates) && + candidates.map(normalize).some((candidate) => candidate !== '' && (file === candidate || file.startsWith(candidate + '/'))) + + // Case-insensitive extension filter; an empty or missing list accepts every file. + const matchesExtension = (file, extensions) => { + if (!Array.isArray(extensions) || extensions.length === 0) return true + const ext = path.extname(file).toLowerCase() + return extensions.map((extension) => extension.toLowerCase()).some((extension) => ext === extension) + } + + // Line count of a file, or undefined when it contains a NUL byte (treated as binary). + // Counts newline bytes and adds one when the last line has no trailing newline. + const countLines = (file) => { + const buffer = fs.readFileSync(file) + if (buffer.includes(0)) return undefined + if (buffer.length === 0) return 0 + let lines = 0 + for (const byte of buffer) { + if (byte === 10) lines += 1 + } + return buffer[buffer.length - 1] === 10 ? lines : lines + 1 + } + + const observations = [] + const scopeSummaries = [] + + for (const scope of scopes) { + const root = normalize(scope.root || '.') + const includePaths = Array.isArray(scope.includePaths) && scope.includePaths.length > 0 ? 
scope.includePaths : [root] + const files = gitFiles + .filter((file) => includesPath(file, includePaths)) + .filter((file) => !excludesPath(file, scope.excludePaths)) + .filter((file) => matchesExtension(file, scope.includeExtensions)) + + let lineCount = 0 + let measuredFileCount = 0 + for (const file of files) { + const lines = countLines(file) + if (lines === undefined) continue + lineCount += lines + measuredFileCount += 1 + } + + const group = scope.group || 'source shape' + const scopePath = Array.isArray(scope.path) ? scope.path : ['source', scope.id] + const policy = scope.gate || { enabled: false, minBaselineSources: 3, minCurrentSamples: 1 } + observations.push( + { + id: 'source.lines', + label: scope.label + ' lines', + group, + path: scopePath, + description: 'Tracked non-binary source lines in the configured scope.', + measurementKind: 'deterministic', + name: 'source.lines', + unit: 'lines', + value: lineCount, + dimensions: { scope: scope.id }, + policy, + statistics: { sampleCount: 1, measuredSampleCount: measuredFileCount }, + }, + { + id: 'source.files', + label: scope.label + ' files', + group, + path: scopePath, + description: 'Tracked non-binary source files in the configured scope.', + measurementKind: 'deterministic', + name: 'source.files', + unit: 'count', + value: measuredFileCount, + dimensions: { scope: scope.id }, + policy, + statistics: { sampleCount: 1, measuredSampleCount: measuredFileCount }, + }, + ) + scopeSummaries.push({ + id: scope.id, + label: scope.label, + root, + includePaths, + excludePaths: scope.excludePaths || [], + includeExtensions: scope.includeExtensions || [], + fileCount: measuredFileCount, + lineCount, + }) + } + + const artifact = { + schemaVersion: 1, + generatedAt: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z'), + producer: { + name: 'effect-utils-ci-measurement', + version: 1, + measurementProtocol: 'source-shape-v1', + }, + subject: { + repo: process.env.GITHUB_REPOSITORY || 'unknown', + branchKind: 
process.env.GITHUB_EVENT_NAME || 'unknown', + ref: process.env.CI_MEASUREMENT_SUBJECT_REF || process.env.GITHUB_REF || 'unknown', + headSha: process.env.CI_MEASUREMENT_SUBJECT_SHA || process.env.GITHUB_SHA || 'unknown', + baseSha: process.env.GITHUB_BASE_SHA || '', + }, + execution: { + provider: process.env.GITHUB_RUN_ID && process.env.GITHUB_RUN_ID !== 'unknown' ? 'github-actions' : 'local', + workflow: 'CI', + job: process.env.GITHUB_JOB || 'unknown', + runId: process.env.GITHUB_RUN_ID || 'unknown', + runAttempt: process.env.GITHUB_RUN_ATTEMPT || 'unknown', + taskId: process.env.CROSSTASK_TASK_ID || '', + attemptId: process.env.CROSSTASK_ATTEMPT_ID || '', + traceId: process.env.TRACE_ID || '', + runner: { + name: process.env.RUNNER_NAME || 'unknown', + os: process.env.RUNNER_OS || 'unknown', + arch: process.env.RUNNER_ARCH || 'unknown', + class: process.env.RUNNER_CLASS || 'unknown', + }, + }, + target: { + kind: 'source-shape', + id: process.env.TARGET_ID, + name: process.env.TARGET_NAME, + label: process.env.TARGET_LABEL, + group: process.env.TARGET_GROUP, + path: targetPath, + system: process.env.TARGET_SYSTEM, + }, + observations, + details: { scopes: scopeSummaries }, + } + + process.stdout.write(JSON.stringify(artifact, null, 2) + '\n') + NODE + + cat "$artifact_file" + + - name: 'Upload CI measurements: source-shape' + if: always() + uses: actions/upload-artifact@v4 + with: + name: source-shape + path: | + tmp/source-shape-ci + !tmp/source-shape-ci/baseline/** + if-no-files-found: error + retention-days: 30 + concurrency: + group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && 
format('label-{0}', github.event.label.name) || 'code')) }}-source-shape" + cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }} + ci-measurements-report: + name: ci/measurements-report + if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }} + needs: [devenv-perf, nix-closure-sizes, source-shape] + runs-on: + [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}'] + timeout-minutes: 30 + defaults: + run: + shell: bash + permissions: + actions: read + contents: write + issues: write + pull-requests: write + env: + CI_MEASUREMENT_SUBJECT_REF: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.ref || github.ref }} + CI_MEASUREMENT_SUBJECT_SHA: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.sha || github.sha }} + CI_MEASUREMENT_SUBJECT_LABEL: ${{ inputs.measurement_baseline_label }} + CI_MEASUREMENT_ALLOW_PROBE_FAILURES: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }} + steps: + - uses: actions/checkout@v6 + - name: Install Nix + uses: DeterminateSystems/determinate-nix-action@v3 + with: + extra-conf: | + experimental-features = nix-command flakes + accept-flake-config = true + extra-substituters = https://devenv.cachix.org + extra-trusted-public-keys = devenv.cachix.org-1:w1cLUi8dv3hnoSPGAuibQv+f9TZLr6cv/Hm9XgU50cw= + access-tokens = github.com=${{ github.token }} + summarize: true + - name: Provide CI measurement report tools + shell: bash + run: | + set -euo pipefail + for out in $(nix build --no-link --print-out-paths nixpkgs#jq nixpkgs#nodejs nixpkgs#gh nixpkgs#resvg); do + echo "$out/bin" >> "$GITHUB_PATH" + done + - name: 'Download current measurement artifact: devenv-perf' + uses: actions/download-artifact@v4 + with: + 
name: devenv-perf + path: tmp/ci-measurement-report/current/devenv-perf + - name: 'Download current measurement artifact: nix-closure-measurements' + uses: actions/download-artifact@v4 + with: + name: nix-closure-measurements + path: tmp/ci-measurement-report/current/nix-closure-measurements + - name: 'Download current measurement artifact: source-shape' + uses: actions/download-artifact@v4 + with: + name: source-shape + path: tmp/ci-measurement-report/current/source-shape + - name: 'Download previous artifact: devenv-perf' + shell: bash + env: + GH_TOKEN: ${{ github.token }} + BASELINE_ARTIFACT_NAME: devenv-perf + BASELINE_OUTPUT_DIR: tmp/ci-measurement-report/baseline/devenv-perf + BASELINE_WORKFLOW_NAME: ${{ github.workflow }} + BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }} + BASELINE_SEED_RUNS_JSON: '[]' + BASELINE_MAX_RUNS: '20' + BASELINE_MAX_CANDIDATE_RUNS: '60' + BASELINE_REQUIRED_OBSERVATIONS_JSON: '[]' + BASELINE_DOWNLOAD_TIMEOUT_SECONDS: '120' + run: | + set -euo pipefail + + mkdir -p "$BASELINE_OUTPUT_DIR" + + if command -v gh >/dev/null 2>&1; then + GH_BIN="$(command -v gh)" + else + echo "::notice::gh is not on PATH; resolving GitHub CLI through Nix" + if ! GH_BIN="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)/bin/gh"; then + echo "::notice::unable to resolve GitHub CLI through Nix; skipping previous artifact download" + exit 0 + fi + fi + echo "Using GitHub CLI: $GH_BIN" + + repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}" + workflow="${BASELINE_WORKFLOW_NAME:-CI}" + branch="${BASELINE_BRANCH:-${GITHUB_BASE_REF:-${GITHUB_REF_NAME:-main}}}" + seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json" + required_observations_file="$BASELINE_OUTPUT_DIR/baseline-required-observations.json" + printf '%s' "${BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file" + printf '%s' "${BASELINE_REQUIRED_OBSERVATIONS_JSON:-[]}" >"$required_observations_file" + if ! 
jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \ + "$seed_runs_file" >/dev/null; then + echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields" + exit 1 + fi + if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.id | type == "string") and (.minSources | type == "number")) else false end' \ + "$required_observations_file" >/dev/null; then + echo "::error::BASELINE_REQUIRED_OBSERVATIONS_JSON must be an array of objects with string id and numeric minSources fields" + exit 1 + fi + seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")" + required_observation_count="$(jq 'length' "$required_observations_file")" + max_candidate_runs="${BASELINE_MAX_CANDIDATE_RUNS:-${BASELINE_MAX_RUNS:-5}}" + if ! [[ "$max_candidate_runs" =~ ^[0-9]+$ ]] || [ "$max_candidate_runs" -lt 1 ]; then + max_candidate_runs=1 + fi + + candidate_runs="$( + "$GH_BIN" run list \ + --repo "$repo" \ + --workflow "$workflow" \ + --branch "$branch" \ + --event push \ + --status success \ + --json databaseId,headSha \ + --limit "$max_candidate_runs" \ + --jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]' + )" + + candidate_runs="$seed_run_ids + $candidate_runs" + + max_runs="${BASELINE_MAX_RUNS:-5}" + if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then + max_runs=1 + fi + download_timeout_seconds="${BASELINE_DOWNLOAD_TIMEOUT_SECONDS:-120}" + if ! 
[[ "$download_timeout_seconds" =~ ^[0-9]+$ ]] || [ "$download_timeout_seconds" -lt 1 ]; then + download_timeout_seconds=120 + fi + + write_baseline_observation_counts() { + local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt" + local counts_file="$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" + find "$BASELINE_OUTPUT_DIR" \ + -mindepth 2 \ + -maxdepth 2 \ + -name measurements.json \ + -type f \ + -print \ + | sort >"$measurement_index" || true + + if [ -s "$measurement_index" ]; then + xargs -r jq -s \ + --slurpfile required "$required_observations_file" \ + ' + ([.[] | (.observations // [])[]? | select(.value | type == "number") | .id] | sort | group_by(.) | map({id: .[0], sources: length})) as $counts + | ($required[0] // []) as $requiredRows + | { + counts: $counts, + required: ( + $requiredRows + | map(. as $requiredRow | ($counts | map(select(.id == $requiredRow.id)) | .[0].sources // 0) as $actual | $requiredRow + {sources:$actual, satisfied:($actual >= $requiredRow.minSources)}) + ) + } + ' <"$measurement_index" >"$counts_file" + else + jq -n --slurpfile required "$required_observations_file" \ + '{counts: [], required: (($required[0] // []) | map(. 
+ {sources:0, satisfied:false}))}' >"$counts_file" + fi + } + + baseline_requirements_satisfied() { + if [ "$required_observation_count" -eq 0 ]; then + return 1 + fi + write_baseline_observation_counts + jq -e '.required | all(.satisfied == true)' "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" >/dev/null + } + + run_id="" + artifact_name="" + artifact_id="" + downloaded_runs_file="$BASELINE_OUTPUT_DIR/baseline-runs.jsonl" + seen_runs_file="$BASELINE_OUTPUT_DIR/baseline-seen-runs.txt" + : >"$downloaded_runs_file" + : >"$seen_runs_file" + for candidate_run in $candidate_runs; do + if [ -z "$candidate_run" ]; then + continue + fi + if grep -qxF "$candidate_run" "$seen_runs_file"; then + continue + fi + downloaded_count="$(wc -l <"$downloaded_runs_file" | tr -d ' ')" + if [ "$downloaded_count" -ge "$max_runs" ]; then + if baseline_requirements_satisfied; then + break + fi + echo "::notice::downloaded $downloaded_count baseline artifact(s), but required observation counts are not satisfied yet; continuing through bounded candidate history" + fi + if [ "$(wc -l <"$seen_runs_file" | tr -d ' ')" -ge "$max_candidate_runs" ]; then + break + fi + printf '%s\n' "$candidate_run" >>"$seen_runs_file" + + artifact_json="$( + "$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \ + | jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts + | map(select(.expired == false)) + | map(select(.name == $artifactName or (.name | startswith($artifactName + "-")))) + | sort_by(.created_at // "") + | reverse + | .[0] // empty' + )" + + if [ -n "$artifact_json" ]; then + current_artifact_name="$(printf '%s' "$artifact_json" | jq -r '.name')" + current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')" + current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run" + mkdir -p "$current_output_dir" + if timeout "$download_timeout_seconds" "$GH_BIN" run download "$candidate_run" \ + --repo "$repo" \ + --name "$current_artifact_name" \ + --dir 
"$current_output_dir"; then + if [ -z "$run_id" ]; then + run_id="$candidate_run" + artifact_name="$current_artifact_name" + artifact_id="$current_artifact_id" + fi + jq -cn \ + --arg runId "$candidate_run" \ + --arg artifactName "$current_artifact_name" \ + --arg artifactId "$current_artifact_id" \ + --arg path "run-$candidate_run" \ + '{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \ + >>"$downloaded_runs_file" + else + status="$?" + rm -rf "$current_output_dir" + if [ "$status" -eq 124 ]; then + echo "::notice::timed out after ${download_timeout_seconds}s downloading baseline artifact $current_artifact_name from run $candidate_run; skipping candidate" + else + echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run (exit $status)" + fi + fi + fi + done + + write_baseline_observation_counts + + if [ -z "$run_id" ] || [ -z "$artifact_name" ]; then + echo "::notice::no successful baseline run found for $repo workflow=$workflow branch=$branch" + exit 0 + fi + + jq -n \ + --slurpfile runs "$downloaded_runs_file" \ + --slurpfile seedRuns "$seed_runs_file" \ + --slurpfile observationCounts "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" \ + --argjson schemaVersion 1 \ + --arg repository "$repo" \ + --arg workflow "$workflow" \ + --arg branch "$branch" \ + --arg runId "$run_id" \ + --arg artifactName "$artifact_name" \ + --arg artifactId "$artifact_id" \ + '{ + schemaVersion: $schemaVersion, + source: "github-actions-artifact", + repository: $repository, + workflow: $workflow, + branch: $branch, + runId: $runId, + artifactName: $artifactName, + artifactId: $artifactId, + seedRuns: ($seedRuns[0] // []), + runs: $runs, + observationCounts: ($observationCounts[0] // null) + }' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json" + + echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR" + + - name: 
'Download previous artifact: nix-closure-measurements' + shell: bash + env: + GH_TOKEN: ${{ github.token }} + BASELINE_ARTIFACT_NAME: nix-closure-measurements + BASELINE_OUTPUT_DIR: tmp/ci-measurement-report/baseline/nix-closure-measurements + BASELINE_WORKFLOW_NAME: ${{ github.workflow }} + BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }} + BASELINE_SEED_RUNS_JSON: '[]' + BASELINE_MAX_RUNS: '20' + BASELINE_MAX_CANDIDATE_RUNS: '60' + BASELINE_REQUIRED_OBSERVATIONS_JSON: '[]' + BASELINE_DOWNLOAD_TIMEOUT_SECONDS: '120' + run: | + set -euo pipefail + + mkdir -p "$BASELINE_OUTPUT_DIR" + + if command -v gh >/dev/null 2>&1; then + GH_BIN="$(command -v gh)" + else + echo "::notice::gh is not on PATH; resolving GitHub CLI through Nix" + if ! GH_BIN="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)/bin/gh"; then + echo "::notice::unable to resolve GitHub CLI through Nix; skipping previous artifact download" + exit 0 + fi + fi + echo "Using GitHub CLI: $GH_BIN" + + repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}" + workflow="${BASELINE_WORKFLOW_NAME:-CI}" + branch="${BASELINE_BRANCH:-${GITHUB_BASE_REF:-${GITHUB_REF_NAME:-main}}}" + seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json" + required_observations_file="$BASELINE_OUTPUT_DIR/baseline-required-observations.json" + printf '%s' "${BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file" + printf '%s' "${BASELINE_REQUIRED_OBSERVATIONS_JSON:-[]}" >"$required_observations_file" + if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \ + "$seed_runs_file" >/dev/null; then + echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields" + exit 1 + fi + if ! 
jq -e 'if type == "array" then all(.[]; type == "object" and (.id | type == "string") and (.minSources | type == "number")) else false end' \ + "$required_observations_file" >/dev/null; then + echo "::error::BASELINE_REQUIRED_OBSERVATIONS_JSON must be an array of objects with string id and numeric minSources fields" + exit 1 + fi + seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")" + required_observation_count="$(jq 'length' "$required_observations_file")" + max_candidate_runs="${BASELINE_MAX_CANDIDATE_RUNS:-${BASELINE_MAX_RUNS:-5}}" + if ! [[ "$max_candidate_runs" =~ ^[0-9]+$ ]] || [ "$max_candidate_runs" -lt 1 ]; then + max_candidate_runs=1 + fi + + candidate_runs="$( + "$GH_BIN" run list \ + --repo "$repo" \ + --workflow "$workflow" \ + --branch "$branch" \ + --event push \ + --status success \ + --json databaseId,headSha \ + --limit "$max_candidate_runs" \ + --jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]' + )" + + candidate_runs="$seed_run_ids + $candidate_runs" + + max_runs="${BASELINE_MAX_RUNS:-5}" + if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then + max_runs=1 + fi + download_timeout_seconds="${BASELINE_DOWNLOAD_TIMEOUT_SECONDS:-120}" + if ! [[ "$download_timeout_seconds" =~ ^[0-9]+$ ]] || [ "$download_timeout_seconds" -lt 1 ]; then + download_timeout_seconds=120 + fi + + write_baseline_observation_counts() { + local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt" + local counts_file="$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" + find "$BASELINE_OUTPUT_DIR" \ + -mindepth 2 \ + -maxdepth 2 \ + -name measurements.json \ + -type f \ + -print \ + | sort >"$measurement_index" || true + + if [ -s "$measurement_index" ]; then + xargs -r jq -s \ + --slurpfile required "$required_observations_file" \ + ' + ([.[] | (.observations // [])[]? | select(.value | type == "number") | .id] | sort | group_by(.) 
| map({id: .[0], sources: length})) as $counts + | ($required[0] // []) as $requiredRows + | { + counts: $counts, + required: ( + $requiredRows + | map(. as $requiredRow | ($counts | map(select(.id == $requiredRow.id)) | .[0].sources // 0) as $actual | $requiredRow + {sources:$actual, satisfied:($actual >= $requiredRow.minSources)}) + ) + } + ' <"$measurement_index" >"$counts_file" + else + jq -n --slurpfile required "$required_observations_file" \ + '{counts: [], required: (($required[0] // []) | map(. + {sources:0, satisfied:false}))}' >"$counts_file" + fi + } + + baseline_requirements_satisfied() { + if [ "$required_observation_count" -eq 0 ]; then + return 1 + fi + write_baseline_observation_counts + jq -e '.required | all(.satisfied == true)' "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" >/dev/null + } + + run_id="" + artifact_name="" + artifact_id="" + downloaded_runs_file="$BASELINE_OUTPUT_DIR/baseline-runs.jsonl" + seen_runs_file="$BASELINE_OUTPUT_DIR/baseline-seen-runs.txt" + : >"$downloaded_runs_file" + : >"$seen_runs_file" + for candidate_run in $candidate_runs; do + if [ -z "$candidate_run" ]; then + continue + fi + if grep -qxF "$candidate_run" "$seen_runs_file"; then + continue + fi + downloaded_count="$(wc -l <"$downloaded_runs_file" | tr -d ' ')" + if [ "$downloaded_count" -ge "$max_runs" ]; then + if baseline_requirements_satisfied; then + break + fi + echo "::notice::downloaded $downloaded_count baseline artifact(s), but required observation counts are not satisfied yet; continuing through bounded candidate history" + fi + if [ "$(wc -l <"$seen_runs_file" | tr -d ' ')" -ge "$max_candidate_runs" ]; then + break + fi + printf '%s\n' "$candidate_run" >>"$seen_runs_file" + + artifact_json="$( + "$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \ + | jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts + | map(select(.expired == false)) + | map(select(.name == $artifactName or (.name | startswith($artifactName + "-")))) 
+ | sort_by(.created_at // "") + | reverse + | .[0] // empty' + )" + + if [ -n "$artifact_json" ]; then + current_artifact_name="$(printf '%s' "$artifact_json" | jq -r '.name')" + current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')" + current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run" + mkdir -p "$current_output_dir" + if timeout "$download_timeout_seconds" "$GH_BIN" run download "$candidate_run" \ + --repo "$repo" \ + --name "$current_artifact_name" \ + --dir "$current_output_dir"; then + if [ -z "$run_id" ]; then + run_id="$candidate_run" + artifact_name="$current_artifact_name" + artifact_id="$current_artifact_id" + fi + jq -cn \ + --arg runId "$candidate_run" \ + --arg artifactName "$current_artifact_name" \ + --arg artifactId "$current_artifact_id" \ + --arg path "run-$candidate_run" \ + '{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \ + >>"$downloaded_runs_file" + else + status="$?" + rm -rf "$current_output_dir" + if [ "$status" -eq 124 ]; then + echo "::notice::timed out after ${download_timeout_seconds}s downloading baseline artifact $current_artifact_name from run $candidate_run; skipping candidate" + else + echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run (exit $status)" + fi + fi + fi + done + + write_baseline_observation_counts + + if [ -z "$run_id" ] || [ -z "$artifact_name" ]; then + echo "::notice::no successful baseline run found for $repo workflow=$workflow branch=$branch" + exit 0 + fi + + jq -n \ + --slurpfile runs "$downloaded_runs_file" \ + --slurpfile seedRuns "$seed_runs_file" \ + --slurpfile observationCounts "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" \ + --argjson schemaVersion 1 \ + --arg repository "$repo" \ + --arg workflow "$workflow" \ + --arg branch "$branch" \ + --arg runId "$run_id" \ + --arg artifactName "$artifact_name" \ + --arg artifactId "$artifact_id" \ + '{ + schemaVersion: $schemaVersion, + source: 
"github-actions-artifact", + repository: $repository, + workflow: $workflow, + branch: $branch, + runId: $runId, + artifactName: $artifactName, + artifactId: $artifactId, + seedRuns: ($seedRuns[0] // []), + runs: $runs, + observationCounts: ($observationCounts[0] // null) + }' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json" + + echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR" + + - name: 'Download previous artifact: source-shape' + shell: bash + env: + GH_TOKEN: ${{ github.token }} + BASELINE_ARTIFACT_NAME: source-shape + BASELINE_OUTPUT_DIR: tmp/ci-measurement-report/baseline/source-shape + BASELINE_WORKFLOW_NAME: ${{ github.workflow }} + BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }} + BASELINE_SEED_RUNS_JSON: '[{"runId":"26085158592","label":"main baseline","sha":"ce7cf8f8ebfaa1da6c7e9122cd195a5f95ce2fca","source":"manual-backfill","artifacts":["source-shape"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."}]' + BASELINE_MAX_RUNS: '20' + BASELINE_MAX_CANDIDATE_RUNS: '60' + BASELINE_REQUIRED_OBSERVATIONS_JSON: '[]' + BASELINE_DOWNLOAD_TIMEOUT_SECONDS: '120' + run: | + set -euo pipefail + + mkdir -p "$BASELINE_OUTPUT_DIR" + + if command -v gh >/dev/null 2>&1; then + GH_BIN="$(command -v gh)" + else + echo "::notice::gh is not on PATH; resolving GitHub CLI through Nix" + if ! 
GH_BIN="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)/bin/gh"; then + echo "::notice::unable to resolve GitHub CLI through Nix; skipping previous artifact download" + exit 0 + fi + fi + echo "Using GitHub CLI: $GH_BIN" + + repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}" + workflow="${BASELINE_WORKFLOW_NAME:-CI}" + branch="${BASELINE_BRANCH:-${GITHUB_BASE_REF:-${GITHUB_REF_NAME:-main}}}" + seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json" + required_observations_file="$BASELINE_OUTPUT_DIR/baseline-required-observations.json" + printf '%s' "${BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file" + printf '%s' "${BASELINE_REQUIRED_OBSERVATIONS_JSON:-[]}" >"$required_observations_file" + if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \ + "$seed_runs_file" >/dev/null; then + echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields" + exit 1 + fi + if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.id | type == "string") and (.minSources | type == "number")) else false end' \ + "$required_observations_file" >/dev/null; then + echo "::error::BASELINE_REQUIRED_OBSERVATIONS_JSON must be an array of objects with string id and numeric minSources fields" + exit 1 + fi + seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")" + required_observation_count="$(jq 'length' "$required_observations_file")" + max_candidate_runs="${BASELINE_MAX_CANDIDATE_RUNS:-${BASELINE_MAX_RUNS:-5}}" + if ! 
[[ "$max_candidate_runs" =~ ^[0-9]+$ ]] || [ "$max_candidate_runs" -lt 1 ]; then + max_candidate_runs=1 + fi + + candidate_runs="$( + "$GH_BIN" run list \ + --repo "$repo" \ + --workflow "$workflow" \ + --branch "$branch" \ + --event push \ + --status success \ + --json databaseId,headSha \ + --limit "$max_candidate_runs" \ + --jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]' + )" + + candidate_runs="$seed_run_ids + $candidate_runs" + + max_runs="${BASELINE_MAX_RUNS:-5}" + if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then + max_runs=1 + fi + download_timeout_seconds="${BASELINE_DOWNLOAD_TIMEOUT_SECONDS:-120}" + if ! [[ "$download_timeout_seconds" =~ ^[0-9]+$ ]] || [ "$download_timeout_seconds" -lt 1 ]; then + download_timeout_seconds=120 + fi + + write_baseline_observation_counts() { + local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt" + local counts_file="$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" + find "$BASELINE_OUTPUT_DIR" \ + -mindepth 2 \ + -maxdepth 2 \ + -name measurements.json \ + -type f \ + -print \ + | sort >"$measurement_index" || true + + if [ -s "$measurement_index" ]; then + xargs -r jq -s \ + --slurpfile required "$required_observations_file" \ + ' + ([.[] | (.observations // [])[]? | select(.value | type == "number") | .id] | sort | group_by(.) | map({id: .[0], sources: length})) as $counts + | ($required[0] // []) as $requiredRows + | { + counts: $counts, + required: ( + $requiredRows + | map(. as $requiredRow | ($counts | map(select(.id == $requiredRow.id)) | .[0].sources // 0) as $actual | $requiredRow + {sources:$actual, satisfied:($actual >= $requiredRow.minSources)}) + ) + } + ' <"$measurement_index" >"$counts_file" + else + jq -n --slurpfile required "$required_observations_file" \ + '{counts: [], required: (($required[0] // []) | map(. 
+ {sources:0, satisfied:false}))}' >"$counts_file" + fi + } + + baseline_requirements_satisfied() { + if [ "$required_observation_count" -eq 0 ]; then + return 1 + fi + write_baseline_observation_counts + jq -e '.required | all(.satisfied == true)' "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" >/dev/null + } + + run_id="" + artifact_name="" + artifact_id="" + downloaded_runs_file="$BASELINE_OUTPUT_DIR/baseline-runs.jsonl" + seen_runs_file="$BASELINE_OUTPUT_DIR/baseline-seen-runs.txt" + : >"$downloaded_runs_file" + : >"$seen_runs_file" + for candidate_run in $candidate_runs; do + if [ -z "$candidate_run" ]; then + continue + fi + if grep -qxF "$candidate_run" "$seen_runs_file"; then + continue + fi + downloaded_count="$(wc -l <"$downloaded_runs_file" | tr -d ' ')" + if [ "$downloaded_count" -ge "$max_runs" ]; then + if baseline_requirements_satisfied; then + break + fi + echo "::notice::downloaded $downloaded_count baseline artifact(s), but required observation counts are not satisfied yet; continuing through bounded candidate history" + fi + if [ "$(wc -l <"$seen_runs_file" | tr -d ' ')" -ge "$max_candidate_runs" ]; then + break + fi + printf '%s\n' "$candidate_run" >>"$seen_runs_file" - compare_baseline + artifact_json="$( + "$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \ + | jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts + | map(select(.expired == false)) + | map(select(.name == $artifactName or (.name | startswith($artifactName + "-")))) + | sort_by(.created_at // "") + | reverse + | .[0] // empty' + )" - if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then - { - echo "### Devenv perf" - echo "" - echo "| Probe | Status | Duration |" - echo "| --- | ---: | ---: |" - jq -r '.[] | "| \(.label // .id) | \(.status) | \(.durationMs) ms |"' "$ARTIFACT_DIR/timings.json" - echo "" - echo "- Artifact directory: \`$ARTIFACT_DIR\`" - echo "- OTEL service: \`${OTEL_SERVICE_NAME:-unknown}\`" - echo "" - echo "#### Regression comparison" - 
echo "" - if [ -f "$ARTIFACT_DIR/perf-comparison.json" ]; then - jq -r '["- Status: " + .status, "- Mode: " + .mode, "- Baseline: " + (.baseline // "none")] | .[]' "$ARTIFACT_DIR/perf-comparison.json" + if [ -n "$artifact_json" ]; then + current_artifact_name="$(printf '%s' "$artifact_json" | jq -r '.name')" + current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')" + current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run" + mkdir -p "$current_output_dir" + if timeout "$download_timeout_seconds" "$GH_BIN" run download "$candidate_run" \ + --repo "$repo" \ + --name "$current_artifact_name" \ + --dir "$current_output_dir"; then + if [ -z "$run_id" ]; then + run_id="$candidate_run" + artifact_name="$current_artifact_name" + artifact_id="$current_artifact_id" + fi + jq -cn \ + --arg runId "$candidate_run" \ + --arg artifactName "$current_artifact_name" \ + --arg artifactId "$current_artifact_id" \ + --arg path "run-$candidate_run" \ + '{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \ + >>"$downloaded_runs_file" + else + status="$?" 
+ rm -rf "$current_output_dir" + if [ "$status" -eq 124 ]; then + echo "::notice::timed out after ${download_timeout_seconds}s downloading baseline artifact $current_artifact_name from run $candidate_run; skipping candidate" + else + echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run (exit $status)" + fi fi - } >>"$GITHUB_STEP_SUMMARY" + fi + done + + write_baseline_observation_counts + + if [ -z "$run_id" ] || [ -z "$artifact_name" ]; then + echo "::notice::no successful baseline run found for $repo workflow=$workflow branch=$branch" + exit 0 fi - cat "$ARTIFACT_DIR/timings.pretty.json" + jq -n \ + --slurpfile runs "$downloaded_runs_file" \ + --slurpfile seedRuns "$seed_runs_file" \ + --slurpfile observationCounts "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" \ + --argjson schemaVersion 1 \ + --arg repository "$repo" \ + --arg workflow "$workflow" \ + --arg branch "$branch" \ + --arg runId "$run_id" \ + --arg artifactName "$artifact_name" \ + --arg artifactId "$artifact_id" \ + '{ + schemaVersion: $schemaVersion, + source: "github-actions-artifact", + repository: $repository, + workflow: $workflow, + branch: $branch, + runId: $runId, + artifactName: $artifactName, + artifactId: $artifactId, + seedRuns: ($seedRuns[0] // []), + runs: $runs, + observationCounts: ($observationCounts[0] // null) + }' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json" + + echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR" - name: Compare CI measurements with baseline shell: bash env: - CI_MEASUREMENT_CURRENT_DIR: tmp/devenv-perf-ci - CI_MEASUREMENT_BASELINE_DIR: tmp/devenv-perf-ci/baseline - CI_MEASUREMENT_COMPARISON_FILE: tmp/devenv-perf-ci/measurement-comparison.json + CI_MEASUREMENT_CURRENT_DIR: tmp/ci-measurement-report/current + CI_MEASUREMENT_BASELINE_DIR: tmp/ci-measurement-report/baseline + CI_MEASUREMENT_COMPARISON_FILE: 
tmp/ci-measurement-report/measurement-comparison.json CI_MEASUREMENT_REGRESSION_MODE: warn CI_MEASUREMENT_PR_COMMENT_ENABLED: 'true' - CI_MEASUREMENT_PR_COMMENT_TITLE: Devenv Performance - CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '8' + CI_MEASUREMENT_PR_COMMENT_TITLE: CI Measurements + CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '16' CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: '20' CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH: ci-measurement-assets + GH_TOKEN: ${{ github.token }} run: | set -euo pipefail @@ -2934,8 +4892,10 @@ jobs: current_index="$(mktemp)" baseline_index="$(mktemp)" - find "$current_dir" -path "$baseline_dir" -prune -o -name measurements.json -type f -print | sort >"$current_index" || true - find "$baseline_dir" -name measurements.json -type f -print | sort >"$baseline_index" || true + find "$current_dir" -name baseline -type d -prune -o -name measurements.json -type f -print | sort >"$current_index" || true + { + find "$baseline_dir" -name baseline -type d ! -path "$baseline_dir" -prune -o -name measurements.json -type f -print + } | sort -u >"$baseline_index" || true if [ ! -s "$current_index" ]; then echo "::error::no current measurements.json files found under $current_dir" @@ -2962,7 +4922,7 @@ jobs: def identity_dimensions: (.dimensions // {}) | to_entries - | map(select(.key as $key | ["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount"] | index($key) | not)) + | map(select(.key as $key | ["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount", "measuredSampleCount"] | index($key) | not)) | sort_by(.key) | map("\(.key)=\(.value|tostring)") | join(","); @@ -2985,6 +4945,15 @@ jobs: else (($sorted[($count / 2 - 1)] + $sorted[($count / 2)]) / 2) end; + def percentile($p): + sort as $sorted + | ($sorted | length) as $count + | if $count == 0 then null + else $sorted[(($p * ($count - 1)) | floor)] + end; + + def abs_value: if . < 0 then -. else . end; + def observations_by_key($docs): reduce $docs[]? 
as $doc ({}; @@ -3001,29 +4970,48 @@ jobs: def observation_stats($items): ($items | map(.observation.value)) as $values - | ($items | map(.observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount + | ($items | map(.observation.comparison.baseline // empty)) as $pairedBaselineValues + | ($items | map(.observation.statistics.pairedDeltaMedian // empty)) as $pairedDeltaMedianValues + | ($items | map(.observation.statistics.pairedDeltaP25 // empty)) as $pairedDeltaP25Values + | ($items | map(.observation.statistics.pairedDeltaP75 // empty)) as $pairedDeltaP75Values + | ($items | map(.observation.statistics.pairedDeltaMad // empty)) as $pairedDeltaMadValues + | ($items | map(.observation.statistics.pairedDeltaSamples // []) | add // []) as $pairedDeltaSampleValues + | ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount + | ($values | median) as $median | { target: ($items[0].target // {}), observation: ($items[-1].observation // {}), - value: ($values | median), + measurementKind: ($items[-1].observation.measurementKind // null), + value: $median, min: ($values | min), max: ($values | max), + p25: ($values | percentile(0.25)), + p75: ($values | percentile(0.75)), + p95: ($values | percentile(0.95)), + mad: ($values | map(. - $median | if . < 0 then -. else . 
end) | median), sourceCount: ($items | length), sampleCount: $sampleCount, + pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0), + pairedBaselineValue: (if ($pairedBaselineValues | length) == 0 then null else ($pairedBaselineValues | median) end), + pairedDeltaMedianValue: (if ($pairedDeltaMedianValues | length) == 0 then null else ($pairedDeltaMedianValues | median) end), + pairedDeltaP25Value: (if ($pairedDeltaP25Values | length) == 0 then null else ($pairedDeltaP25Values | median) end), + pairedDeltaP75Value: (if ($pairedDeltaP75Values | length) == 0 then null else ($pairedDeltaP75Values | median) end), + pairedDeltaMadValue: (if ($pairedDeltaMadValues | length) == 0 then null else ($pairedDeltaMadValues | median) end), + pairedDeltaSampleValues: $pairedDeltaSampleValues, generatedAt: ($items[-1].generatedAt // null) }; def budget($metric; $unit): if $metric == "nix.closure.nar_size" then - {warnRatio:1.10, failRatio:1.25, warnAbs:52428800, failAbs:209715200} + {warnRatio:1.05, failRatio:1.10, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10485760} elif $metric == "nix.closure.bucket.nar_size" then - {warnRatio:1.15, failRatio:1.35, warnAbs:52428800, failAbs:209715200} + {warnRatio:1.10, failRatio:1.20, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.05, statisticalToleranceAbs:10485760} elif $metric == "nix.closure.path_count" then - {warnRatio:1.10, failRatio:1.25, warnAbs:100, failAbs:500} + {warnRatio:1.05, failRatio:1.10, warnAbs:100, failAbs:500, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10} elif $unit == "seconds" then - {warnRatio:1.25, failRatio:1.50, warnAbs:1.5, failAbs:3.0} + {warnRatio:1.10, failRatio:1.20, warnAbs:0.25, failAbs:1, statisticalToleranceRatio:0.10, statisticalToleranceAbs:0.25} else - {warnRatio:1.25, failRatio:1.50, warnAbs:1, failAbs:3} + {warnRatio:1.25, failRatio:1.50, 
warnAbs:1, failAbs:3, statisticalToleranceRatio:0.10, statisticalToleranceAbs:1} end; def noise_floor($metric; $unit): @@ -3032,13 +5020,68 @@ jobs: elif $unit == "seconds" then 0.1 else 0 end; - def abs_value: if . < 0 then -. else . end; - - def classify($metric; $unit; $current; $baseline; $baselineMin; $baselineMax; $currentSamples; $baselineSources): + def default_policy($metric; $unit): budget($metric; $unit) as $b | noise_floor($metric; $unit) as $noise + | $b + { + enabled:true, + comparisonMode:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then "budget" else "historical" end), + minBaselineSources:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then 1 else 10 end), + minCurrentSamples:(if $unit == "seconds" then 3 else 1 end), + minPairedSamples:(if $unit == "seconds" then 5 else 0 end), + noiseFloor:$noise + }; + def observation_policy($obs): + default_policy($obs.name // "unknown"; $obs.unit // "unknown") + ($obs.policy // {}); + def policy_enabled($policy): + if ($policy | has("enabled")) then $policy.enabled else true end; + + def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad; $pairedDeltaValues): + $policy as $b + | ($policy.comparisonMode // (if $measurementKind == "deterministic" or $unit != "seconds" then "budget" elif $measurementKind == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode + | ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise | ($current - $baseline) as $delta + | (if $comparisonMode == "paired" and $pairedDeltaMedian != null then 
$pairedDeltaMedian else $delta end) as $evidenceDelta + | (($policy.pairedEvidenceQuantile // 0.25) | tonumber) as $pairedEvidenceQuantile | (if $baseline > 0 then ($current / $baseline) else null end) as $ratio + | (($baselineP75 // $baseline) - ($baselineP25 // $baseline)) as $iqr + | (($currentP75 // $current) - ($currentP25 // $current)) as $currentIqr + | (($pairedDeltaP75 // $evidenceDelta) - ($pairedDeltaP25 // $evidenceDelta)) as $pairedDeltaIqr + | ([ + $noise, + (($policy.statisticalToleranceAbs // 0) | tonumber), + (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end), + (($baselineMad // 0) * 3), + (($iqr // 0) * 1.5) + ] | max) as $robustTolerance + | (if $currentSamples > 1 then ([ + $noise, + (($policy.statisticalToleranceAbs // 0) | tonumber), + (if $current > 0 then ($current * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end), + (($currentMad // 0) * 3), + (($currentIqr // 0) * 1.5) + ] | max) else 0 end) as $currentRobustTolerance + | ([ + $noise, + (($policy.statisticalToleranceAbs // 0) | tonumber), + (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end), + (($pairedDeltaMad // 0) * 3), + (($pairedDeltaIqr // 0) * 1.5) + ] | max) as $pairedDeltaTolerance + | ($baseline + $robustTolerance) as $robustUpper + | ($baseline - $robustTolerance) as $robustLower + | ($current + $currentRobustTolerance) as $currentRobustUpper + | ($current - $currentRobustTolerance) as $currentRobustLower + | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile($pairedEvidenceQuantile)) else ($evidenceDelta - $pairedDeltaTolerance) end) as $evidenceDeltaLower + | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile(1 - $pairedEvidenceQuantile)) else ($evidenceDelta + $pairedDeltaTolerance) end) as $evidenceDeltaUpper + | ([($b.warnAbs // 0), (if 
$baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget + | ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget + | ($comparisonMode != "paired") as $needsHistoricalBaselineCount + | ( + ($current >= $robustLower and $current <= $robustUpper) + or ($currentRobustTolerance > 0 and $currentRobustLower <= $robustUpper and $currentRobustUpper >= $robustLower) + ) as $withinRobustBand + | ($comparisonMode == "historical" and $measurementKind != "deterministic") as $canUseRobustBandSuppression | ( $baselineMin != null and $baselineMax != null @@ -3047,35 +5090,90 @@ jobs: ) as $withinBaselineRange | ( if $baseline <= 0 then "unknown" + elif $comparisonMode == "paired" and $evidenceDeltaLower > $failBudget then "fail" + elif $comparisonMode == "paired" and $evidenceDeltaLower > $warnBudget then "warn" + elif $comparisonMode == "paired" then "pass" elif ($delta > $b.failAbs and $current > ($baseline * $b.failRatio)) then "fail" elif ($delta > $b.warnAbs and $current > ($baseline * $b.warnRatio)) then "warn" else "pass" end ) as $thresholdStatus + | ( + policy_enabled($policy) == true + and $baseline > 0 + and (if $needsHistoricalBaselineCount then $baselineSources >= ($policy.minBaselineSources // 1) else true end) + and $currentSamples >= ($policy.minCurrentSamples // 1) + and (if $comparisonMode == "paired" then $pairedSamples >= ($policy.minPairedSamples // 1) else true end) + and (if $comparisonMode == "paired" then $pairedDeltaMedian != null else true end) + ) as $gateable + | ( + if (policy_enabled($policy) != true) then "disabled" + elif $baseline <= 0 then "missing_baseline" + elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count" + elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count" + elif $comparisonMode == "paired" and 
$pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count" + elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta" + else "eligible" + end + ) as $gateReason | ( if $baseline <= 0 then "unknown" + elif (policy_enabled($policy) != true) then "diagnostic" elif ($delta | abs_value) <= $noise then "noise_floor" - elif ($withinBaselineRange and $thresholdStatus == "pass") then "within_baseline_range" - elif ($baselineSources < 3 or $currentSamples < 3) then "low_sample_count" + elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count" + elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count" + elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count" + elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta" + elif $comparisonMode == "paired" and $thresholdStatus == "pass" and $evidenceDelta > $warnBudget then "paired_uncertain" + elif ($canUseRobustBandSuppression and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band" elif $thresholdStatus == "pass" then "within_budget" else "threshold_exceeded" end ) as $confidence | ( - if $confidence == "threshold_exceeded" then $thresholdStatus + if ($gateable and $confidence == "threshold_exceeded") then $thresholdStatus elif $thresholdStatus == "unknown" then "unknown" else "pass" end ) as $status | ( if $baseline <= 0 then "unknown" + elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then "unchanged" + elif $comparisonMode == "paired" and $evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0 then "unchanged" + elif $comparisonMode == "paired" and $evidenceDelta < 0 then "improved" + elif $comparisonMode == "paired" then "regressed" elif ($delta | abs_value) <= $noise then "unchanged" - elif ($withinBaselineRange and $thresholdStatus == 
"pass") then "unchanged" + elif $canUseRobustBandSuppression and $withinRobustBand then "unchanged" elif $delta < 0 then "improved" else "regressed" + end + ) as $direction + | ( + if $baseline <= 0 then null + elif (policy_enabled($policy) != true) then null + elif $comparisonMode == "paired" and ($evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0) then 0 + elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then 0 + elif $comparisonMode == "paired" and $evidenceDelta > 0 then ([0, $evidenceDeltaLower] | max) / $warnBudget + elif $comparisonMode == "paired" then -(([0, (-$evidenceDeltaUpper)] | max) / $warnBudget) + elif $canUseRobustBandSuppression and $withinRobustBand then 0 + elif ($delta | abs_value) <= $noise then 0 + elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget + elif ($confidence == "threshold_exceeded" and $delta < 0) then -(([0, ($robustLower - $currentRobustUpper), (-$delta)] | max) / $warnBudget) + elif $delta > 0 then ([0, ($currentRobustLower - $robustUpper)] | max) / $warnBudget + else -(([0, ($robustLower - $currentRobustUpper)] | max) / $warnBudget) end - ) as $direction - | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,confidence:$confidence,direction:$direction}; + ) as $semanticImpactScore + | ( + if (policy_enabled($policy) != true) then "diagnostic" + elif $semanticImpactScore == null then "unknown" + elif $semanticImpactScore == 0 then "neutral" + elif $semanticImpactScore >= ($failBudget / $warnBudget) then "fail_boundary" + elif $semanticImpactScore >= 1 then "warn_boundary" + elif $semanticImpactScore > 0 then "below_warn_boundary" + else "improvement" + end + ) as $semanticImpactKind + | 
{status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples,evidenceDelta:$evidenceDelta,evidenceDeltaLower:$evidenceDeltaLower,evidenceDeltaUpper:$evidenceDeltaUpper,evidenceDeltaTolerance:$pairedDeltaTolerance,pairedEvidenceQuantile:$pairedEvidenceQuantile,pairedEvidenceProtocol:(if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then "paired-delta-quantile-v1" elif $comparisonMode == "paired" then "paired-summary-robust-band-v1" else null end)}; (observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs | (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs @@ -3086,38 +5184,74 @@ jobs: .key as $key | .value as $currentValue | ($baselineObs[$key] // null) as $baselineValue + | ($currentValue.observation | observation_policy(.)) as $policy + | ($policy.comparisonMode // (if ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "deterministic" or ($currentValue.observation.unit // "") != "seconds" then "budget" elif ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode + | ($currentValue.pairedBaselineValue // null) as $pairedBaselineValue + | (if $comparisonMode == "paired" and $pairedBaselineValue != null then { + value: 
$pairedBaselineValue, + min: $pairedBaselineValue, + max: $pairedBaselineValue, + p25: $pairedBaselineValue, + p75: $pairedBaselineValue, + p95: $pairedBaselineValue, + mad: 0, + sourceCount: $currentValue.pairedSampleCount + } else $baselineValue end) as $effectiveBaselineValue | { key: $key, value: ( - if $baselineValue == null then + if $effectiveBaselineValue == null then { status: "missing_baseline", target: $currentValue.target, observation: $currentValue.observation, - current: $currentValue.value, - currentSamples: $currentValue.sampleCount, - baselineSources: 0, - confidence: "missing_baseline", - direction: "unknown" - } - else - classify( - $currentValue.observation.name; - $currentValue.observation.unit; - $currentValue.value; - $baselineValue.value; - $baselineValue.min; - $baselineValue.max; - $currentValue.sampleCount; - $baselineValue.sourceCount - ) + { + current: $currentValue.value, + currentSamples: $currentValue.sampleCount, + baselineSources: 0, + gatePolicy: $policy, + comparisonMode: $comparisonMode, + gateable: false, + gateReason: "missing_baseline", + confidence: "missing_baseline", + direction: "unknown" + } + else + classify( + $currentValue.observation.name; + $currentValue.observation.unit; + ($currentValue.observation.measurementKind // $currentValue.measurementKind); + $policy; + $currentValue.value; + $currentValue.p25; + $currentValue.p75; + $currentValue.mad; + $effectiveBaselineValue.value; + $effectiveBaselineValue.min; + $effectiveBaselineValue.max; + $effectiveBaselineValue.p25; + $effectiveBaselineValue.p75; + $effectiveBaselineValue.p95; + $effectiveBaselineValue.mad; + $currentValue.sampleCount; + $effectiveBaselineValue.sourceCount; + $currentValue.pairedSampleCount; + $currentValue.pairedDeltaMedianValue; + $currentValue.pairedDeltaP25Value; + $currentValue.pairedDeltaP75Value; + $currentValue.pairedDeltaMadValue; + ($currentValue.pairedDeltaSampleValues // []) + ) + { target: $currentValue.target, observation: 
$currentValue.observation, - currentSamples: $currentValue.sampleCount, - baselineSources: $baselineValue.sourceCount, - baselineMin: $baselineValue.min, - baselineMax: $baselineValue.max - } + currentSamples: $currentValue.sampleCount, + baselineSources: $effectiveBaselineValue.sourceCount, + baselineMin: $effectiveBaselineValue.min, + baselineMax: $effectiveBaselineValue.max, + baselineP25: $effectiveBaselineValue.p25, + baselineP75: $effectiveBaselineValue.p75, + baselineP95: $effectiveBaselineValue.p95 + ,baselineMad: $effectiveBaselineValue.mad + } end ) } @@ -3127,14 +5261,38 @@ jobs: | ( if any($comparisons[]?; .status == "fail") then "fail" elif any($comparisons[]?; .status == "warn") then "warn" - elif any($comparisons[]?; .status == "missing_baseline") then "partial" + elif any($comparisons[]?; + (if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end) + and (.gateReason == "missing_baseline" + or .gateReason == "low_baseline_count" + or .gateReason == "low_current_sample_count" + or .gateReason == "low_paired_sample_count" + or .gateReason == "missing_paired_delta") + ) then "partial" else "pass" end ) as $status + | ( + [$comparisons[]?] + | { + enabledCount: (map(select((if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end))) | length), + gateableCount: (map(select(.gateable == true)) | length), + missingBaselineCount: (map(select(.gateReason == "missing_baseline")) | length), + lowBaselineCount: (map(select(.gateReason == "low_baseline_count")) | length), + lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length), + lowPairedSampleCount: (map(select(.gateReason == "low_paired_sample_count")) | length), + missingPairedDeltaCount: (map(select(.gateReason == "missing_paired_delta")) | length) + } + | . 
+ { + nonGateableCount: (.enabledCount - .gateableCount), + enforceable: (.enabledCount == .gateableCount) + } + ) as $readiness | { schemaVersion:$schemaVersion, status:$status, mode:$mode, + readiness:$readiness, currentDir:$currentDir, baselineDir:$baselineDir, comparisons:$comparisons @@ -3161,7 +5319,7 @@ jobs: echo "::warning::CI measurement regression threshold exceeded" ;; partial:*) - echo "::notice::CI measurement baseline is missing for one or more observations" + echo "::notice::CI measurement comparison is partial because one or more enabled observations are not gateable" ;; esac @@ -3169,10 +5327,10 @@ jobs: { echo "### ${CI_MEASUREMENT_PR_COMMENT_TITLE:-CI Measurements}" echo "" - jq -r '"- Status: " + .status + "\n- Mode: " + .mode + "\n- Baseline: " + .baselineDir' "$comparison_file" + jq -r '"- Status: " + .status + "\n- Gate: " + (if .mode == "fail" then "enforced" elif .mode == "warn" then "advisory" elif .mode == "off" then "off" else (.mode // "unknown") end) + "\n- Baseline: " + .baselineDir' "$comparison_file" echo "" - echo "| Status | Target | Observation | Current | Baseline | Delta | Ratio |" - echo "| --- | --- | --- | ---: | ---: | ---: | ---: |" + echo "| Status | Gate | Target | Observation | Current | Baseline | Delta | Ratio |" + echo "| --- | --- | --- | --- | ---: | ---: | ---: | ---: |" jq -r ' .comparisons | to_entries @@ -3186,9 +5344,10 @@ jobs: | .[:20] | .[] | .value as $v - | [ - $v.status, - (($v.target.kind // "unknown") + "/" + ($v.target.name // "unknown") + "/" + ($v.target.system // "unknown")), + | [ + $v.status, + (if ($v.gateable // false) then "yes" else ($v.gateReason // "no") end), + (($v.target.kind // "unknown") + "/" + ($v.target.name // "unknown") + "/" + ($v.target.system // "unknown")), ($v.observation.name // "unknown"), (($v.current // $v.observation.value // 0) | tostring), (($v.baseline // "") | tostring), @@ -3200,18 +5359,47 @@ jobs: } >>"$GITHUB_STEP_SUMMARY" fi - if [ 
"${CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ] && [ "${GITHUB_EVENT_NAME:-}" = "pull_request" ]; then + if [ "${CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ]; then + if [ "${GITHUB_EVENT_NAME:-}" != "pull_request" ]; then + echo "::notice::CI measurement PR comments are produced only by pull_request workflows; skipping comment for event ${GITHUB_EVENT_NAME:-unknown}" + exit 0 + fi + can_render_pr_comment=true - if ! command -v gh >/dev/null 2>&1; then - echo "::notice::gh is not available; skipping CI measurement PR comment" + + ensure_ci_measurement_tool() { + tool_name="$1" + nix_attr="$2" + if command -v "$tool_name" >/dev/null 2>&1; then + return 0 + fi + if ! command -v nix >/dev/null 2>&1; then + return 1 + fi + if tool_out="$(nix build --no-link --print-out-paths "nixpkgs#$nix_attr" 2>/dev/null)"; then + export PATH="$tool_out/bin:$PATH" + fi + command -v "$tool_name" >/dev/null 2>&1 + } + + if ! ensure_ci_measurement_tool gh gh; then + echo "::error::gh is not available; unable to publish required CI measurement PR comment" can_render_pr_comment=false fi - if ! command -v jq >/dev/null 2>&1; then - echo "::notice::jq is not available; skipping CI measurement PR comment" + if ! ensure_ci_measurement_tool node nodejs; then + echo "::error::node is not available; unable to publish required CI measurement PR comment" can_render_pr_comment=false fi + if ! 
command -v jq >/dev/null 2>&1; then + if ensure_ci_measurement_tool jq jq; then + : + else + echo "::error::jq is not available; unable to publish required CI measurement PR comment" + can_render_pr_comment=false + fi + fi if [ -z "${GH_TOKEN:-${GITHUB_TOKEN:-}}" ]; then - echo "::notice::GH_TOKEN/GITHUB_TOKEN is not set; skipping CI measurement PR comment" + echo "::error::GH_TOKEN/GITHUB_TOKEN is not set; unable to publish required CI measurement PR comment" can_render_pr_comment=false fi @@ -3221,10 +5409,14 @@ jobs: pr_number="$(jq -r '.pull_request.number // empty' "$event_path")" fi if [ "$can_render_pr_comment" = "true" ] && [ -z "$pr_number" ]; then - echo "::notice::pull request number is unavailable; skipping CI measurement PR comment" + echo "::error::pull request number is unavailable; unable to publish required CI measurement PR comment" can_render_pr_comment=false fi + if [ "$can_render_pr_comment" != "true" ]; then + exit 1 + fi + if [ "$can_render_pr_comment" = "true" ]; then repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}" comment_tmp_dir="$(mktemp -d)" @@ -3232,6 +5424,9 @@ jobs: comment_body="$comment_tmp_dir/comment.md" comment_id_file="$comment_tmp_dir/comment-id.txt" chart_file="$comment_tmp_dir/perf-change-vs-baseline.svg" + chart_dark_file="$comment_tmp_dir/perf-change-vs-baseline-dark.svg" + chart_png_file="$comment_tmp_dir/perf-change-vs-baseline.png" + chart_dark_png_file="$comment_tmp_dir/perf-change-vs-baseline-dark.png" renderer_script="$comment_tmp_dir/render-ci-measurement-comment.mjs" if ! 
gh api "repos/$repo/issues/$pr_number/comments" --paginate >"$comments_json"; then @@ -3245,21 +5440,44 @@ jobs: if [ -z "$asset_title" ]; then asset_title="ci-measurements" fi - asset_head_sha="${GITHUB_HEAD_SHA:-${GITHUB_SHA:-unknown}}" + asset_head_sha="${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_HEAD_SHA:-${GITHUB_SHA:-unknown}}}" asset_run_id="${GITHUB_RUN_ID:-local}" asset_run_attempt="${GITHUB_RUN_ATTEMPT:-0}" - asset_path="ci-measurements/pr-$pr_number/${asset_head_sha}/run-${asset_run_id}-attempt-${asset_run_attempt}/${asset_title}.svg" + asset_svg_path="ci-measurements/pr-$pr_number/${asset_head_sha}/run-${asset_run_id}-attempt-${asset_run_attempt}/${asset_title}.svg" + asset_png_path="ci-measurements/pr-$pr_number/${asset_head_sha}/run-${asset_run_id}-attempt-${asset_run_attempt}/${asset_title}.png" + asset_dark_png_path="ci-measurements/pr-$pr_number/${asset_head_sha}/run-${asset_run_id}-attempt-${asset_run_attempt}/${asset_title}-dark.png" + public_asset_command="${CI_MEASUREMENT_PR_COMMENT_PUBLIC_ASSET_COMMAND:-}" + repo_private="$(gh api "repos/$repo" --jq '.private // false' 2>/dev/null || printf 'true')" + require_public_asset=false + if [ "$repo_private" = "true" ]; then + require_public_asset=true + fi if [ "${GITHUB_SERVER_URL:-https://github.com}" = "https://github.com" ]; then - chart_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_path" + github_raw_chart_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_png_path" + github_raw_chart_dark_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_dark_png_path" + github_raw_chart_source_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_svg_path" + else + github_raw_chart_url="${GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_png_path" + github_raw_chart_dark_url="${GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_dark_png_path" + 
github_raw_chart_source_url="${GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_svg_path" + fi + if [ "$repo_private" = "true" ]; then + chart_url="" + chart_dark_url="" + chart_source_url="" else - chart_url="${GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_path" + chart_url="$github_raw_chart_url" + chart_dark_url="$github_raw_chart_dark_url" + chart_source_url="$github_raw_chart_source_url" fi export CI_MEASUREMENT_PR_COMMENT_CHART_URL="$chart_url" + export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL="$chart_dark_url" + export CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL="$chart_source_url" cat > "$renderer_script" <<'EOF' import { readFileSync, writeFileSync } from 'node:fs' - const [comparisonPath, commentsPath, bodyPath, commentIdPath, chartPath] = process.argv.slice(2) + const [comparisonPath, commentsPath, bodyPath, commentIdPath, chartPath, chartDarkPath] = process.argv.slice(2) const title = process.env.CI_MEASUREMENT_PR_COMMENT_TITLE || 'CI Measurements' const maxRows = Number.parseInt(process.env.CI_MEASUREMENT_PR_COMMENT_MAX_ROWS || '10', 10) const maxHistory = Number.parseInt(process.env.CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY || '20', 10) @@ -3267,13 +5485,20 @@ jobs: const runId = process.env.GITHUB_RUN_ID || '' const runAttempt = process.env.GITHUB_RUN_ATTEMPT || '' const sha = process.env.GITHUB_SHA || '' - const headSha = process.env.GITHUB_HEAD_SHA || sha + const headSha = process.env.CI_MEASUREMENT_SUBJECT_SHA || process.env.GITHUB_HEAD_SHA || sha const serverUrl = process.env.GITHUB_SERVER_URL || 'https://github.com' const workflow = process.env.GITHUB_WORKFLOW || 'CI' const job = process.env.GITHUB_JOB || '' const chartUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_URL || '' - - const marker = '' + const chartDarkUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL || '' + const chartSourceUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL || '' + + const markerScope = 
(process.env.CI_MEASUREMENT_PR_COMMENT_MARKER || title) + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/^-+|-+$/g, '') || 'default' + const marker = '' + const legacyMarker = '' const statePrefix = '' const stateTag = 'ci-measurement-comment-state' @@ -3284,7 +5509,9 @@ jobs: if (!Array.isArray(comments)) throw new Error('comments response must be an array') const existing = comments.find((comment) => { - return typeof comment?.body === 'string' && comment.body.includes(marker) + if (typeof comment?.body !== 'string') return false + return comment.body.includes(marker) || + (comment.body.includes(legacyMarker) && comment.body.includes('## ' + title)) }) const extractState = (body) => { @@ -3331,15 +5558,139 @@ jobs: return formatNumber(Math.round((value - 1) * 1000) / 10) + '%' } - const formatResult = (row) => { - if (row.confidence === 'low_sample_count') return 'gray needs repeat' - if (row.status === 'fail') return 'red regression' - if (row.status === 'warn') return 'yellow regression' - if (row.status === 'missing_baseline') return 'gray no baseline' - if (row.confidence === 'noise_floor') return 'gray noise floor' - if (row.confidence === 'within_baseline_range') return 'gray within range' - if (row.direction === 'improved') return 'green improved' - return 'gray unchanged' + const formatSemanticImpact = (value) => { + if (value === null || value === undefined || Number.isNaN(value)) return 'n/a' + if (Math.abs(value) < 0.005) return '0.00x' + const sign = value > 0 ? 
'+' : '' + return sign + formatNumber(Math.round(value * 100) / 100) + 'x' + } + + const formatRowImpact = (row) => { + if (row.confidence === 'diagnostic' || row.gateReason === 'disabled' || row.semanticImpactKind === 'diagnostic') { + return 'diagnostic' + } + return formatSemanticImpact(row.semanticImpactScore) + } + + const formatEvidence = (row) => { + const unit = row.observation?.unit + if (row.comparisonMode === 'paired' && typeof row.evidenceDeltaLower === 'number' && typeof row.evidenceDeltaUpper === 'number') { + const quantile = typeof row.pairedEvidenceQuantile === 'number' + ? Math.round(row.pairedEvidenceQuantile * 100) + : 25 + return (row.confidence || 'unknown') + + '
paired n=' + (row.pairedSamples ?? 0) + + ', ' + quantile + '-' + (100 - quantile) + '% delta ' + + formatValue(row.evidenceDeltaLower, unit) + + ' - ' + formatValue(row.evidenceDeltaUpper, unit) + + '' + } + return (row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + '' + } + + const interpretation = (row) => { + if (row.confidence === 'low_baseline_count') return { + label: 'Needs more baseline', + detail: 'Not enough compatible baseline runs to make this gate trustworthy.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.confidence === 'low_current_sample_count') return { + label: 'Needs repeat', + detail: 'Current run has too few successful measured samples.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.confidence === 'low_paired_sample_count') return { + label: 'Needs paired evidence', + detail: 'Wall-clock gates require same-run base/head samples before they can block merges.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.confidence === 'missing_paired_delta') return { + label: 'Needs paired delta stats', + detail: 'Wall-clock gates require per-pair delta statistics, not only paired medians.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.confidence === 'paired_uncertain') return { + label: 'Uncertain wall-clock movement', + detail: 'The paired median moved, but the paired delta band still crosses the configured budget.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.confidence === 'diagnostic') return { + label: 'Diagnostic only', + detail: 'Shown for investigation, but intentionally excluded from gating.', + tone: 'diagnostic', + color: '#a78bfa', + } + if (row.status === 'fail') return { + label: 'Regression - blocks merge', + detail: 'Worse than the configured fail threshold with enough samples.', + tone: 'bad', + color: '#ef4444', + } + if (row.status === 'warn') return { + label: 'Regression - review', + detail: 'Worse than the configured warning threshold.', + tone: 'warn', + color: '#f59e0b', + } + if (row.status === 'missing_baseline') return { + label: 'No baseline yet', + detail: 'Current value is measured, but no comparable baseline exists.', + tone: 'neutral', + color: '#94a3b8', + 
} + if (row.confidence === 'noise_floor') return { + label: 'Too small to matter', + detail: 'The absolute change is below the noise floor for this metric.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.confidence === 'within_baseline_range') return { + label: 'Historical range only', + detail: 'Inside the full historical min/max range, but this range is not used to pass a gate.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.confidence === 'within_robust_band' || row.confidence === 'within_baseline_distribution') return { + label: 'Within noise band', + detail: 'Current and baseline robust noise bands overlap.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.direction === 'improved' && typeof row.semanticImpactScore === 'number' && row.semanticImpactScore <= -1) return { + label: 'Meaningfully lower', + detail: 'Lower than baseline by enough to cross the configured review threshold.', + tone: 'good', + color: '#10b981', + } + if (row.direction === 'improved') return { + label: 'Slightly lower, ok', + detail: 'Lower than baseline, but still inside the configured review budget.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.direction === 'regressed') return { + label: 'Slightly higher, ok', + detail: 'Higher than baseline but still inside the configured budget.', + tone: 'neutral', + color: '#94a3b8', + } + return { + label: 'Unchanged', + detail: 'No meaningful movement from baseline.', + tone: 'neutral', + color: '#94a3b8', + } + } + + const formatGate = (row) => { + if (row.gateable) return 'yes' + const reason = row.gateReason || row.confidence || 'unknown' + return 'no
' + reason + '' } const escapeCell = (value) => String(value ?? '-').replaceAll('|', '\\|').replaceAll('\n', '
') @@ -3361,6 +5712,8 @@ jobs: task_pnpm_install: 'pnpm:install', task_genie_run: 'genie:run', task_check_quick: 'check:quick', + task_check_quick_warm: 'Warm cached check:quick', + task_check_quick_forced: 'Forced check:quick', } if (probe && labels[probe]) return labels[probe] if (name.startsWith('devenv.') && name.endsWith('.duration')) { @@ -3369,6 +5722,22 @@ jobs: return name } + const semanticPath = (row) => { + const parts = [ + ...(Array.isArray(row.target?.path) ? row.target.path : []), + row.target?.group, + ...(Array.isArray(row.observation?.path) ? row.observation.path : []), + row.observation?.group, + ].filter((value) => typeof value === 'string' && value.length > 0) + const seen = new Set() + const unique = parts.filter((part) => { + if (seen.has(part)) return false + seen.add(part) + return true + }) + return unique.length > 0 ? unique.join(' / ') : '-' + } + const chartProbe = (row) => { if (row.observation?.label) return row.observation.label const probe = row.observation?.dimensions?.probe @@ -3380,6 +5749,8 @@ jobs: task_pnpm_install: 'pnpm:install', task_genie_run: 'genie:run', task_check_quick: 'check:quick', + task_check_quick_warm: 'Warm cached check:quick', + task_check_quick_forced: 'Forced check:quick', } if (probe && labels[probe]) return labels[probe] return humanProbe(row) @@ -3397,40 +5768,165 @@ jobs: const rank = (row) => { if (row.status === 'fail') return 0 if (row.status === 'warn') return 1 - if (row.status === 'missing_baseline') return 2 - return 3 + if (row.status === 'missing_baseline') return 3 + return 2 } const allRows = Object.values(comparison.comparisons || {}).sort((left, right) => { const byRank = rank(left) - rank(right) if (byRank !== 0) return byRank - return (right.delta || 0) - (left.delta || 0) + const leftImpact = typeof left.semanticImpactScore === 'number' ? Math.abs(left.semanticImpactScore) : 0 + const rightImpact = typeof right.semanticImpactScore === 'number' ? 
Math.abs(right.semanticImpactScore) : 0 + if (rightImpact !== leftImpact) return rightImpact - leftImpact + const leftDelta = typeof left.delta === 'number' ? Math.abs(left.delta) : 0 + const rightDelta = typeof right.delta === 'number' ? Math.abs(right.delta) : 0 + if (rightDelta !== leftDelta) return rightDelta - leftDelta + return humanProbe(left).localeCompare(humanProbe(right)) }) + const protocolLabel = (() => { + const protocols = new Set( + allRows + .map((row) => row.observation?.dimensions?.measurementProtocol) + .filter((value) => typeof value === 'string' && value.length > 0), + ) + return protocols.size > 0 ? Array.from(protocols).join(', ') : 'legacy' + })() const visibleLimit = Number.isFinite(maxRows) && maxRows > 0 ? maxRows : 10 const comparableRows = allRows.filter((row) => typeof row.baseline === 'number') const hasComparableBaseline = comparableRows.length > 0 + const isDiagnosticRow = (row) => + row.status === 'missing_baseline' || + row.confidence === 'diagnostic' || + row.gateReason === 'disabled' || + row.semanticImpactKind === 'diagnostic' || + (!row.gateable && typeof row.baseline !== 'number') + const isZeroImpactRow = (row) => + typeof row.semanticImpactScore === 'number' && + !Number.isNaN(row.semanticImpactScore) && + Math.abs(row.semanticImpactScore) < 0.005 + const actionableComparableRows = comparableRows.filter((row) => !isDiagnosticRow(row)) const visibleRows = (hasComparableBaseline - ? allRows.filter((row) => typeof row.baseline === 'number') - : allRows.slice().sort((left, right) => (right.current || 0) - (left.current || 0)) + ? 
actionableComparableRows + : allRows.filter((row) => !isDiagnosticRow(row)).sort((left, right) => (right.current || 0) - (left.current || 0)) ).slice(0, visibleLimit) + const nonZeroImpactRows = actionableComparableRows.filter((row) => !isZeroImpactRow(row)) + const zeroImpactRows = actionableComparableRows.filter(isZeroImpactRow) + const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit) + const diagnosticRows = allRows.filter(isDiagnosticRow) + + const baselineToCurrent = (row) => { + const unit = row.observation?.unit + return formatValue(row.baseline, unit) + ' -> ' + formatValue(row.current, unit) + } + + const rawChange = (row) => { + const unit = row.observation?.unit + return formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio) + } + + const confidenceSummary = (row) => { + const unit = row.observation?.unit + if (row.comparisonMode === 'paired' && typeof row.evidenceDeltaLower === 'number' && typeof row.evidenceDeltaUpper === 'number') { + const quantile = typeof row.pairedEvidenceQuantile === 'number' + ? Math.round(row.pairedEvidenceQuantile * 100) + : 25 + return 'paired n=' + (row.pairedSamples ?? 0) + + ', ' + quantile + '-' + (100 - quantile) + '% delta ' + + formatValue(row.evidenceDeltaLower, unit) + + '..' + formatValue(row.evidenceDeltaUpper, unit) + } + return (row.confidence || 'unknown') + ', baseline n=' + (row.baselineSources ?? 0) + ', current n=' + (row.currentSamples ?? 1) + } + + const scanDecision = (row) => { + if (row.status === 'fail') return 'regression blocks' + if (row.status === 'warn') return 'regression review' + if (row.status === 'missing_baseline') return 'needs baseline' + if (row.direction === 'improved') return 'faster' + if (row.direction === 'regressed') return 'no material impact' + return 'unchanged' + } + + const scanTable = (rows) => { + if (rows.length === 0) return 'No non-zero actionable measurement impact detected.' + return [ + '| What changed? 
| Probe | Baseline -> current | Raw change | Impact | Confidence |', + '| --- | --- | --- | ---: | ---: | --- |', + ...rows.map((row) => { + return '| ' + [ + scanDecision(row), + humanProbe(row), + baselineToCurrent(row), + rawChange(row), + formatRowImpact(row), + confidenceSummary(row), + ].map(escapeCell).join(' | ') + ' |' + }), + ].join('\n') + } + + const zeroImpactTable = (rows) => { + if (rows.length === 0) return 'No zero-impact measurements.' + return [ + '| Probe | Baseline -> current | Raw change | Impact | Gate | Evidence | Why hidden |', + '| --- | --- | ---: | ---: | --- | --- | --- |', + ...rows.map((row) => { + const meaning = interpretation(row) + return '| ' + [ + humanProbe(row), + baselineToCurrent(row), + rawChange(row), + formatRowImpact(row), + row.gateable ? 'yes' : (row.gateReason || 'no'), + confidenceSummary(row), + meaning.label, + ].map(escapeCell).join(' | ') + ' |' + }), + ].join('\n') + } + + const diagnosticTable = (rows) => { + if (rows.length === 0) return 'No diagnostic or ungated measurements.' + return [ + '| Probe | Current | Baseline | Impact | Gate | Reason | Evidence |', + '| --- | ---: | ---: | ---: | --- | --- | --- |', + ...rows.map((row) => { + return '| ' + [ + humanProbe(row), + formatValue(row.current, row.observation?.unit), + formatValue(row.baseline, row.observation?.unit), + formatRowImpact(row), + row.gateable ? 'yes' : (row.gateReason || row.status || 'no'), + interpretation(row).label, + confidenceSummary(row), + ].map(escapeCell).join(' | ') + ' |' + }), + ].join('\n') + } const comparisonTable = (rows) => { if (rows.length === 0) return 'No measurement regressions detected.' 
return [ - '| Probe | Baseline | Current | Change | Result | Confidence |', - '| --- | ---: | ---: | ---: | --- | --- |', + '| Group | Measurement | Baseline | Current | Raw change | Impact | Meaning | Gate | Evidence |', + '| --- | --- | ---: | ---: | ---: | ---: | --- | --- | --- |', ...rows.map((row) => { const unit = row.observation?.unit - const baselineRange = typeof row.baselineMin === 'number' && typeof row.baselineMax === 'number' && row.baselineMin !== row.baselineMax - ? '
range ' + formatValue(row.baselineMin, unit) + ' - ' + formatValue(row.baselineMax, unit) + '' + const baselineRange = typeof row.baselineRobustLower === 'number' && typeof row.baselineRobustUpper === 'number' && row.baselineRobustLower !== row.baselineRobustUpper + ? '
noise band ' + formatValue(row.baselineRobustLower, unit) + ' - ' + formatValue(row.baselineRobustUpper, unit) + '' + : typeof row.baselineMin === 'number' && typeof row.baselineMax === 'number' && row.baselineMin !== row.baselineMax + ? '
range ' + formatValue(row.baselineMin, unit) + ' - ' + formatValue(row.baselineMax, unit) + '' : '' + const meaning = interpretation(row) return '| ' + [ + semanticPath(row), humanProbe(row), formatValue(row.baseline, unit) + baselineRange, formatValue(row.current, unit), formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio), - formatResult(row), - (row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + '', + formatRowImpact(row), + meaning.label + '
' + meaning.detail + '', + formatGate(row), + formatEvidence(row), ].map(escapeCell).join(' | ') + ' |' }), ].join('\n') @@ -3439,10 +5935,10 @@ jobs: const currentOnlyTable = (rows) => { if (rows.length === 0) return 'No current measurements found.' return [ - '| Probe | Current |', - '| --- | ---: |', + '| Group | Measurement | Current |', + '| --- | --- | ---: |', ...rows.map((row) => { - return '| ' + [humanProbe(row), formatValue(row.current, row.observation?.unit)].map(escapeCell).join(' | ') + ' |' + return '| ' + [semanticPath(row), humanProbe(row), formatValue(row.current, row.observation?.unit)].map(escapeCell).join(' | ') + ' |' }), ].join('\n') } @@ -3450,12 +5946,13 @@ jobs: const allMeasurementsTable = (rows) => { if (rows.length === 0) return 'No measurement regressions detected.' return [ - '| Status | Target | Observation | Dimensions | Baseline | Current | Delta | Ratio |', - '| --- | --- | --- | --- | ---: | ---: | ---: | ---: |', + '| Status | Gate | Target | Observation | Dimensions | Baseline | Current | Delta | Ratio | Impact |', + '| --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: |', ...rows.map((row) => { const unit = row.observation?.unit return '| ' + [ row.status, + row.gateable ? 
'yes' : (row.gateReason || 'no'), row.target?.label || row.target?.name || 'unknown', row.observation?.label || row.observation?.name || 'unknown', dimensions(row), @@ -3463,11 +5960,38 @@ jobs: formatValue(row.current, unit), formatDelta(row.delta, unit), formatRatio(row.ratio), + formatRowImpact(row), ].map(escapeCell).join(' | ') + ' |' }), ].join('\n') } + const sourceMeasurement = (row) => ({ + id: row.observation?.dimensions?.probe || row.observation?.name || humanProbe(row), + label: humanProbe(row), + group: semanticPath(row), + status: row.status, + direction: row.direction, + gateable: row.gateable, + gateReason: row.gateReason, + confidence: row.confidence, + comparisonMode: row.comparisonMode, + unit: row.observation?.unit, + baseline: row.baseline ?? null, + current: row.current ?? null, + delta: row.delta ?? null, + ratio: row.ratio ?? null, + semanticImpactScore: row.semanticImpactScore ?? null, + semanticImpactKind: row.semanticImpactKind ?? null, + baselineSources: row.baselineSources ?? null, + currentSamples: row.currentSamples ?? null, + pairedSamples: row.pairedSamples ?? null, + evidenceDeltaLower: row.evidenceDeltaLower ?? null, + evidenceDeltaUpper: row.evidenceDeltaUpper ?? null, + pairedEvidenceQuantile: row.pairedEvidenceQuantile ?? null, + dimensions: row.observation?.dimensions || {}, + }) + const truncate = (value, maxLength) => { const text = String(value) if (text.length <= maxLength) return text @@ -3475,70 +5999,121 @@ jobs: return text.slice(0, Math.max(0, maxLength - 3)) + '...' 
} - const renderPerfChangeSvg = (rows) => { + const renderPerfChangeSvg = (rows, theme = 'adaptive') => { const chartRows = rows - .filter((row) => row.observation?.unit === 'seconds') .filter((row) => typeof row.current === 'number' && typeof row.baseline === 'number') - .filter((row) => typeof row.ratio === 'number') - .sort((left, right) => ((left.ratio || 1) - 1) - ((right.ratio || 1) - 1)) + .filter((row) => row.gateable === true) + .filter((row) => typeof row.semanticImpactScore === 'number') + .sort((left, right) => (left.semanticImpactScore || 0) - (right.semanticImpactScore || 0)) .slice(0, visibleLimit) if (chartRows.length === 0) return '' - const percentages = chartRows.map((row) => ((row.ratio || 1) - 1) * 100) - const minPct = Math.min(-1, ...percentages) - const maxPct = Math.max(1, ...percentages) - const lower = Math.floor(minPct) - const upper = Math.ceil(maxPct) + const impactScores = chartRows.map((row) => row.semanticImpactScore || 0) + const minImpact = Math.min(-1, ...impactScores) + const maxImpact = Math.max(1, ...impactScores) + const lower = Math.floor(minImpact) + const upper = Math.ceil(maxImpact) const span = upper - lower || 1 - const width = 900 - const rowHeight = 42 - const height = 96 + chartRows.length * rowHeight + 34 - const labelX = 238 - const plotX = 260 - const plotWidth = 342 - const percentX = 626 - const nominalX = 704 - const topY = 78 + const width = 1040 + const rowHeight = 46 + const height = 112 + chartRows.length * rowHeight + 34 + const labelX = 230 + const plotX = 252 + const plotWidth = 320 + const impactX = 596 + const nominalX = 672 + const meaningX = 804 + const topY = 92 const barHeight = 18 const zeroX = plotX + ((0 - lower) / span) * plotWidth + const themeCss = theme === 'dark' + ? 
[ + ' .chart-bg { fill: #0d1117; }', + ' .chart-border { fill: none; stroke: #30363d; }', + ' .chart-title { fill: #f0f6fc; }', + ' .chart-muted { fill: #8b949e; }', + ' .chart-axis { stroke: #8b949e; }', + ' .chart-label { fill: #c9d1d9; }', + ' .chart-value { fill: #8b949e; }', + ' .chart-track { fill: #21262d; }', + ] + : [ + ' .chart-bg { fill: #ffffff; }', + ' .chart-border { fill: none; stroke: #d0d7de; }', + ' .chart-title { fill: #24292f; }', + ' .chart-muted { fill: #57606a; }', + ' .chart-axis { stroke: #8c959f; }', + ' .chart-label { fill: #24292f; }', + ' .chart-value { fill: #57606a; }', + ' .chart-track { fill: #f6f8fa; }', + ...(theme === 'adaptive' + ? [ + ' @media (prefers-color-scheme: dark) {', + ' .chart-bg { fill: #0d1117; }', + ' .chart-border { stroke: #30363d; }', + ' .chart-title { fill: #f0f6fc; }', + ' .chart-muted { fill: #8b949e; }', + ' .chart-axis { stroke: #8b949e; }', + ' .chart-label { fill: #c9d1d9; }', + ' .chart-value { fill: #8b949e; }', + ' .chart-track { fill: #21262d; }', + ' }', + ] + : []), + ] const svg = [ '', '', - '', - 'Perf change vs baseline (%)', - 'faster', - 'slower', - 'baseline -> current', - '', + '', + '', + '', + 'Actionable measurement impact', + '0 means no actionable PR impact; 1x reaches the warning budget.', + 'improved', + 'regressed', + 'impact', + 'baseline -> current', + 'meaning', + '', ] for (const [index, row] of chartRows.entries()) { - const pct = ((row.ratio || 1) - 1) * 100 + const impact = row.semanticImpactScore || 0 const y = topY + index * rowHeight - const valueWidth = Math.max(2, Math.abs(pct) / span * plotWidth) - const x = pct < 0 ? zeroX - valueWidth : zeroX - const color = pct < 0 ? '#20d6a3' : '#fb6b6b' - const formattedPct = (pct > 0 ? '+' : '') + formatNumber(Math.round(pct * 10) / 10) + '%' + const valueWidth = Math.max(2, Math.abs(impact) / span * plotWidth) + const x = impact < 0 ? 
zeroX - valueWidth : zeroX + const meaning = interpretation(row) + const color = meaning.color + const formattedImpact = formatSemanticImpact(impact) const label = chartProbe(row) const nominal = formatValue(row.baseline, row.observation?.unit).replaceAll(' ', '') + ' -> ' + formatValue(row.current, row.observation?.unit).replaceAll(' ', '') + const barOpacity = meaning.tone === 'neutral' ? '0.65' : '1' + const dash = meaning.tone === 'diagnostic' ? ' stroke-dasharray="3 3"' : '' svg.push( - '' + escapeXml(label) + '' + escapeXml(truncate(label, 30)) + '', - '', - '', - '' + escapeXml(formattedPct) + '', - '' + escapeXml(nominal) + '' + escapeXml(truncate(nominal, 24)) + '', + '' + escapeXml(label) + '' + escapeXml(truncate(label, 28)) + '', + '', + '', + '' + escapeXml(formattedImpact) + '', + '' + escapeXml(nominal) + '' + escapeXml(truncate(nominal, 21)) + '', + '' + escapeXml(meaning.detail) + '' + escapeXml(truncate(meaning.label, 30)) + '', ) } svg.push( - '0%', + '0', '', ) return svg.join('\n') } const statusWord = comparison.status || 'unknown' + const readiness = comparison.readiness || {} + const readinessLabel = readiness.enforceable + ? 'enforceable' + : 'partial (' + (readiness.gateableCount ?? 0) + '/' + (readiness.enabledCount ?? 0) + ' enabled observations gateable)' const runUrl = runId ? serverUrl + '/' + repo + '/actions/runs/' + runId : undefined const shortSha = (headSha || sha || 'unknown').slice(0, 7) const existingState = extractState(existing?.body) @@ -3556,22 +6131,35 @@ jobs: status: row.status, target: row.target?.label || row.target?.name || 'unknown', observation: row.observation?.label || row.observation?.name || 'unknown', + meaning: interpretation(row).label, dimensions: dimensions(row).replaceAll('
', ', '), baseline: formatValue(row.baseline, row.observation?.unit), current: formatValue(row.current, row.observation?.unit), delta: formatDelta(row.delta, row.observation?.unit), ratio: formatRatio(row.ratio), + impact: formatSemanticImpact(row.semanticImpactScore), })), } - const previousRuns = (existingState?.runs || []).filter((run) => run.commitSha !== currentRun.commitSha) + const hasComparableHistory = (run) => Array.isArray(run.visibleRows) && run.visibleRows.some((row) => + row.status !== 'missing_baseline' && + row.baseline !== 'n/a' && + row.ratio !== 'n/a' + ) + const previousRuns = (existingState?.runs || []).filter((run) => run.commitSha !== currentRun.commitSha && hasComparableHistory(run)) const historyLimit = Number.isFinite(maxHistory) && maxHistory > 0 ? maxHistory : 20 const state = { _tag: stateTag, schemaVersion, title, runs: [currentRun, ...previousRuns].slice(0, historyLimit) } + const gateModeLabel = (mode) => { + if (mode === 'fail') return 'enforced' + if (mode === 'warn') return 'advisory' + if (mode === 'off') return 'off' + return mode || 'unknown' + } const historyRows = state.runs.slice(1).map((run) => { const link = run.runUrl ? '[' + run.shortSha + '](' + run.runUrl + ')' : run.shortSha const top = Array.isArray(run.visibleRows) && run.visibleRows.length > 0 - ? run.visibleRows.slice(0, 3).map((row) => row.status + ' ' + row.target + ' ' + row.observation + ' ' + row.delta + ' / ' + row.ratio).join('
') + ? run.visibleRows.slice(0, 3).map((row) => (row.meaning || row.status) + ' ' + row.target + ' ' + row.observation + ' ' + row.delta + ' / ' + row.ratio).join('
') : 'No regressions' - return '| ' + [link, run.status, run.mode, top].map(escapeCell).join(' | ') + ' |' + return '| ' + [link, run.status, gateModeLabel(run.mode), top].map(escapeCell).join(' | ') + ' |' }) const runLink = runUrl ? '[workflow run](' + runUrl + ')' : 'workflow run unavailable' @@ -3580,26 +6168,102 @@ jobs: ? '[main run ' + baselineProvenance.runId + '](' + serverUrl + '/' + repo + '/actions/runs/' + baselineProvenance.runId + ')' + (Array.isArray(baselineProvenance.runs) && baselineProvenance.runs.length > 1 ? ' + ' + (baselineProvenance.runs.length - 1) + ' older baseline runs' : '') : 'not available' - const chartSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows) : '' + const sourceOfTruth = { + schemaVersion, + title, + status: statusWord, + gate: gateModeLabel(comparison.mode), + readiness: readinessLabel, + commit: { + shortSha, + sha: headSha || sha || 'unknown', + }, + run: { + id: runId || null, + attempt: runAttempt || null, + url: runUrl || null, + }, + baseline: baselineProvenance || null, + protocol: protocolLabel, + chart: { + meaning: 'semantic-impact', + zeroImpactMeaning: 'no actionable PR impact after budgets, noise floor, and robust evidence checks', + svg: chartSourceUrl || null, + lightPng: chartUrl || null, + darkPng: chartDarkUrl || null, + }, + measurements: allRows.map(sourceMeasurement), + } + const chartSvg = hasComparableBaseline && visibleRows.length > 0 ? renderPerfChangeSvg(visibleRows) : '' + const chartDarkSvg = hasComparableBaseline && visibleRows.length > 0 ? renderPerfChangeSvg(visibleRows, 'dark') : '' if (chartPath && chartSvg) writeFileSync(chartPath, chartSvg) - const chartMarkdown = chartUrl && chartSvg ? '![Perf change vs baseline chart](' + chartUrl + ')' : '' + if (chartDarkPath && chartDarkSvg) writeFileSync(chartDarkPath, chartDarkSvg) + const chartImageMarkdown = chartUrl && chartSvg + ? (chartDarkUrl + ? 
'\n' + + ' \n' + + ' \n' + + ' Measurement change vs baseline chart\n' + + '' + : '![Measurement change vs baseline chart](' + chartUrl + ')') + : '' + const chartMarkdown = chartImageMarkdown + ? chartImageMarkdown + + (chartSourceUrl ? '\n\n[SVG source](' + chartSourceUrl + ')' : '') + : '' + + const regressionCount = allRows.filter((row) => row.status === 'fail' || row.status === 'warn').length + const improvementCount = comparableRows.filter((row) => row.direction === 'improved' && !isZeroImpactRow(row)).length + const neutralCount = zeroImpactRows.length + diagnosticRows.length + const humanSummary = hasComparableBaseline + ? regressionCount > 0 + ? String(regressionCount) + ' regression' + (regressionCount === 1 ? '' : 's') + ' need review.' + : improvementCount > 0 + ? 'No regressions. ' + String(improvementCount) + ' probe' + (improvementCount === 1 ? '' : 's') + ' got faster; ' + String(neutralCount) + ' neutral or ungated row' + (neutralCount === 1 ? '' : 's') + ' are collapsed below.' + : 'No regressions. Comparable movement is below the semantic impact threshold; neutral rows are collapsed below.' + : 'No compatible baseline was available, so this run shows current measurements only.' const summaryLines = [ '## ' + title, '', - '- Status: ' + statusWord, - '- Mode: ' + (comparison.mode || 'unknown'), - '- Commit: ' + shortSha, - '- Run: ' + runLink, - '- Baseline: ' + baselineLabel, + '**' + statusWord + '** - ' + gateModeLabel(comparison.mode) + ' gate - readiness ' + readinessLabel + ' - commit ' + shortSha + ' - protocol ' + protocolLabel + '', '', - hasComparableBaseline - ? 'Chart: performance change versus baseline median. Green is faster, red is slower, gray is within noise or baseline range.' - : 'No compatible baseline was available, so this run shows current measurements only.', + '> ' + humanSummary, '', chartMarkdown, '', - hasComparableBaseline ? comparisonTable(visibleRows) : currentOnlyTable(visibleRows), + hasComparableBaseline + ? 
scanTable(visibleNonZeroImpactRows) + : currentOnlyTable(visibleRows), + ] + + if (hasComparableBaseline && zeroImpactRows.length > 0) { + summaryLines.push( + '', + '
', + 'Unchanged / 0-impact measurements (' + zeroImpactRows.length + ')', + '', + 'These rows had compatible baseline data, but their semantic impact rounded to 0.00x because the movement was below the configured budget, below the noise floor, or inside the robust noise band.', + '', + zeroImpactTable(zeroImpactRows), + '', + '
', + ) + } + + if (diagnosticRows.length > 0) { + summaryLines.push( + '', + '
', + 'Diagnostic / ungated measurements (' + diagnosticRows.length + ')', + '', + diagnosticTable(diagnosticRows), + '', + '
', + ) + } + + summaryLines.push( '', '
', 'All measurements', @@ -3607,7 +6271,7 @@ jobs: allMeasurementsTable(allRows), '', '
', - ] + ) if (historyRows.length > 0) { summaryLines.push( @@ -3615,7 +6279,7 @@ jobs: '
', 'Previous runs', '', - '| Commit | Status | Mode | Top changes |', + '| Commit | Status | Gate | Top changes |', '| --- | --- | --- | --- |', ...historyRows, '', @@ -3623,14 +6287,50 @@ jobs: ) } + summaryLines.push( + '', + '
', + 'Source-of-truth JSON', + '', + '~~~json', + JSON.stringify(sourceOfTruth, null, 2), + '~~~', + '', + '
', + ) + summaryLines.push('', marker, statePrefix + JSON.stringify(state, null, 2) + stateSuffix) writeFileSync(bodyPath, summaryLines.join('\n') + '\n') writeFileSync(commentIdPath, existing?.id ? String(existing.id) : '') EOF - node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" + node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" "$chart_dark_file" if [ -s "$chart_file" ]; then + if [ "$require_public_asset" = "true" ] && [ -z "$public_asset_command" ]; then + echo "::error::CI measurement chart was rendered for a private repository, but CI_MEASUREMENT_PR_COMMENT_PUBLIC_ASSET_COMMAND is not configured. Private raw GitHub URLs cannot be embedded in PR comments." + exit 1 + fi + + if ensure_ci_measurement_tool resvg resvg; then + resvg_font_args=() + if command -v nix >/dev/null 2>&1; then + if font_out="$(nix build --no-link --print-out-paths nixpkgs#dejavu_fonts 2>/dev/null)"; then + resvg_font_args+=(--use-fonts-dir "$font_out/share/fonts/truetype") + fi + fi + if ! resvg --background '#ffffff' "${resvg_font_args[@]}" "$chart_file" "$chart_png_file"; then + echo "::notice::unable to render CI measurement chart PNG" + rm -f "$chart_png_file" + fi + if [ -s "$chart_dark_file" ] && ! resvg --background '#0d1117' "${resvg_font_args[@]}" "$chart_dark_file" "$chart_dark_png_file"; then + echo "::notice::unable to render dark CI measurement chart PNG" + rm -f "$chart_dark_png_file" + fi + else + echo "::notice::resvg is not available; skipping embedded CI measurement chart PNG" + fi + if ! gh api "repos/$repo/git/ref/heads/$asset_branch" >/dev/null 2>&1; then default_branch_sha="$(gh api "repos/$repo/git/ref/heads/${GITHUB_BASE_REF:-main}" --jq '.object.sha' 2>/dev/null || true)" if [ -z "$default_branch_sha" ]; then @@ -3641,19 +6341,77 @@ jobs: fi fi chart_content="$(base64 <"$chart_file" | tr -d '\n')" - if ! 
gh api "repos/$repo/contents/$asset_path" --method PUT --field message="Update CI measurement chart for PR #$pr_number" --field content="$chart_content" --field branch="$asset_branch" >/dev/null; then - echo "::notice::unable to upload CI measurement chart asset" - sed -i.bak '/!\[Perf change vs baseline chart\]/d' "$comment_body" + if ! gh api "repos/$repo/contents/$asset_svg_path" --method PUT --field message="Update CI measurement chart SVG for PR #$pr_number" --field content="$chart_content" --field branch="$asset_branch" >/dev/null; then + echo "::notice::unable to upload CI measurement chart SVG asset" + if [ -z "$public_asset_command" ]; then + sed -i.bak '/\[SVG source\]/d' "$comment_body" + fi + fi + if [ -s "$chart_png_file" ]; then + chart_png_content="$(base64 <"$chart_png_file" | tr -d '\n')" + if ! gh api "repos/$repo/contents/$asset_png_path" --method PUT --field message="Update CI measurement chart PNG for PR #$pr_number" --field content="$chart_png_content" --field branch="$asset_branch" >/dev/null; then + echo "::notice::unable to upload CI measurement chart PNG asset" + if [ -z "$public_asset_command" ]; then + sed -i.bak '/!\[Measurement change vs baseline chart\]/d; /!\[Perf change vs baseline chart\]/d; //,/<\\/picture>/d' "$comment_body" + fi + fi + else + sed -i.bak '/!\[Measurement change vs baseline chart\]/d; /!\[Perf change vs baseline chart\]/d; //,/<\\/picture>/d' "$comment_body" + fi + if [ -s "$chart_dark_png_file" ]; then + chart_dark_png_content="$(base64 <"$chart_dark_png_file" | tr -d '\n')" + if ! 
gh api "repos/$repo/contents/$asset_dark_png_path" --method PUT --field message="Update dark CI measurement chart PNG for PR #$pr_number" --field content="$chart_dark_png_content" --field branch="$asset_branch" >/dev/null; then + echo "::notice::unable to upload dark CI measurement chart PNG asset" + if [ -z "$public_asset_command" ]; then + export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL="" + node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" "$chart_dark_file" + fi + fi + fi + + if [ -n "$public_asset_command" ] && [ -s "$chart_png_file" ]; then + if public_chart_url="$(bash -c "$public_asset_command" _ "$chart_png_file" png)" && [ -n "$public_chart_url" ]; then + chart_url="$public_chart_url" + export CI_MEASUREMENT_PR_COMMENT_CHART_URL="$chart_url" + else + echo "::notice::unable to publish CI measurement chart PNG to public asset host" + export CI_MEASUREMENT_PR_COMMENT_CHART_URL="" + fi + if [ -s "$chart_dark_png_file" ] && public_chart_dark_url="$(bash -c "$public_asset_command" _ "$chart_dark_png_file" png)" && [ -n "$public_chart_dark_url" ]; then + chart_dark_url="$public_chart_dark_url" + export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL="$chart_dark_url" + else + echo "::notice::unable to publish dark CI measurement chart PNG to public asset host" + export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL="" + fi + if public_chart_source_url="$(bash -c "$public_asset_command" _ "$chart_file" svg)" && [ -n "$public_chart_source_url" ]; then + chart_source_url="$public_chart_source_url" + export CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL="$chart_source_url" + else + echo "::notice::unable to publish CI measurement chart SVG to public asset host" + export CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL="" + fi + if [ "$require_public_asset" = "true" ] && [ -z "$chart_url" ]; then + echo "::error::unable to publish CI measurement chart PNG to a public asset host for private repository $repo" + exit 1 + fi + if [ 
"$require_public_asset" = "true" ] && [ -s "$chart_dark_png_file" ] && [ -z "$chart_dark_url" ]; then + echo "::error::unable to publish dark CI measurement chart PNG to a public asset host for private repository $repo" + exit 1 + fi + node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" "$chart_dark_file" fi fi comment_id="$(cat "$comment_id_file")" + comment_payload_file="$comment_body.payload.json" + node -e "const fs=require('node:fs'); fs.writeFileSync(process.argv[2], JSON.stringify({ body: fs.readFileSync(process.argv[1], 'utf8') }))" "$comment_body" "$comment_payload_file" if [ -n "$comment_id" ]; then - if ! gh api "repos/$repo/issues/comments/$comment_id" --method PATCH --field body="$(cat "$comment_body")" >/dev/null; then + if ! gh api "repos/$repo/issues/comments/$comment_id" --method PATCH --input "$comment_payload_file" >/dev/null; then echo "::notice::unable to update CI measurement PR comment" fi else - if ! gh api "repos/$repo/issues/$pr_number/comments" --method POST --field body="$(cat "$comment_body")" >/dev/null; then + if ! 
gh api "repos/$repo/issues/$pr_number/comments" --method POST --input "$comment_payload_file" >/dev/null; then echo "::notice::unable to create CI measurement PR comment" fi fi @@ -3661,20 +6419,26 @@ jobs: fi fi + if [ "$exit_code" -ne 0 ]; then exit "$exit_code" fi - - name: Upload devenv perf artifacts + - name: 'Upload CI measurements: ci-measurements-report' if: always() uses: actions/upload-artifact@v4 with: - name: devenv-perf - path: tmp/devenv-perf-ci + name: ci-measurements-report + path: | + tmp/ci-measurement-report + !tmp/ci-measurement-report/baseline/** if-no-files-found: error retention-days: 30 - timeout-minutes: 30 + concurrency: + group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-ci-measurements-report" + cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }} test-integration-notion: + if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }} runs-on: [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}'] timeout-minutes: 30 @@ -3688,6 +6452,11 @@ jobs: NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }} steps: - uses: actions/checkout@v6 + - name: Checkout CI measurement baseline ref + if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }} + uses: actions/checkout@v6 + with: + ref: ${{ inputs.measurement_baseline_ref }} - name: Install 
Nix uses: DeterminateSystems/determinate-nix-action@v3 with: @@ -3861,7 +6630,7 @@ jobs: local max="${NIX_GC_RACE_MAX_RETRIES:-10}" local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}" local attempt=1 - local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit + local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit start="$(date +%s)" @@ -3909,7 +6678,7 @@ jobs: if [ "$rc" -eq 0 ]; then echo "::notice::[ci] completed $task in $elapsed s" if [ "$attempt" -gt 1 ]; then - write_summary success "Recovered from Nix GC race after retry" + write_summary success "Recovered from transient Nix failure after retry" else write_summary success fi @@ -3925,18 +6694,22 @@ jobs: tr -d '[:space:]' || true) saw_invalid_path=false saw_cachix_signature=false + saw_fetch_signature=false [ -n "$path" ] && saw_invalid_path=true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true + printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true rm -f "$log" - if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then - echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race" - write_summary failure "No Nix GC race signature detected" + if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then + echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure" + write_summary failure "No transient Nix failure signature detected" return "$rc" fi - if [ "$saw_cachix_signature" = true ] && [ -n 
"$path" ]; then + if [ "$saw_fetch_signature" = true ]; then + echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache" + elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path" elif [ "$saw_cachix_signature" = true ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)" @@ -3951,8 +6724,8 @@ jobs: now=$(date +%s) elapsed=$((now - start)) - echo "::error::Nix GC race retry exhausted for $task ($max attempts)" - write_summary failure "Nix GC race retry exhausted" + echo "::error::Transient Nix retry exhausted for $task ($max attempts)" + write_summary failure "Transient Nix retry exhausted" return 1 } EOF @@ -4017,7 +6790,11 @@ jobs: run: | echo "If this looks like Namespace runner Nix store corruption (e.g. \"... 
is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:" echo " https://github.com/overengineeringstudio/effect-utils/issues/201" + concurrency: + group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-test-integration-notion" + cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }} deploy-storybooks: + if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }} runs-on: [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}'] timeout-minutes: 30 @@ -4034,6 +6811,11 @@ jobs: NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} steps: - uses: actions/checkout@v6 + - name: Checkout CI measurement baseline ref + if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }} + uses: actions/checkout@v6 + with: + ref: ${{ inputs.measurement_baseline_ref }} - name: Install Nix uses: DeterminateSystems/determinate-nix-action@v3 with: @@ -4216,7 +6998,7 @@ jobs: local max="${NIX_GC_RACE_MAX_RETRIES:-10}" local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}" local attempt=1 - local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit + local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit start="$(date +%s)" @@ 
-4264,7 +7046,7 @@ jobs: if [ "$rc" -eq 0 ]; then echo "::notice::[ci] completed $task in $elapsed s" if [ "$attempt" -gt 1 ]; then - write_summary success "Recovered from Nix GC race after retry" + write_summary success "Recovered from transient Nix failure after retry" else write_summary success fi @@ -4280,18 +7062,22 @@ jobs: tr -d '[:space:]' || true) saw_invalid_path=false saw_cachix_signature=false + saw_fetch_signature=false [ -n "$path" ] && saw_invalid_path=true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true + printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true rm -f "$log" - if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then - echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race" - write_summary failure "No Nix GC race signature detected" + if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then + echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure" + write_summary failure "No transient Nix failure signature detected" return "$rc" fi - if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then + if [ "$saw_fetch_signature" = true ]; then + echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache" + elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path" elif [ "$saw_cachix_signature" = true ]; then echo "::warning::Nix store validity 
race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)" @@ -4306,8 +7092,8 @@ jobs: now=$(date +%s) elapsed=$((now - start)) - echo "::error::Nix GC race retry exhausted for $task ($max attempts)" - write_summary failure "Nix GC race retry exhausted" + echo "::error::Transient Nix retry exhausted for $task ($max attempts)" + write_summary failure "Transient Nix retry exhausted" return 1 } EOF @@ -4325,7 +7111,7 @@ jobs: local max="${NIX_GC_RACE_MAX_RETRIES:-10}" local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}" local attempt=1 - local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit + local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit start="$(date +%s)" @@ -4373,7 +7159,7 @@ jobs: if [ "$rc" -eq 0 ]; then echo "::notice::[ci] completed $task in $elapsed s" if [ "$attempt" -gt 1 ]; then - write_summary success "Recovered from Nix GC race after retry" + write_summary success "Recovered from transient Nix failure after retry" else write_summary success fi @@ -4389,18 +7175,22 @@ jobs: tr -d '[:space:]' || true) saw_invalid_path=false saw_cachix_signature=false + saw_fetch_signature=false [ -n "$path" ] && saw_invalid_path=true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true + printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true rm -f "$log" - if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then - echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race" - write_summary failure "No 
Nix GC race signature detected" + if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then + echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure" + write_summary failure "No transient Nix failure signature detected" return "$rc" fi - if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then + if [ "$saw_fetch_signature" = true ]; then + echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache" + elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path" elif [ "$saw_cachix_signature" = true ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)" @@ -4415,8 +7205,8 @@ jobs: now=$(date +%s) elapsed=$((now - start)) - echo "::error::Nix GC race retry exhausted for $task ($max attempts)" - write_summary failure "Nix GC race retry exhausted" + echo "::error::Transient Nix retry exhausted for $task ($max attempts)" + write_summary failure "Transient Nix retry exhausted" return 1 } EOF @@ -4768,14 +7558,15 @@ jobs: if [ ! 
-s /tmp/storybook-preview-comment.md ]; then exit 0 fi + node -e "const fs=require('node:fs'); fs.writeFileSync(process.argv[2], JSON.stringify({ body: fs.readFileSync(process.argv[1], 'utf8') }))" /tmp/storybook-preview-comment.md /tmp/storybook-preview-comment-payload.json export NIX_CONFIG="${NIX_CONFIG:+$NIX_CONFIG$'\n'}access-tokens = github.com=${GH_TOKEN}" if [ "${{ github.event_name }}" != "pull_request" ]; then exit 0 fi if [ -n "$comment_id" ]; then - nix run nixpkgs#gh -- api "repos/$GH_REPO/issues/comments/$comment_id" --method PATCH --field body="$(cat /tmp/storybook-preview-comment.md)" >/dev/null + nix run nixpkgs#gh -- api "repos/$GH_REPO/issues/comments/$comment_id" --method PATCH --input /tmp/storybook-preview-comment-payload.json >/dev/null else - nix run nixpkgs#gh -- api "repos/$GH_REPO/issues/${{ github.event.pull_request.number }}/comments" --method POST --field body="$(cat /tmp/storybook-preview-comment.md)" >/dev/null + nix run nixpkgs#gh -- api "repos/$GH_REPO/issues/${{ github.event.pull_request.number }}/comments" --method POST --input /tmp/storybook-preview-comment-payload.json >/dev/null fi - name: Save pnpm state if: ${{ success() && steps.restore-pnpm-state.outputs.cache-hit != 'true' }} @@ -4835,6 +7626,9 @@ jobs: run: | echo "If this looks like Namespace runner Nix store corruption (e.g. \"... 
is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:" echo " https://github.com/overengineeringstudio/effect-utils/issues/201" + concurrency: + group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-deploy-storybooks" + cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }} notify-alignment: runs-on: [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}'] @@ -4848,7 +7642,7 @@ jobs: - pnpm-builder-contract - pnpm-regression - deploy-storybooks - if: (github.ref == 'refs/heads/main') && github.event_name == 'push' + if: ${{ github.ref == 'refs/heads/main' && github.event_name == 'push' && !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }} steps: - name: Dispatch alignment to coordinator env: @@ -4863,3 +7657,6 @@ jobs: --header "X-GitHub-Api-Version: 2022-11-28" \ --data "$payload" shell: bash + concurrency: + group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))
}}-notify-alignment" + cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }} diff --git a/.github/workflows/ci.yml.genie.ts b/.github/workflows/ci.yml.genie.ts index 166591fc0..a65f5c5e1 100644 --- a/.github/workflows/ci.yml.genie.ts +++ b/.github/workflows/ci.yml.genie.ts @@ -15,9 +15,19 @@ import { savePnpmStateStep, standardCIEnv, ciWorkflow, + ciMeasurementBaselineCheckoutStep, + ciMeasurementBaselineWorkflowDispatchInputs, + ciMeasurementNotBaselineBackfillPredicate, + ciMeasurementSubjectEnv, ciMeasurementsCommentPermissions, + ciMeasurementsArtifactStep, + compareCiMeasurementsStep, + defaultNixClosureMeasurementBuckets, devenvPerfJob, + downloadPreviousGitHubArtifactStep, namespaceRunner, + nixClosureMeasurementSteps, + sourceShapeMeasurementStep, validateColdPnpmDepsStep, nixDiagnosticsArtifactStep, netlifyDeployStep, @@ -30,6 +40,7 @@ import { type GitHubWorkflowArgs } from '../../packages/@overeng/genie/src/runti const baseSteps = [ checkoutStep(), + ciMeasurementBaselineCheckoutStep, installNixStep(), cachixCliBuildStep, cachixStep({ name: 'overeng-effect-utils', authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' }), @@ -140,8 +151,10 @@ const nixDiagnosticsSummaryStep = { } as const const jobTimeoutMinutes = 30 +const normalCiIf = `\${{ ${ciMeasurementNotBaselineBackfillPredicate} }}` const job = (step: { name: string; run: string }, extraSteps: readonly any[] = []) => ({ + if: normalCiIf, 'runs-on': namespaceRunner({ profile: 'namespace-profile-linux-x86-64', runId: '${{ github.run_id }}', @@ -161,6 +174,7 @@ const job = (step: { name: string; run: string }, extraSteps: readonly any[] = [ }) const multiPlatformJob = (step: { name: string; run: string }) => ({ + if: normalCiIf, strategy: { 'fail-fast': false, matrix: { @@ -186,6 +200,7 @@ const multiPlatformJob = (step: { name: string; 
run: string }) => ({ const strictNixJobBaseSteps = [ checkoutStep(), + ciMeasurementBaselineCheckoutStep, installNixStep(), cachixCliBuildStep, cachixStep({ name: 'overeng-effect-utils', authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' }), @@ -193,6 +208,7 @@ const strictNixJobBaseSteps = [ ] as const const multiPlatformStrictNixJob = (step: ReturnType) => ({ + if: normalCiIf, strategy: { 'fail-fast': false, matrix: { @@ -255,12 +271,70 @@ const jobs: Record | ReturnType + ({ + name: `Download current measurement artifact: ${artifactName}`, + uses: 'actions/download-artifact@v4', + with: { + name: artifactName, + path: outputDir, + }, + }) as const + +const ciMeasurementReportToolStep = { + name: 'Provide CI measurement report tools', + shell: 'bash', + run: [ + 'set -euo pipefail', + 'for out in $(nix build --no-link --print-out-paths nixpkgs#jq nixpkgs#nodejs nixpkgs#gh nixpkgs#resvg); do', + ' echo "$out/bin" >> "$GITHUB_PATH"', + 'done', + ].join('\n'), +} as const + +const nixClosureMeasurementTargets = [ + { + installable: '.#genie', + id: 'genie_package', + name: 'genie', + label: 'Genie package', + group: 'packages', + path: ['nix', 'closures', 'packages', 'genie'], + description: 'the packaged Genie CLI closure', + system: 'x86_64-linux', + }, + { + installable: '.#megarepo', + id: 'megarepo_package', + name: 'megarepo', + label: 'Megarepo package', + group: 'packages', + path: ['nix', 'closures', 'packages', 'megarepo'], + description: 'the packaged megarepo CLI closure', + system: 'x86_64-linux', + }, + { + installable: '.#oxlint-npm', + id: 'oxlint_npm_package', + name: 'oxlint-npm', + label: 'oxlint npm package', + group: 'packages', + path: ['nix', 'closures', 'packages', 'oxlint-npm'], + description: 'the packaged oxlint npm compatibility wrapper closure', + system: 'x86_64-linux', + }, +] as const // Non-required jobs (separate from CIJobName — not required status checks) const extraJobs: Record = { @@ -271,7 +345,44 @@ const extraJobs: Record = { 
runId: '${{ github.run_id }}', }), artifactName: 'devenv-perf', - baselineSeedRunIds: ['25710204667'], + baselineSeedRuns: [ + ...[ + ['25959801150', '655', 'df0420cd0397ffc6928d3c6ccc9c23052d6bc255'], + ['25959802067', '657', '62833cba5d83b1c13462728edeafa684e61c006f'], + ['25959802958', '656', '21029998522a0e9435df151259611650fb948a20'], + ['25959803805', '651', '95515f971b27ef279e39c982f52e46cf9e8270e9'], + ['25959804678', '654', '58e96b9a2b87b3703de6920b6d9571f3805d0171'], + ['25959805512', '653', 'd1cca16339f19d7e1a27b001edc4c2c7ecd13dc4'], + ['25959806473', '652', 'acd6c63f5e235e7e5f2710fc62b2231e0ba904a6'], + ['25959807303', '648', 'a5a07703ff951fb7396a40844e9491d88ed40edf'], + ['25959808097', '649', '360ff47c59a206064711dfcb6c610afd0e6b0d53'], + ['25959808775', '647', '8d1810b2c359ae95f245e56329018aab5020f8c0'], + ['25959809449', '646', '89e1396766ccd2a813680acd440cb78f540ca6c1'], + ['25959810069', '643', '239715520370436901a3f2218d162dc7b12f4b4c'], + ['25959810666', '641', '6b3751b4684ba45f496f1a1bff8b86ef6ba8275b'], + ['25959811321', '640', 'fed50ae2502ac0a65395bbef5af43fcf384d5d04'], + ['25959811864', '639', '0e03df2c6f20e4d154f286fd69a4e2980d21a12d'], + ['25959812634', '636', '7efdbee4b571f2c80f5b6173bc9a84b51fbef5eb'], + ['25959813189', '638', '350d1b98baa943dcae63412eeffded7b5160bc8a'], + ['25959813761', '637', 'f25336193b9f6b042eb027eca27acc4cc75a69d6'], + ['25959814335', '634', '4ba441d4ad8b6c49e9ee03d9cdfd2f04a129b714'], + ['25959814835', '632', '1ad5fd735c7f45ad5e07c8033e5b68a642ada69c'], + ].map(([runId, pr, sha]) => ({ + runId, + label: `PR #${pr}`, + sha, + source: 'manual-backfill', + artifacts: ['devenv-perf'], + notes: + 'Backfilled with the current measurement workflow for the effect-utils #658 rollout.', + })), + ], + baselineMaxRuns: 20, + // Wall-clock measurements are advisory until they have paired same-run + // base/head evidence. 
Deterministic measurements such as closure sizes + // can still use budget-style gates in consuming repos. + regressionMode: 'warn', + env: ciMeasurementSubjectEnv, setupSteps: baseSteps, taskProbes: [ { @@ -279,18 +390,47 @@ const extraJobs: Record = { label: 'pnpm install task', group: 'workspace setup', description: 'Runs the cached pnpm install devenv task.', + warmupRepetitions: 1, + repetitions: 5, }, { task: 'genie:run', label: 'Genie run task', group: 'genie', description: 'Runs the normal devenv genie:run task including its declared dependencies.', + warmupRepetitions: 1, + repetitions: 5, }, { task: 'check:quick', - label: 'Quick check task', + id: 'task_check_quick_warm', + label: 'Warm cached check:quick', group: 'quality gates', - description: 'Runs the fast local quality gate through devenv.', + path: ['quality gates', 'check:quick'], + description: + 'Runs the fast local quality gate through devenv after a warmup. This measures the cached no-op path and task/status orchestration overhead.', + dimensions: { + workload: 'cached-no-op', + taskCacheMode: 'warm', + }, + warmupRepetitions: 1, + repetitions: 5, + }, + { + task: 'check:quick', + id: 'task_check_quick_forced', + label: 'Forced check:quick', + group: 'quality gates', + path: ['quality gates', 'check:quick'], + description: + 'Runs the fast local quality gate through devenv with task-cache refresh. 
This measures the developer-facing quick-check workload rather than the cached no-op path.', + dimensions: { + workload: 'forced-task-cache', + taskCacheMode: 'refresh', + }, + extraArgs: ['--refresh-task-cache'], + warmupRepetitions: 0, + repetitions: 3, }, ], probes: [ @@ -300,7 +440,8 @@ const extraJobs: Record = { group: 'genie', description: 'Runs Genie directly in check mode to isolate generator runtime from devenv task dependency overhead.', - repetitions: 3, + warmupRepetitions: 1, + repetitions: 5, command: [ '$DEVENV_BIN', 'shell', @@ -315,8 +456,9 @@ const extraJobs: Record = { }, ], permissions: ciMeasurementsCommentPermissions, + compare: false, prComment: { - enabled: true, + enabled: false, title: 'Devenv Performance', maxRows: 8, maxHistory: 20, @@ -324,8 +466,166 @@ const extraJobs: Record = { }), 'timeout-minutes': jobTimeoutMinutes, }, + 'nix-closure-sizes': { + if: normalCiIf, + 'runs-on': namespaceRunner({ + profile: 'namespace-profile-linux-x86-64', + runId: '${{ github.run_id }}', + }), + 'timeout-minutes': jobTimeoutMinutes, + defaults: bashShellDefaults, + permissions: ciMeasurementsCommentPermissions, + env: ciMeasurementSubjectEnv, + steps: [ + ...baseSteps, + ...nixClosureMeasurementSteps({ + artifactName: 'nix-closure-measurements', + artifactDir: nixClosureMeasurementsDir, + baselineMaxRuns: 20, + targets: nixClosureMeasurementTargets, + buckets: defaultNixClosureMeasurementBuckets, + compare: false, + regressionMode: 'warn', + prComment: { + enabled: false, + title: 'Nix Closure Measurements', + maxRows: 8, + maxHistory: 20, + }, + }), + savePnpmStateStep(), + nixDiagnosticsSummaryStep, + nixDiagnosticsArtifactStep(), + failureReminderStep, + ], + }, + 'source-shape': { + 'runs-on': namespaceRunner({ + profile: 'namespace-profile-linux-x86-64', + runId: '${{ github.run_id }}', + }), + 'timeout-minutes': jobTimeoutMinutes, + defaults: bashShellDefaults, + permissions: ciMeasurementsCommentPermissions, + env: ciMeasurementSubjectEnv, + 
steps: [ + checkoutStep(), + ciMeasurementBaselineCheckoutStep, + sourceShapeMeasurementStep({ + artifactDir: `${sourceShapeMeasurementsDir}/current/effect-utils`, + targetId: 'effect_utils', + targetName: 'effect-utils', + targetLabel: 'effect-utils repository', + targetGroup: 'source', + targetPath: ['source', 'effect-utils'], + scopes: [ + { + id: 'genie_ci_workflow', + label: 'Genie CI workflow helpers', + group: 'source / ci', + path: ['source', 'effect-utils', 'genie', 'ci-workflow'], + includePaths: ['genie/ci-workflow', '.github/workflows/ci.yml.genie.ts'], + includeExtensions: ['.ts'], + }, + { + id: 'genie_runtime', + label: 'Genie runtime', + group: 'source / genie', + path: ['source', 'effect-utils', 'packages', 'genie'], + includePaths: ['packages/@overeng/genie/src'], + includeExtensions: ['.ts', '.tsx'], + }, + { + id: 'nix_workspace_tools', + label: 'Nix workspace tools', + group: 'source / nix', + path: ['source', 'effect-utils', 'nix', 'workspace-tools'], + includePaths: ['nix/workspace-tools'], + includeExtensions: ['.nix'], + }, + ], + }), + ciMeasurementsArtifactStep({ + artifactName: 'source-shape', + path: sourceShapeMeasurementsDir, + }), + ], + }, + 'ci-measurements-report': { + name: 'ci/measurements-report', + if: normalCiIf, + needs: ['devenv-perf', 'nix-closure-sizes', 'source-shape'], + 'runs-on': namespaceRunner({ + profile: 'namespace-profile-linux-x86-64', + runId: '${{ github.run_id }}', + }), + 'timeout-minutes': jobTimeoutMinutes, + defaults: bashShellDefaults, + permissions: ciMeasurementsCommentPermissions, + env: ciMeasurementSubjectEnv, + steps: [ + checkoutStep(), + installNixStep(), + ciMeasurementReportToolStep, + downloadCurrentMeasurementArtifactStep( + 'devenv-perf', + `${ciMeasurementReportDir}/current/devenv-perf`, + ), + downloadCurrentMeasurementArtifactStep( + 'nix-closure-measurements', + `${ciMeasurementReportDir}/current/nix-closure-measurements`, + ), + downloadCurrentMeasurementArtifactStep( + 'source-shape', 
+ `${ciMeasurementReportDir}/current/source-shape`, + ), + downloadPreviousGitHubArtifactStep({ + artifactName: 'devenv-perf', + outputDir: `${ciMeasurementReportDir}/baseline/devenv-perf`, + maxRuns: 20, + }), + downloadPreviousGitHubArtifactStep({ + artifactName: 'nix-closure-measurements', + outputDir: `${ciMeasurementReportDir}/baseline/nix-closure-measurements`, + maxRuns: 20, + }), + downloadPreviousGitHubArtifactStep({ + artifactName: 'source-shape', + outputDir: `${ciMeasurementReportDir}/baseline/source-shape`, + seedRuns: [ + { + runId: '26085158592', + label: 'main baseline', + sha: 'ce7cf8f8ebfaa1da6c7e9122cd195a5f95ce2fca', + source: 'manual-backfill', + artifacts: ['source-shape'], + notes: + 'Backfilled with the current measurement workflow for the effect-utils #658 rollout.', + }, + ], + maxRuns: 20, + }), + compareCiMeasurementsStep({ + currentDir: `${ciMeasurementReportDir}/current`, + baselineDir: `${ciMeasurementReportDir}/baseline`, + outputFile: `${ciMeasurementReportDir}/measurement-comparison.json`, + regressionMode: 'warn', + prComment: { + enabled: true, + title: 'CI Measurements', + maxRows: 16, + maxHistory: 20, + }, + }), + ciMeasurementsArtifactStep({ + artifactName: 'ci-measurements-report', + path: ciMeasurementReportDir, + }), + ], + }, /** Integration tests for Notion API (requires NOTION_TOKEN secret) */ 'test-integration-notion': { + if: normalCiIf, 'runs-on': namespaceRunner({ profile: 'namespace-profile-linux-x86-64', runId: '${{ github.run_id }}', @@ -352,6 +652,7 @@ const extraJobs: Record = { const deployJobs: Record = { 'deploy-storybooks': { + if: normalCiIf, 'runs-on': namespaceRunner({ profile: 'namespace-profile-linux-x86-64', runId: '${{ github.run_id }}', @@ -386,6 +687,7 @@ export default ciWorkflow({ pull_request: { branches: ['main'] }, workflow_dispatch: { inputs: { + ...ciMeasurementBaselineWorkflowDispatchInputs, debug_force_nix_diagnostics_failure: { description: 'Temporary debug switch (#272): force 
post-validation failure to verify diagnostics artifact + summary', @@ -400,13 +702,18 @@ export default ciWorkflow({ ...jobs, ...extraJobs, ...deployJobs, - 'notify-alignment': notifyAlignmentJob({ - targetRepo: 'schickling/megarepo-all', - needs: [...Object.keys(jobs), ...Object.keys(deployJobs)], - runner: [ - 'namespace-profile-linux-x86-64', - 'namespace-features:github.run-id=${{ github.run_id }}', - ], - }), + 'notify-alignment': { + ...notifyAlignmentJob({ + targetRepo: 'schickling/megarepo-all', + needs: [...Object.keys(jobs), ...Object.keys(deployJobs)], + runner: [ + 'namespace-profile-linux-x86-64', + 'namespace-features:github.run-id=${{ github.run_id }}', + ], + }), + // Keep the main-branch push guard this job had before it was wrapped: overriding `if` + // with normalCiIf alone would dispatch alignment on every pull_request and manual run. + if: `\${{ github.ref == 'refs/heads/main' && github.event_name == 'push' && ${ciMeasurementNotBaselineBackfillPredicate} }}`, + }, }, } satisfies GitHubWorkflowArgs) diff --git a/context/ci-measurement-engine.md b/context/ci-measurement-engine.md new file mode 100644 index 000000000..ff54aced4 --- /dev/null +++ b/context/ci-measurement-engine.md @@ -0,0 +1,175 @@ +# CI Measurement Engine + +This document specifies the reusable CI measurement engine. It builds on +[ci-measurements.md](./ci-measurements.md). + +## Status + +Draft - architecture target for replacing generated shell/jq comparison code +with a typed reusable implementation. + +## Scope + +This spec defines: + +- the stable measurement artifact contract; +- comparison policy semantics; +- the native engine boundary; +- external-tool integration boundaries; +- the rollout path from generated shell/jq to a packaged CLI. + +This spec does not define individual probes. Devenv, Nix closure, source-shape, +LOC, and complexity probes remain producer adapters that emit the shared +artifact format.
+ +## Architecture + +```text +producer adapters + devenv wall-clock + nix closure size + source shape + future LOC / complexity + | + v +measurements.json + | + v +ci-measure native engine + schema validation + compatibility matching + comparison policy + gate decision + report projection + | + +--> measurement-comparison.json + +--> GitHub Markdown comment + +--> SVG/PNG chart payload + +--> optional trend export +``` + +The engine owns comparison and rendering. Workflows own checkout, dependency +setup, artifact upload, and GitHub API calls. + +## Measurement Registry + +Every observation is interpreted through a registry entry: + +| Field | Purpose | +| ------------------------- | ------------------------------------------------------ | +| `id` | Stable public identity. | +| `label` | Human review label. | +| `semanticPath` | Hierarchical grouping for comments and charts. | +| `measurementKind` | `deterministic`, `wall-clock`, or `diagnostic`. | +| `unit` | Canonical unit for values and deltas. | +| `direction` | Whether larger values are better, worse, or neutral. | +| `defaultComparisonMode` | `budget`, `paired`, `historical`, or `diagnostic`. | +| `gatePolicy` | Absolute/relative budgets and sample requirements. | +| `compatibilityDimensions` | Which dimensions must match for historical comparison. | +| `displayPolicy` | Visibility, sorting, and chart inclusion behavior. | +| `rawSampleSchema` | Optional schema for per-sample evidence. | + +The registry is the public API for cross-repo reuse. Repos may add local entries, +but they must not fork comparison semantics. + +Wall-clock registry entries should include a workload dimension when the same +logical command can be measured under different cache conditions. 
For example, +`task_check_quick_warm` and `task_check_quick_forced` intentionally share the +semantic path `devenv / quality gates / check:quick`, but they are separate IDs +because one measures the warm cached no-op path while the other refreshes the +devenv task cache. This avoids false product claims such as treating a cached +orchestration improvement as a full developer quick-check improvement. + +## Comparison Semantics + +| Kind | Merge-gate mode | Evidence model | +| --------------- | --------------- | -------------------------------------------------- | +| `deterministic` | `budget` | Exact comparable value plus configured budget. | +| `wall-clock` | `paired` | Same-run base/head pairs and paired delta samples. | +| `wall-clock` | `historical` | Advisory trend context only. | +| `diagnostic` | `diagnostic` | Non-gating explanatory data. | + +Wall-clock PR gates must not depend on historical timing alone. Historical +timing is useful for drift detection, A/A calibration, and dashboards, but it +does not prove PR causality. + +Paired wall-clock gates use nonparametric evidence by default: + +```text +paired_delta_i = current_duration_i - baseline_duration_i +evidence_lower = quantile(paired_delta, pairedEvidenceQuantile) +evidence_upper = quantile(paired_delta, 1 - pairedEvidenceQuantile) +fail = evidence_lower > fail_budget +warn = evidence_lower > warn_budget +``` + +The engine may add bootstrap or permutation intervals for selected probes, but +it must keep the raw paired delta samples in the artifact so decisions remain +auditable. + +## Native CLI Boundary + +The long-term implementation should be a packaged `ci-measure` CLI. 
+ +```text +ci-measure validate --input measurements.json +ci-measure compare --current DIR --baseline DIR --output comparison.json +ci-measure render-comment --comparison comparison.json --output comment.md +ci-measure render-chart --comparison comparison.json --theme light --output chart.svg +ci-measure export-trends --comparison comparison.json --format bencher-json +``` + +Rust is the preferred implementation language for the engine because it gives: + +- typed schemas for artifact compatibility; +- deterministic rendering without ad hoc heredocs; +- fast startup in generated CI workflows; +- property tests for policy classification; +- snapshot tests for Markdown/SVG output; +- a single packaged binary for all repos. + +Shell remains appropriate for probe execution because probes invoke arbitrary +repo-local commands, Nix, devenv, and GitHub workflow primitives. + +## External Tool Boundary + +External tools may be exporters, not authorities. + +| Tool class | Allowed role | Not allowed role | +| -------------------------- | ----------------------------------------- | -------------------------------------- | +| Bencher / trend stores | Historical storage, dashboards, alerting. | Primary PR gate for paired wall-clock. | +| CodSpeed-style instruments | Language-level benchmark suites. | Devenv/Nix shell gate replacement. | +| OTEL backends | Trace explanation and runner diagnostics. | Canonical numeric regression decision. | +| GitHub artifacts/comments | Current authoritative review projection. | Long-term statistical trend database. | + +This keeps the merge contract under our control while still allowing the best +external system to own trend visualization or specialized microbenchmarking. + +The Bencher experiment in +[ci-measurement-experiments.md](./ci-measurement-experiments.md) confirms this +boundary: Bencher is useful for historical storage and scalar threshold alerts, +but it does not natively gate on same-run paired base/head evidence. 
+ +## Rollout + +1. Keep the current generated workflow behavior and comment shape stable. +2. Add schema fixtures from existing production `measurements.json` artifacts. +3. Implement `ci-measure compare` behind a workflow environment switch. +4. Run generated jq and native CLI comparisons side by side in CI. +5. Require byte-for-byte compatible `measurement-comparison.json` for existing + fixtures, except for intentional schema-version changes. +6. Move Markdown and SVG rendering into the native CLI after comparison parity. +7. Remove generated jq/Node snippets once all megarepo consumers use the CLI. + +The branch-protection surface must keep the same job names during rollout. + +## Open Questions + +- **DQ1 Bootstrap intervals:** Which probes are valuable enough to pay for + bootstrap or permutation intervals instead of quantile evidence? +- **DQ2 Trend backend:** Should historical trend export target Bencher, an + object-store-backed JSON index, Prometheus/OTEL metrics, or more than one? +- **DQ3 Registry location:** Should shared registry entries live in effect-utils + source, generated repo config, or both? +- **DQ4 Calibration lane:** Which repos should run scheduled A/A and injected + regression calibration first? diff --git a/context/ci-measurement-experiments.md b/context/ci-measurement-experiments.md new file mode 100644 index 000000000..4f89b21db --- /dev/null +++ b/context/ci-measurement-experiments.md @@ -0,0 +1,97 @@ +# CI Measurement Experiments + +This document records experiments that inform +[ci-measurement-engine.md](./ci-measurement-engine.md). + +## Bencher Fit Experiment + +Date: 2026-05-19. + +Purpose: evaluate whether Bencher should replace or complement the +GitHub-native CI measurement gate. 
+ +### Setup + +The experiment used a local self-hosted Bencher instance and synthetic metrics +that mimic our current measurement families: + +- wall-clock duration; +- deterministic Nix closure size; +- deterministic store path count; +- diagnostic counters. + +Commands exercised: + +```bash +docker run --rm ghcr.io/bencherdev/bencher --version + +bencher up --detach --pull missing \ + --console-port 33080 \ + --api-port 61018 \ + --console-env BENCHER_API_URL=http://localhost:61018 + +bencher run --host http://localhost:61018 \ + --project effect-utils-ci-measurements \ + --branch main \ + --testbed github-ubuntu-latest \ + --adapter json \ + --file measurements-base.json \ + --format json + +bencher run --host http://localhost:61018 \ + --project effect-utils-ci-measurements \ + --branch pr-658 \ + --start-point main \ + --start-point-clone-thresholds \ + --start-point-reset \ + --testbed github-ubuntu-latest \ + --error-on-alert \ + --adapter json \ + --file measurements-head.json \ + --format json +``` + +### Findings + +Bencher worked well for: + +- storing historical benchmark rows by project, branch, testbed, benchmark, + and measure; +- cloning thresholds from a main start point into a PR branch; +- failing CI through `--error-on-alert`; +- percentage thresholds for coarse performance trend alerts; +- static thresholds for simple absolute deterministic budgets; +- multi-measure reports through Bencher Metric Format JSON; +- local self-hosting through Docker. + +Bencher did not model our primary wall-clock gate: + +- same-run base/head paired samples are not first-class; +- multiple files in one report become iterations, not paired comparisons; +- alerting compares scalar metric values against thresholds; +- stored lower/upper metric fields are not treated as paired evidence + intervals for gating; +- comments and checks would be Bencher-shaped alerts, not our semantic PR + report with paired `n` and delta evidence intervals. 
+ +### Decision + +Bencher is not the authority for PR merge gates. + +Allowed use: + +- optional trend backend; +- historical dashboards; +- coarse scheduled alerts; +- export target for already-computed metrics, including paired summary metrics + and deterministic budget ratios. + +Disallowed use: + +- replacing the GitHub-native PR comment; +- replacing paired wall-clock gate decisions; +- replacing deterministic budget evaluation when budgets are metric-specific. + +The native `ci-measure` engine should own gate semantics. A future Bencher +exporter can publish selected observations after `ci-measure compare` has +produced the authoritative decision. diff --git a/context/ci-measurements.md b/context/ci-measurements.md new file mode 100644 index 000000000..b86c98d7b --- /dev/null +++ b/context/ci-measurements.md @@ -0,0 +1,282 @@ +# CI Measurements + +This document specifies the shared CI measurement architecture used by generated workflows. + +## Status + +Active. + +## Measurement Classes + +| Class | Examples | Primary Question | Gate Model | +| --------------- | ------------------------------------------------------ | --------------------------------------------------------------------- | -------------------------------------------------------- | +| `deterministic` | Nix closure size, source lines, file counts | Did a structural quantity exceed its budget? | Budget/diff against a comparable baseline. | +| `wall-clock` | Devenv shell eval, task runtime, CLI command latency | Did this PR make this operation slower on the same runner conditions? | Paired same-run base/head samples before merge blocking. | +| `diagnostic` | OTEL-traced shell eval, host context, trace breakdowns | Where did time go? | Never merge-blocking; explains measurements. | + +The class is part of the observation contract through `measurementKind`. +The comparison policy is part of the gate contract through `comparisonMode`. + +`measurementKind` defines the physical meaning of the number. 
`comparisonMode` +defines how the number is compared. A producer may only combine them when the +semantics match: + +| `measurementKind` | Gateable `comparisonMode` | Baseline Meaning | Uncertainty Model | +| ----------------- | ------------------------- | --------------------------------------------- | ----------------------------------------- | +| `deterministic` | `budget` | Same target on a comparable ref | None by default; exact value plus budget. | +| `wall-clock` | `paired` | Same PR run, same runner, base/head pairs | Per-pair delta evidence interval. | +| `wall-clock` | `historical` | Previous comparable successful artifacts | Advisory robust bands only. | +| `diagnostic` | none | Optional context artifact or trace attachment | Not gateable. | + +Historical comparison is not a substitute for paired wall-clock evidence. +Budget comparison is not a substitute for owner-approved semantic budgets. + +## Observation Contract + +Every observation has a stable `id`, human `label`, semantic `group`/`path`, +numeric `value`, `unit`, `measurementKind`, and a gate `policy`. + +```json +{ + "id": "devenv.shell_eval_warm.duration", + "label": "Warm shell eval", + "measurementKind": "wall-clock", + "unit": "seconds", + "value": 6.067, + "policy": { + "enabled": true, + "comparisonMode": "paired", + "minPairedSamples": 5, + "minCurrentSamples": 5, + "pairedEvidenceQuantile": 0.25 + } +} +``` + +Observation IDs are public API. They should be stable, dotted names whose +prefix names the domain and whose suffix names the measured quantity, for +example `devenv.shell_eval_warm.duration`, `nix.closure.nar_size`, or +`source.lines`. Labels are review UI, not identity. Paths and groups may change +to improve hierarchy, but IDs should only change when the measurement protocol +or semantic target changes. 
+ +New measurement producers should emit the shared artifact format directly: + +```text +producer adapter + -> typed observation(s) + -> shared comparison policy + -> shared report/comment/SVG projection +``` + +This keeps probe-specific collection code separate from the reusable regression +system. A new probe should not fork comparison, markdown rendering, or asset +publication logic. + +The reusable engine boundary is specified in +[ci-measurement-engine.md](./ci-measurement-engine.md). The long-term direction +is to keep this artifact and comment contract as the source of truth while +moving comparison and rendering out of generated shell/jq snippets into a typed +native CLI. + +## Gate Semantics + +Deterministic observations use `comparisonMode: "budget"`. +They require a comparable baseline and then evaluate configured absolute and +relative budgets. Historical variance is context only; it does not neutralize +a budget-exceeding deterministic movement. This keeps Nix closure sizes, +source-shape counts, lines of code, complexity scores, and similar structural +measurements separate from wall-clock noise handling. + +Wall-clock observations use `comparisonMode: "paired"` for enforced gates. +They need same-run base/head evidence before they can block a merge. Historical +baselines remain useful for trend context, but they do not prove PR causality. +For PR runs, the wall-clock producer checks out the PR base commit in a sibling +worktree and alternates measured pair order (`head -> base`, then +`base -> head`) from a recorded seed to reduce cache and time drift bias +without making order a hidden variable. The current artifact stores the paired +baseline median and paired sample count, and the comparison engine uses that +embedded paired baseline for the gate. + +The gate evaluates per-pair deltas, not only the difference between medians. New +artifacts carry the raw paired delta samples in the observation statistics. 
The +comparison engine derives a nonparametric evidence interval from those samples +using `pairedEvidenceQuantile` (default `0.25`, so the displayed interval is the +25th-75th percentile by default). A paired wall-clock row blocks only when the +lower evidence quantile clears the configured failure budget. If the point +estimate moved but the paired delta evidence still crosses the budget, the row +renders as `paired_uncertain` and does not block. Older artifacts that only have +summary statistics use a conservative robust-band fallback and are labeled with +that evidence protocol. This follows the same principle used by continuous +benchmark tools: a point estimate without uncertainty is not enough evidence +for a regression. + +Paired wall-clock gates do not require a historical baseline source count. The +same-run paired baseline is the comparable evidence. Historical runs may still +appear in the report as trend context, but they do not decide whether paired PR +evidence is gateable. + +Historical wall-clock comparison may be used as an advisory transition mode. +It can warn, visualize trends, and guide investigation, but it must not be the +required merge gate for noisy runner-dependent timings. Robust baseline/current +bands may suppress historical wall-clock noise; they are not applied as a +semantic escape hatch for deterministic budget rows. + +Diagnostic observations set `enabled: false` or `measurementKind: "diagnostic"`. +They appear in reports, but their impact is rendered as `diagnostic` and they +are excluded from actionable impact charts. + +## Data Flow + +```text +probe execution + -> measurements.json artifact + -> comparison engine + -> PR summary/comment + SVG asset + -> optional branch-protection gate +``` + +The artifact is the source of truth. OTEL traces and host context are evidence +attachments, not the canonical numeric store. PR comments are projections of +the artifact and can be regenerated. 
New measurement families should add +producer adapters that emit this artifact contract; comparison, policy +evaluation, charting, and comment rendering stay shared. + +## Wall-Clock Soundness + +Wall-clock timings on CI runners are noisy, often non-normal, and affected by +load, caches, CPU frequency, storage, network fetches, and process scheduling. +For merge-blocking use, same-run paired measurement is required: + +```text +base warmup +head warmup +sample pair 1: seeded order chooses base/head or head/base +sample pair 2: opposite order +... +``` + +The comparison operates on per-pair deltas. A wall-clock row becomes gateable +only when the configured minimum paired sample count is present. Until then, +the row is partial/advisory even if the historical raw delta is large. + +Wall-clock probe IDs must name the workload they actually measure. Repeated +warm probes are useful for shell and task-orchestration overhead, but they are +not a proxy for an uncached developer workflow. For example: + +| Probe | Workload | Interprets As | +| ------------------------- | ------------------------ | -------------------------------------------------- | +| `task_check_quick_warm` | Warm cached no-op path | Devenv task/status orchestration overhead. | +| `task_check_quick_forced` | `--refresh-task-cache` | Developer-facing quick-check work with cache miss. | +| `shell_eval_warm` | Warm shell entry | Shell evaluation and setup overhead. | +| `shell_eval_traced` | Trace capture diagnostic | Explanation input, not a gate. | + +The label and `dimensions.workload` must make this distinction visible in the +PR comment so reviewers do not read a cached-path movement as an end-to-end +developer speedup. 
+ +For PR gates, the preferred evidence protocol is `paired-delta-quantile-v1`: + +```text +paired deltas = current_duration(pair_i) - baseline_duration(pair_i) +evidence lower = quantile(paired deltas, pairedEvidenceQuantile) +evidence upper = quantile(paired deltas, 1 - pairedEvidenceQuantile) +gate fail = evidence lower > semantic fail budget +gate warn = evidence lower > semantic warn budget +``` + +This is intentionally nonparametric because CI timings are often skewed, +heavy-tailed, and not normally distributed. A future scheduled calibration lane +can increase sample counts or move to bootstrap intervals for selected +high-value probes, but the PR gate should remain understandable from the raw +pair deltas in the artifact. + +## Deterministic Measurements + +Nix closure size, source shape, code complexity, lines of code, and file counts +are deterministic or near-deterministic structural measurements. They are not +wall-clock performance probes and must not use paired timing statistics or +historical timing-style robust-band suppression. They should use explicit +budgets and semantic buckets. A closure-size regression is actionable because +the same installable and lock graph should produce a stable closure. +Source-shape or complexity growth is an architecture signal and should remain +advisory unless a repo defines an explicit owner-approved budget. + +Deterministic budgets should prefer absolute units when the user impact is +absolute, such as bytes or path counts, and relative thresholds when scale is +the meaningful signal. A deterministic row may show historical values for +review context, but the pass/fail decision is the budget decision. 
+ +## Policy Lifecycle + +Each observation should move through explicit policy stages: + +| Stage | Use Case | Merge Behavior | +| ------------ | --------------------------------------------- | --------------------------------------------------- | +| `diagnostic` | New metric, trace attachment, host context | Render only. | +| `advisory` | Historical trend before calibration is mature | Comment and warn, but do not block merge. | +| `gateable` | Calibrated wall-clock or deterministic budget | Block only when the measurement class proves it. | +| `required` | Stable semantic invariant | Repo branch protection may depend on the gate name. | + +Wall-clock probes should start advisory until paired evidence and a noise +profile exist for that repo/runner. Deterministic probes can become gateable +earlier when their target identity and budget are explicit. + +## Baseline Model + +Baselines are comparable evidence, not arbitrary previous numbers. + +| Measurement Class | Baseline Source | Backfill Rule | +| ----------------- | ---------------------------------------------------- | ----------------------------------------------------- | +| `deterministic` | Current main artifacts or manually seeded exact runs | Backfill past merged PRs when introducing the metric. | +| `wall-clock` | Same-run paired base checkout for PR gates | Historical backfill is trend context only. | +| `diagnostic` | Trace or host artifact for the same run | No baseline required. | + +Manual baseline seeds must record the source run, ref, SHA, and reason. Seeded +data is acceptable when it was produced by the same probe protocol and target +identity; it is not acceptable to copy a chart value into the baseline store. + +## State-of-the-Art Alignment + +The design follows current continuous benchmarking practice: + +- Wall-clock gates need repeated measurements, warmup, and uncertainty, not + single raw timing deltas. +- Paired base/head runs reduce runner-load, cache, and time-drift bias. 
+- Outliers and wide variance reduce confidence instead of being silently + averaged away. +- Diagnostic traces explain regressions; they do not define the canonical + numeric result. +- Human review should show raw values, nominal deltas, percent deltas, and an + actionable impact scale so large noisy movements are not mistaken for proven + PR regressions. + +## Visualization + +Reports must distinguish raw movement from actionable evidence. + +- Raw delta and percentage are always shown. +- Actionable impact is only shown for gateable rows. +- Diagnostic rows render as `diagnostic`, not `0.00x`. +- Non-gateable paired wall-clock rows render as needing paired evidence. +- Noisy paired wall-clock rows render as uncertain, with neutral actionable + impact, even when the raw percentage delta is large. + +This prevents a large historical wall-clock delta from looking like a proven +PR regression when the measurement lacks causal evidence. + +## External Tools + +External benchmarking tools may complement this system, but they do not replace +the merge-gate contract. + +- Bencher-like systems may store historical trends, apply threshold models, and + provide dashboards. +- CodSpeed-like instrumentation may be useful for language-level benchmark + suites whose execution model matches the tool. +- OTEL backends remain diagnostic evidence for explaining where time went. +- GitHub comments remain the human review surface for PR decisions. + +For wall-clock PR gates, the authoritative evidence is still same-run paired +base/head samples emitted in `measurements.json`. For deterministic quantities, +the authoritative evidence is the comparable value and its configured budget. 
diff --git a/genie/ci-scripts/ci-measurement-comparison.test.sh b/genie/ci-scripts/ci-measurement-comparison.test.sh new file mode 100755 index 000000000..80260faa5 --- /dev/null +++ b/genie/ci-scripts/ci-measurement-comparison.test.sh @@ -0,0 +1,309 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +cd "$ROOT" + +tmp_dir="$(mktemp -d)" +trap 'rm -rf "$tmp_dir"' EXIT + +run_bun() { + if command -v bun >/dev/null 2>&1; then + bun "$@" + elif [ -n "${DEVENV_BIN:-}" ]; then + "$DEVENV_BIN" shell --no-reload -- bun "$@" + else + echo "bun is not available and DEVENV_BIN is not set" >&2 + return 127 + fi +} + +emit_compare_script() { + run_bun -e "import { compareCiMeasurementsStep } from './genie/ci-workflow/measurements.ts'; process.stdout.write(compareCiMeasurementsStep({ currentDir: '$tmp_dir/current', baselineDir: '$tmp_dir/baseline', outputFile: '$tmp_dir/comparison.json', regressionMode: 'warn' }).run)" >"$tmp_dir/compare.sh" +} + +write_measurement() { + local file="$1" + local value="$2" + local protocol="$3" + local policy="$4" + mkdir -p "$(dirname "$file")" + jq -n \ + --argjson value "$value" \ + --arg protocol "$protocol" \ + --argjson policy "$policy" \ + '{ + schemaVersion: 1, + generatedAt: "2026-05-14T00:00:00Z", + producer: { name: "test", version: (if $protocol == "legacy" then 1 else 2 end) }, + target: { kind: "devenv", id: "dev-shell", name: "dev-shell", label: "Dev shell", group: "devenv", system: "Linux" }, + observations: [ + { + id: "devenv.task.duration", + label: "Task", + group: "test", + name: "devenv.task.duration", + unit: "seconds", + value: $value, + policy: $policy, + statistics: { sampleCount: 6, warmupCount: 1, measuredSampleCount: 5, successfulSampleCount: 5, min: $value, max: $value, median: $value }, + dimensions: ( + { probe: "task", probeLabel: "Task", status: 0, sampleCount: 6, warmupCount: 1, measuredSampleCount: 5 } + + if $protocol == "legacy" then {} else { 
measurementProtocol: $protocol, aggregation: "median", phase: "warm" } end + ) + } + ] + }' >"$file" +} + +run_compare() { + CI_MEASUREMENT_CURRENT_DIR="$tmp_dir/current" \ + CI_MEASUREMENT_BASELINE_DIR="$tmp_dir/baseline" \ + CI_MEASUREMENT_COMPARISON_FILE="$tmp_dir/comparison.json" \ + CI_MEASUREMENT_REGRESSION_MODE=warn \ + CI_MEASUREMENT_PR_COMMENT_ENABLED=false \ + bash "$tmp_dir/compare.sh" +} + +policy='{"enabled":true,"minBaselineSources":1,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}' +paired_policy='{"enabled":true,"comparisonMode":"paired","minBaselineSources":1,"minCurrentSamples":5,"minPairedSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}' +strict_paired_policy='{"enabled":true,"comparisonMode":"paired","minBaselineSources":20,"minCurrentSamples":5,"minPairedSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}' +emit_compare_script + +rm -rf "$tmp_dir/current" "$tmp_dir/baseline" +write_measurement "$tmp_dir/current/measurements.json" 12 legacy "$policy" +write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 legacy "$policy" +write_measurement "$tmp_dir/baseline/run-1/baseline/run-old/measurements.json" 1 legacy "$policy" +run_compare +actual_sources="$(jq -r '.comparisons[] | .baselineSources' "$tmp_dir/comparison.json")" +actual_baseline="$(jq -r '.comparisons[] | .baseline' "$tmp_dir/comparison.json")" +if [ "$actual_sources" != "1" ] || [ "$actual_baseline" != "10" ]; then + echo "expected clean top-level baseline only; got sources=$actual_sources baseline=$actual_baseline" >&2 + exit 1 +fi + +rm -rf "$tmp_dir/current" "$tmp_dir/baseline" +write_measurement "$tmp_dir/current/measurements.json" 12 devenv-perf-warm-median-v2 "$policy" +write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 legacy "$policy" +run_compare +actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")" 
+actual_gate="$(jq -r '.comparisons[] | .gateReason' "$tmp_dir/comparison.json")" +actual_enforceable="$(jq -r '.readiness.enforceable' "$tmp_dir/comparison.json")" +if [ "$actual_status" != "partial" ] || [ "$actual_gate" != "missing_baseline" ] || [ "$actual_enforceable" != "false" ]; then + echo "expected protocol mismatch to be missing_baseline and unenforceable; got status=$actual_status gate=$actual_gate enforceable=$actual_enforceable" >&2 + exit 1 +fi + +rm -rf "$tmp_dir/current" "$tmp_dir/baseline" +write_measurement "$tmp_dir/current/measurements.json" 13 devenv-perf-warm-median-v2 "$policy" +write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 "$policy" +run_compare +actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")" +actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")" +actual_enforceable="$(jq -r '.readiness.enforceable' "$tmp_dir/comparison.json")" +actual_impact="$(jq -r '.comparisons[] | .semanticImpactScore' "$tmp_dir/comparison.json")" +actual_impact_kind="$(jq -r '.comparisons[] | .semanticImpactKind' "$tmp_dir/comparison.json")" +if [ "$actual_status" != "fail" ] || [ "$actual_row" != "fail" ] || [ "$actual_enforceable" != "true" ] || [ "$actual_impact_kind" != "fail_boundary" ] || ! 
awk "BEGIN { exit !($actual_impact > 1) }"; then + echo "expected confirmed regression to fail and have fail-boundary impact; got status=$actual_status row=$actual_row enforceable=$actual_enforceable impact=$actual_impact kind=$actual_impact_kind" >&2 + exit 1 +fi + +rm -rf "$tmp_dir/current" "$tmp_dir/baseline" +write_measurement "$tmp_dir/current/measurements.json" 13 devenv-perf-warm-median-v2 "$paired_policy" +write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 "$paired_policy" +run_compare +actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")" +actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")" +actual_gate="$(jq -r '.comparisons[] | .gateReason' "$tmp_dir/comparison.json")" +actual_confidence="$(jq -r '.comparisons[] | .confidence' "$tmp_dir/comparison.json")" +actual_enforceable="$(jq -r '.readiness.enforceable' "$tmp_dir/comparison.json")" +actual_low_paired="$(jq -r '.readiness.lowPairedSampleCount' "$tmp_dir/comparison.json")" +if [ "$actual_status" != "partial" ] || [ "$actual_row" != "pass" ] || [ "$actual_gate" != "low_paired_sample_count" ] || [ "$actual_confidence" != "low_paired_sample_count" ] || [ "$actual_enforceable" != "false" ] || [ "$actual_low_paired" != "1" ]; then + echo "expected paired wall-clock policy without paired evidence to be partial/non-enforceable; got status=$actual_status row=$actual_row gate=$actual_gate confidence=$actual_confidence enforceable=$actual_enforceable lowPaired=$actual_low_paired" >&2 + exit 1 +fi + +rm -rf "$tmp_dir/current" "$tmp_dir/baseline" +write_measurement "$tmp_dir/current/measurements.json" 13 devenv-perf-warm-median-v2 "$paired_policy" +jq '.observations[0].comparison = { mode: "paired", baseline: 12.95, pairedSampleCount: 5 } + | .observations[0].statistics.pairedSampleCount = 5 + | .observations[0].statistics.pairedDeltaMedian = 0.05 + | .observations[0].statistics.pairedDeltaP25 = 0.04 + | 
.observations[0].statistics.pairedDeltaP75 = 0.06 + | .observations[0].statistics.pairedDeltaMad = 0.01' \ + "$tmp_dir/current/measurements.json" >"$tmp_dir/current/measurements.updated.json" +mv "$tmp_dir/current/measurements.updated.json" "$tmp_dir/current/measurements.json" +write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 "$paired_policy" +run_compare +actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")" +actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")" +actual_gate="$(jq -r '.comparisons[] | .gateReason' "$tmp_dir/comparison.json")" +actual_baseline="$(jq -r '.comparisons[] | .baseline' "$tmp_dir/comparison.json")" +actual_enforceable="$(jq -r '.readiness.enforceable' "$tmp_dir/comparison.json")" +if [ "$actual_status" != "pass" ] || [ "$actual_row" != "pass" ] || [ "$actual_gate" != "eligible" ] || [ "$actual_baseline" != "12.95" ] || [ "$actual_enforceable" != "true" ]; then + echo "expected paired current artifact baseline to override historical baseline; got status=$actual_status row=$actual_row gate=$actual_gate baseline=$actual_baseline enforceable=$actual_enforceable" >&2 + exit 1 +fi + +rm -rf "$tmp_dir/current" "$tmp_dir/baseline" +write_measurement "$tmp_dir/current/measurements.json" 13 devenv-perf-warm-median-v2 "$strict_paired_policy" +jq '.observations[0].comparison = { mode: "paired", baseline: 12.95, pairedSampleCount: 5 } + | .observations[0].statistics.pairedSampleCount = 5 + | .observations[0].statistics.pairedDeltaMedian = 0.05 + | .observations[0].statistics.pairedDeltaP25 = 0.04 + | .observations[0].statistics.pairedDeltaP75 = 0.06 + | .observations[0].statistics.pairedDeltaMad = 0.01' \ + "$tmp_dir/current/measurements.json" >"$tmp_dir/current/measurements.updated.json" +mv "$tmp_dir/current/measurements.updated.json" "$tmp_dir/current/measurements.json" +write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 
"$strict_paired_policy" +run_compare +actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")" +actual_gate="$(jq -r '.comparisons[] | .gateReason' "$tmp_dir/comparison.json")" +actual_confidence="$(jq -r '.comparisons[] | .confidence' "$tmp_dir/comparison.json")" +actual_enforceable="$(jq -r '.readiness.enforceable' "$tmp_dir/comparison.json")" +if [ "$actual_status" != "pass" ] || [ "$actual_gate" != "eligible" ] || [ "$actual_confidence" != "noise_floor" ] || [ "$actual_enforceable" != "true" ]; then + echo "expected paired wall-clock gate to ignore historical minBaselineSources when paired evidence is present; got status=$actual_status gate=$actual_gate confidence=$actual_confidence enforceable=$actual_enforceable" >&2 + exit 1 +fi + +rm -rf "$tmp_dir/current" "$tmp_dir/baseline" +write_measurement "$tmp_dir/current/measurements.json" 13 devenv-perf-warm-median-v2 "$paired_policy" +jq '.observations[0].comparison = { mode: "paired", baseline: 10, pairedSampleCount: 5 } + | .observations[0].statistics.pairedSampleCount = 5 + | .observations[0].statistics.pairedDeltaMedian = 1.2 + | .observations[0].statistics.pairedDeltaP25 = -1 + | .observations[0].statistics.pairedDeltaP75 = 3 + | .observations[0].statistics.pairedDeltaMad = 1' \ + "$tmp_dir/current/measurements.json" >"$tmp_dir/current/measurements.updated.json" +mv "$tmp_dir/current/measurements.updated.json" "$tmp_dir/current/measurements.json" +write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 "$paired_policy" +run_compare +actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")" +actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")" +actual_confidence="$(jq -r '.comparisons[] | .confidence' "$tmp_dir/comparison.json")" +actual_impact="$(jq -r '.comparisons[] | .semanticImpactScore' "$tmp_dir/comparison.json")" +actual_lower="$(jq -r '.comparisons[] | .evidenceDeltaLower' "$tmp_dir/comparison.json")" +if [ "$actual_status" != "pass" 
] || [ "$actual_row" != "pass" ] || [ "$actual_confidence" != "paired_uncertain" ] || [ "$actual_impact" != "0" ] || ! awk "BEGIN { exit !($actual_lower < 0) }"; then + echo "expected noisy paired delta to stay pass/uncertain; got status=$actual_status row=$actual_row confidence=$actual_confidence impact=$actual_impact lower=$actual_lower" >&2 + exit 1 +fi + +rm -rf "$tmp_dir/current" "$tmp_dir/baseline" +write_measurement "$tmp_dir/current/measurements.json" 12.1 devenv-perf-warm-median-v2 "$paired_policy" +jq '.observations[0].comparison = { mode: "paired", baseline: 10, pairedSampleCount: 5 } + | .observations[0].statistics.pairedSampleCount = 5 + | .observations[0].statistics.pairedDeltaMedian = 2.1 + | .observations[0].statistics.pairedDeltaP25 = 2.05 + | .observations[0].statistics.pairedDeltaP75 = 2.2 + | .observations[0].statistics.pairedDeltaMad = 0.1 + | .observations[0].statistics.pairedDeltaSamples = [0.2, 2.05, 2.1, 2.2, 2.3]' \ + "$tmp_dir/current/measurements.json" >"$tmp_dir/current/measurements.updated.json" +mv "$tmp_dir/current/measurements.updated.json" "$tmp_dir/current/measurements.json" +write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 "$paired_policy" +run_compare +actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")" +actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")" +actual_protocol="$(jq -r '.comparisons[] | .pairedEvidenceProtocol' "$tmp_dir/comparison.json")" +actual_lower="$(jq -r '.comparisons[] | .evidenceDeltaLower' "$tmp_dir/comparison.json")" +if [ "$actual_status" != "fail" ] || [ "$actual_row" != "fail" ] || [ "$actual_protocol" != "paired-delta-quantile-v1" ] || ! 
awk "BEGIN { exit !($actual_lower > 2) }"; then + echo "expected raw paired delta quantile evidence to fail only when the lower evidence quantile exceeds budget; got status=$actual_status row=$actual_row protocol=$actual_protocol lower=$actual_lower" >&2 + exit 1 +fi + +rm -rf "$tmp_dir/current" "$tmp_dir/baseline" +write_measurement "$tmp_dir/current/measurements.json" 13 devenv-perf-warm-median-v2 "$paired_policy" +jq '.observations[0].comparison = { mode: "paired", baseline: 10, pairedSampleCount: 5 } + | .observations[0].statistics.pairedSampleCount = 5 + | .observations[0].statistics.pairedDeltaMedian = 3.2 + | .observations[0].statistics.pairedDeltaP25 = 3.15 + | .observations[0].statistics.pairedDeltaP75 = 3.25 + | .observations[0].statistics.pairedDeltaMad = 0.03' \ + "$tmp_dir/current/measurements.json" >"$tmp_dir/current/measurements.updated.json" +mv "$tmp_dir/current/measurements.updated.json" "$tmp_dir/current/measurements.json" +write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 "$paired_policy" +run_compare +actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")" +actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")" +actual_confidence="$(jq -r '.comparisons[] | .confidence' "$tmp_dir/comparison.json")" +actual_impact="$(jq -r '.comparisons[] | .semanticImpactScore' "$tmp_dir/comparison.json")" +actual_lower="$(jq -r '.comparisons[] | .evidenceDeltaLower' "$tmp_dir/comparison.json")" +if [ "$actual_status" != "fail" ] || [ "$actual_row" != "fail" ] || [ "$actual_confidence" != "threshold_exceeded" ] || ! awk "BEGIN { exit !($actual_impact > 1) }" || ! 
awk "BEGIN { exit !($actual_lower > 2) }"; then + echo "expected stable paired delta over fail budget to fail; got status=$actual_status row=$actual_row confidence=$actual_confidence impact=$actual_impact lower=$actual_lower" >&2 + exit 1 +fi + +rm -rf "$tmp_dir/current" "$tmp_dir/baseline" +write_measurement "$tmp_dir/current/run-1/measurements.json" 5.1 devenv-perf-warm-median-v2 "$policy" +write_measurement "$tmp_dir/current/run-2/measurements.json" 5.2 devenv-perf-warm-median-v2 "$policy" +write_measurement "$tmp_dir/current/run-3/measurements.json" 7.0 devenv-perf-warm-median-v2 "$policy" +write_measurement "$tmp_dir/current/run-4/measurements.json" 7.2 devenv-perf-warm-median-v2 "$policy" +write_measurement "$tmp_dir/current/run-5/measurements.json" 7.4 devenv-perf-warm-median-v2 "$policy" +write_measurement "$tmp_dir/baseline/run-1/measurements.json" 4.0 devenv-perf-warm-median-v2 "$policy" +write_measurement "$tmp_dir/baseline/run-2/measurements.json" 4.2 devenv-perf-warm-median-v2 "$policy" +write_measurement "$tmp_dir/baseline/run-3/measurements.json" 4.4 devenv-perf-warm-median-v2 "$policy" +run_compare +actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")" +actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")" +actual_confidence="$(jq -r '.comparisons[] | .confidence' "$tmp_dir/comparison.json")" +actual_current_lower="$(jq -r '.comparisons[] | .currentRobustLower' "$tmp_dir/comparison.json")" +actual_baseline_upper="$(jq -r '.comparisons[] | .baselineRobustUpper' "$tmp_dir/comparison.json")" +actual_impact="$(jq -r '.comparisons[] | .semanticImpactScore' "$tmp_dir/comparison.json")" +actual_impact_kind="$(jq -r '.comparisons[] | .semanticImpactKind' "$tmp_dir/comparison.json")" +if [ "$actual_status" != "pass" ] || [ "$actual_row" != "pass" ] || [ "$actual_confidence" != "within_robust_band" ] || [ "$actual_impact" != "0" ] || [ "$actual_impact_kind" != "neutral" ] || ! 
awk "BEGIN { exit !($actual_current_lower <= $actual_baseline_upper) }"; then + echo "expected overlapping current/baseline robust bands to pass with neutral impact; got status=$actual_status row=$actual_row confidence=$actual_confidence impact=$actual_impact kind=$actual_impact_kind currentLower=$actual_current_lower baselineUpper=$actual_baseline_upper" >&2 + exit 1 +fi + +deterministic_policy='{"enabled":true,"comparisonMode":"budget","minBaselineSources":1,"minCurrentSamples":1,"warnRatio":1.01,"failRatio":1.02,"warnAbs":10,"failAbs":20,"noiseFloor":0,"statisticalToleranceAbs":1000,"statisticalToleranceRatio":0}' +rm -rf "$tmp_dir/current" "$tmp_dir/baseline" +write_measurement "$tmp_dir/current/measurements.json" 130 devenv-perf-warm-median-v2 "$deterministic_policy" +jq '.observations[0].id = "nix.closure.nar_size" + | .observations[0].name = "nix.closure.nar_size" + | .observations[0].label = "Nix closure size" + | .observations[0].unit = "bytes" + | .observations[0].measurementKind = "deterministic" + | .observations[0].statistics.sampleCount = 1 + | .observations[0].statistics.measuredSampleCount = 1 + | .observations[0].statistics.successfulSampleCount = 1' \ + "$tmp_dir/current/measurements.json" >"$tmp_dir/current/measurements.updated.json" +mv "$tmp_dir/current/measurements.updated.json" "$tmp_dir/current/measurements.json" +write_measurement "$tmp_dir/baseline/run-1/measurements.json" 100 devenv-perf-warm-median-v2 "$deterministic_policy" +jq '.observations[0].id = "nix.closure.nar_size" + | .observations[0].name = "nix.closure.nar_size" + | .observations[0].label = "Nix closure size" + | .observations[0].unit = "bytes" + | .observations[0].measurementKind = "deterministic" + | .observations[0].statistics.sampleCount = 1 + | .observations[0].statistics.measuredSampleCount = 1 + | .observations[0].statistics.successfulSampleCount = 1' \ + "$tmp_dir/baseline/run-1/measurements.json" >"$tmp_dir/baseline/run-1/measurements.updated.json" +mv 
"$tmp_dir/baseline/run-1/measurements.updated.json" "$tmp_dir/baseline/run-1/measurements.json" +run_compare +actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")" +actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")" +actual_confidence="$(jq -r '.comparisons[] | .confidence' "$tmp_dir/comparison.json")" +actual_impact="$(jq -r '.comparisons[] | .semanticImpactScore' "$tmp_dir/comparison.json")" +actual_impact_kind="$(jq -r '.comparisons[] | .semanticImpactKind' "$tmp_dir/comparison.json")" +actual_within_band="$(jq -r '.comparisons[] | .withinBaselineRange' "$tmp_dir/comparison.json")" +if [ "$actual_status" != "fail" ] || [ "$actual_row" != "fail" ] || [ "$actual_confidence" != "threshold_exceeded" ] || [ "$actual_impact_kind" != "fail_boundary" ] || ! awk "BEGIN { exit !($actual_impact > 1) }"; then + echo "expected deterministic budget regression to fail even when robust tolerance is wide; got status=$actual_status row=$actual_row confidence=$actual_confidence impact=$actual_impact kind=$actual_impact_kind withinBaselineRange=$actual_within_band" >&2 + exit 1 +fi + +low_baseline_policy='{"enabled":true,"minBaselineSources":2,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}' +rm -rf "$tmp_dir/current" "$tmp_dir/baseline" +write_measurement "$tmp_dir/current/measurements.json" 10.5 devenv-perf-warm-median-v2 "$low_baseline_policy" +write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 "$low_baseline_policy" +run_compare +actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")" +actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")" +actual_gate="$(jq -r '.comparisons[] | .gateReason' "$tmp_dir/comparison.json")" +actual_enforceable="$(jq -r '.readiness.enforceable' "$tmp_dir/comparison.json")" +actual_gateable_count="$(jq -r '.readiness.gateableCount' "$tmp_dir/comparison.json")" +actual_enabled_count="$(jq -r 
'.readiness.enabledCount' "$tmp_dir/comparison.json")" +if [ "$actual_status" != "partial" ] || [ "$actual_row" != "pass" ] || [ "$actual_gate" != "low_baseline_count" ] || [ "$actual_enforceable" != "false" ] || [ "$actual_gateable_count" != "0" ] || [ "$actual_enabled_count" != "1" ]; then + echo "expected low baseline count to be partial but not enforceable; got status=$actual_status row=$actual_row gate=$actual_gate enforceable=$actual_enforceable readiness=$actual_gateable_count/$actual_enabled_count" >&2 + exit 1 +fi + +echo "ci-measurement-comparison tests passed" diff --git a/genie/ci-scripts/nix-gc-race-retry.sh b/genie/ci-scripts/nix-gc-race-retry.sh index 3f2bce50a..e7d3d056f 100644 --- a/genie/ci-scripts/nix-gc-race-retry.sh +++ b/genie/ci-scripts/nix-gc-race-retry.sh @@ -6,7 +6,7 @@ run_nix_gc_race_retry() { local max="${NIX_GC_RACE_MAX_RETRIES:-10}" local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}" local attempt=1 - local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit + local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit start="$(date +%s)" @@ -54,7 +54,7 @@ run_nix_gc_race_retry() { if [ "$rc" -eq 0 ]; then echo "::notice::[ci] completed $task in $elapsed s" if [ "$attempt" -gt 1 ]; then - write_summary success "Recovered from Nix GC race after retry" + write_summary success "Recovered from transient Nix failure after retry" else write_summary success fi @@ -70,18 +70,22 @@ run_nix_gc_race_retry() { tr -d '[:space:]' || true) saw_invalid_path=false saw_cachix_signature=false + saw_fetch_signature=false [ -n "$path" ] && saw_invalid_path=true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true + printf '%s' 
"$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true rm -f "$log" - if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then - echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race" - write_summary failure "No Nix GC race signature detected" + if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then + echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure" + write_summary failure "No transient Nix failure signature detected" return "$rc" fi - if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then + if [ "$saw_fetch_signature" = true ]; then + echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache" + elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path" elif [ "$saw_cachix_signature" = true ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)" @@ -96,7 +100,7 @@ run_nix_gc_race_retry() { now=$(date +%s) elapsed=$((now - start)) - echo "::error::Nix GC race retry exhausted for $task ($max attempts)" - write_summary failure "Nix GC race retry exhausted" + echo "::error::Transient Nix retry exhausted for $task ($max attempts)" + write_summary failure "Transient Nix retry exhausted" return 1 } diff --git a/genie/ci-scripts/nix-gc-race-retry.test.sh b/genie/ci-scripts/nix-gc-race-retry.test.sh index 7c85a8fc9..300589dc0 100644 --- a/genie/ci-scripts/nix-gc-race-retry.test.sh +++ b/genie/ci-scripts/nix-gc-race-retry.test.sh @@ -80,7 +80,28 @@ chmod +x "$cachix_fixture" CI_PROGRESS_HEARTBEAT_SECONDS=1 
NIX_GC_RACE_MAX_RETRIES=2 run_nix_gc_race_retry "cachix-fixture" "$cachix_fixture" >/dev/null assert_eq "2" "$(cat "$test_dir/cachix-attempt")" "cachix wrapper retry count" -echo "Test 3: does not retry when literal signature strings appear outside Nix error context" +echo "Test 3: retries truncated Nix input tarball failures" +fetch_fixture="$test_dir/fetch-fixture.sh" +cat > "$fetch_fixture" < "\$attempt_file" + echo "error: cannot read file from tarball: Truncated tar archive detected while reading data" >&2 + exit 1 +fi +echo "fetch recovered" +EOF +chmod +x "$fetch_fixture" +CI_PROGRESS_HEARTBEAT_SECONDS=1 NIX_GC_RACE_MAX_RETRIES=2 run_nix_gc_race_retry "fetch-fixture" "$fetch_fixture" >/dev/null +assert_eq "2" "$(cat "$test_dir/fetch-attempt")" "truncated tarball retry count" + +echo "Test 4: does not retry when literal signature strings appear outside Nix error context" false_positive_fixture="$test_dir/false-positive-fixture.sh" cat > "$false_positive_fixture" <<'EOF' #!/usr/bin/env bash @@ -96,7 +117,7 @@ exit_code=$? 
set -e assert_exit_code 9 "$exit_code" "non-error-context strings do not trigger retries" -echo "Test 4: preserves the original exit code when no GC-race signature is present" +echo "Test 5: preserves the original exit code when no retry signature is present" non_retry_fixture="$test_dir/non-retry-fixture.sh" cat > "$non_retry_fixture" <<'EOF' #!/usr/bin/env bash diff --git a/genie/ci-workflow.ts b/genie/ci-workflow.ts index 5f2dda0cb..a3edf6068 100644 --- a/genie/ci-workflow.ts +++ b/genie/ci-workflow.ts @@ -49,14 +49,23 @@ export { } from './ci-workflow/shared.ts' export { ciMeasurementMetrics, + ciMeasurementBaselineBackfillPredicate, + ciMeasurementBaselineCheckoutStep, + ciMeasurementBaselineWorkflowDispatchInputs, + ciMeasurementNotBaselineBackfillPredicate, + ciMeasurementSubjectEnv, ciMeasurementsArtifactStep, ciMeasurementsCommentPermissions, compareCiMeasurementsStep, + defaultNixClosureMeasurementBuckets, downloadPreviousGitHubArtifactStep, devenvPerfArtifactStep, devenvPerfBenchmarkStep, devenvPerfJob, + nixClosureMeasurementSteps, + nixClosureMeasurementsJob, nixClosureMeasurementStep, + sourceShapeMeasurementStep, type CiMeasurementDescriptor, type CiMeasurementObservation, type CiMeasurementsArtifactStepOptions, @@ -67,6 +76,11 @@ export { type GitHubPreviousArtifactStepOptions, type NixClosureMeasurementBucket, type NixClosureMeasurementStepOptions, + type NixClosureMeasurementTarget, + type NixClosureMeasurementsJobOptions, + type NixClosureMeasurementsStepsOptions, + type SourceShapeMeasurementScope, + type SourceShapeMeasurementStepOptions, } from './ci-workflow/measurements.ts' export { appendGitHubAccessTokenToNixConfigStep, diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts index 53df6b724..6341d8f16 100644 --- a/genie/ci-workflow/measurements.ts +++ b/genie/ci-workflow/measurements.ts @@ -17,23 +17,116 @@ export type CiMeasurementDescriptor = { readonly id: string readonly label: string readonly group?: string 
+ readonly path?: readonly string[] readonly description?: string + readonly dimensions?: Record +} + +export type CiMeasurementGatePolicy = { + readonly enabled?: boolean + readonly comparisonMode?: 'budget' | 'historical' | 'paired' + readonly minBaselineSources?: number + readonly minCurrentSamples?: number + readonly minPairedSamples?: number + readonly noiseFloor?: number + readonly statisticalToleranceRatio?: number + readonly statisticalToleranceAbs?: number + readonly pairedEvidenceQuantile?: number + readonly warnRatio?: number + readonly failRatio?: number + readonly warnAbs?: number + readonly failAbs?: number } export type DevenvPerfProbe = CiMeasurementDescriptor & { readonly command: readonly [string, ...string[]] readonly traceOutput?: string + readonly warmupRepetitions?: number readonly repetitions?: number + readonly gate?: CiMeasurementGatePolicy } export type CiMeasurementObservation = { readonly id?: string readonly label?: string readonly group?: string + readonly path?: readonly string[] + readonly description?: string + readonly measurementKind?: 'deterministic' | 'wall-clock' | 'diagnostic' | (string & {}) readonly name: string - readonly unit: string + readonly unit: CiMeasurementUnit readonly value: number readonly dimensions?: Record + readonly policy?: CiMeasurementGatePolicy + readonly comparison?: { + readonly mode?: 'budget' | 'historical' | 'paired' | (string & {}) + readonly baseline?: number + readonly pairedSampleCount?: number + } + readonly statistics?: { + readonly sampleCount?: number + readonly measuredSampleCount?: number + readonly min?: number + readonly max?: number + readonly median?: number + readonly p25?: number + readonly p75?: number + readonly p95?: number + readonly pairedSampleCount?: number + readonly pairedBaselineMedian?: number + readonly pairedCurrentMedian?: number + readonly pairedDeltaMedian?: number + readonly pairedDeltaMin?: number + readonly pairedDeltaMax?: number + readonly pairedDeltaP25?: number 
+ readonly pairedDeltaP75?: number + readonly pairedDeltaMad?: number + readonly pairedDeltaSamples?: readonly number[] + } +} + +export type CiMeasurementUnit = + | 'seconds' + | 'milliseconds' + | 'bytes' + | 'count' + | 'lines' + | 'score' + | 'ratio' + | 'percent' + | (string & {}) + +export type CiMeasurementTarget = { + readonly kind: string + readonly id: string + readonly name?: string + readonly label?: string + readonly group?: string + readonly path?: readonly string[] + readonly system?: string +} + +export type CiMeasurementArtifact = { + readonly schemaVersion: number + readonly generatedAt: string + readonly producer: { + readonly name: string + readonly version: number + readonly measurementProtocol: string + } + readonly subject?: { + readonly ref?: string + readonly sha?: string + readonly label?: string + } + readonly target: CiMeasurementTarget + readonly observations: readonly CiMeasurementObservation[] + readonly attachments?: readonly { + readonly name: string + readonly path: string + readonly contentType?: string + }[] + readonly summary?: unknown } export const ciMeasurementMetrics = { @@ -41,6 +134,9 @@ export const ciMeasurementMetrics = { nixClosureNarSize: 'nix.closure.nar_size', nixClosurePathCount: 'nix.closure.path_count', nixClosureBucketNarSize: 'nix.closure.bucket.nar_size', + sourceLines: 'source.lines', + sourceFiles: 'source.files', + codeComplexity: 'code.complexity', } as const export type NixClosureMeasurementBucket = { @@ -55,10 +151,71 @@ export type NixClosureMeasurementStepOptions = { readonly targetName?: string readonly targetLabel?: string readonly targetGroup?: string + readonly targetPath?: readonly string[] + readonly targetDescription?: string readonly targetSystem?: string readonly artifactDir?: string readonly artifactFile?: string readonly buckets?: readonly NixClosureMeasurementBucket[] + readonly gate?: CiMeasurementGatePolicy +} + +export type NixClosureMeasurementTarget = { + readonly installable: string + 
readonly id: string + readonly name?: string + readonly label: string + readonly group: string + readonly path?: readonly string[] + readonly description: string + readonly system?: string + readonly buckets?: readonly NixClosureMeasurementBucket[] + readonly gate?: CiMeasurementGatePolicy +} + +export type NixClosureMeasurementsStepsOptions = { + readonly artifactDir?: string + readonly artifactName: string + readonly baselineArtifactName?: string + readonly baselineSeedRuns?: readonly CiMeasurementBaselineSeedRun[] + readonly baselineSeedRunIds?: readonly string[] + readonly baselineMaxRuns?: number + readonly baselineMaxCandidateRuns?: number + readonly targets: readonly [NixClosureMeasurementTarget, ...NixClosureMeasurementTarget[]] + readonly buckets?: readonly NixClosureMeasurementBucket[] + readonly retentionDays?: number + readonly compare?: boolean + readonly regressionMode?: 'off' | 'warn' | 'fail' + readonly prComment?: CiMeasurementsComparisonStepOptions['prComment'] +} + +export type NixClosureMeasurementsJobOptions = NixClosureMeasurementsStepsOptions & { + readonly runsOn?: readonly string[] + readonly setupSteps?: readonly GitHubWorkflowArgs['jobs'][string]['steps'][number][] + readonly ifExpr?: string + readonly timeoutMinutes?: number + readonly env?: Record + readonly permissions?: GitHubWorkflowArgs['jobs'][string]['permissions'] +} + +export type SourceShapeMeasurementScope = CiMeasurementDescriptor & { + readonly root?: string + readonly includePaths?: readonly string[] + readonly excludePaths?: readonly string[] + readonly includeExtensions?: readonly string[] + readonly gate?: CiMeasurementGatePolicy +} + +export type SourceShapeMeasurementStepOptions = { + readonly targetId?: string + readonly targetName?: string + readonly targetLabel?: string + readonly targetGroup?: string + readonly targetPath?: readonly string[] + readonly targetSystem?: string + readonly artifactDir?: string + readonly artifactFile?: string + readonly scopes: readonly 
[SourceShapeMeasurementScope, ...SourceShapeMeasurementScope[]] } export type GitHubPreviousArtifactStepOptions = { @@ -66,11 +223,29 @@ export type GitHubPreviousArtifactStepOptions = { readonly outputDir: string readonly workflowName?: string readonly branch?: string + readonly seedRuns?: readonly CiMeasurementBaselineSeedRun[] readonly seedRunIds?: readonly string[] readonly maxRuns?: number + readonly maxCandidateRuns?: number + readonly downloadTimeoutSeconds?: number + readonly requiredObservations?: readonly CiMeasurementRequiredBaselineObservation[] readonly tokenExpression?: string } +export type CiMeasurementBaselineSeedRun = { + readonly runId: string + readonly label?: string + readonly sha?: string + readonly source?: 'manual-backfill' | 'main-history' | 'pr-history' | string + readonly artifacts?: readonly string[] + readonly notes?: string +} + +export type CiMeasurementRequiredBaselineObservation = { + readonly id: string + readonly minSources: number +} + export type CiMeasurementsComparisonStepOptions = { readonly currentDir?: string readonly baselineDir?: string @@ -82,6 +257,8 @@ export type CiMeasurementsComparisonStepOptions = { readonly maxRows?: number readonly maxHistory?: number readonly assetBranch?: string + readonly publicAssetCommand?: string + readonly publicAssetEnv?: Readonly> readonly tokenExpression?: string } } @@ -94,11 +271,62 @@ export type CiMeasurementsArtifactStepOptions = { /** Job-level permissions required when CI measurement comparison posts PR comments. */ export const ciMeasurementsCommentPermissions = { + actions: 'read', contents: 'write', issues: 'write', 'pull-requests': 'write', } as const +/** Workflow-dispatch inputs used to recreate measurement baselines for older commits. */ +export const ciMeasurementBaselineWorkflowDispatchInputs = { + measurement_baseline_ref: { + description: + 'Optional ref/SHA to checkout before running CI measurement jobs. 
Used to backfill comparable baseline artifacts.', + required: false, + default: '', + type: 'string', + }, + measurement_baseline_label: { + description: + 'Optional human label for a measurement baseline backfill run, for example PR number.', + required: false, + default: '', + type: 'string', + }, +} as const + +export const ciMeasurementBaselineBackfillPredicate = + "github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != ''" as const +export const ciMeasurementNotBaselineBackfillPredicate = + `!(${ciMeasurementBaselineBackfillPredicate})` as const + +export const defaultNixClosureMeasurementBuckets = [ + { name: 'node', label: 'Node / pnpm', pathRegex: 'node_modules|npm-deps|pnpm' }, + { name: 'nix-sources', label: 'Nix sources', pathRegex: '-source$' }, + { name: 'rust', label: 'Rust', pathRegex: 'cargo|rust|rustc' }, +] as const satisfies readonly NixClosureMeasurementBucket[] + +/** Conditional checkout step that replaces the default checkout with the baseline subject. */ +export const ciMeasurementBaselineCheckoutStep = { + name: 'Checkout CI measurement baseline ref', + if: `\${{ ${ciMeasurementBaselineBackfillPredicate} }}`, + uses: 'actions/checkout@v6', + with: { + ref: '${{ inputs.measurement_baseline_ref }}', + }, +} as const + +/** Subject metadata env for measurement artifacts produced by a baseline backfill run. 
*/ +export const ciMeasurementSubjectEnv = { + CI_MEASUREMENT_SUBJECT_REF: + '${{ inputs.measurement_baseline_ref || github.event.pull_request.head.ref || github.ref }}', + CI_MEASUREMENT_SUBJECT_SHA: + '${{ inputs.measurement_baseline_ref || github.event.pull_request.head.sha || github.sha }}', + CI_MEASUREMENT_SUBJECT_LABEL: '${{ inputs.measurement_baseline_label }}', + CI_MEASUREMENT_ALLOW_PROBE_FAILURES: + "${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }}", +} as const + type DevenvPerfSetupStep = GitHubWorkflowArgs['jobs'][string]['steps'][number] export type DevenvPerfTaskProbe = | string @@ -108,7 +336,12 @@ export type DevenvPerfTaskProbe = readonly label?: string readonly group?: string readonly description?: string + readonly path?: readonly string[] + readonly dimensions?: Record + readonly extraArgs?: readonly string[] + readonly warmupRepetitions?: number readonly repetitions?: number + readonly gate?: CiMeasurementGatePolicy } export type DevenvPerfJobOptions = { @@ -116,23 +349,105 @@ export type DevenvPerfJobOptions = { readonly artifactDir?: string readonly artifactName?: string readonly baselineArtifactName?: string + readonly baselineSeedRuns?: readonly CiMeasurementBaselineSeedRun[] readonly baselineSeedRunIds?: readonly string[] readonly baselineMaxRuns?: number + readonly baselineMaxCandidateRuns?: number readonly setupSteps?: readonly DevenvPerfSetupStep[] readonly env?: Record readonly taskProbes?: readonly DevenvPerfTaskProbe[] readonly probes?: readonly DevenvPerfProbe[] readonly retentionDays?: number + readonly compare?: boolean readonly regressionMode?: 'off' | 'warn' | 'fail' readonly prComment?: CiMeasurementsComparisonStepOptions['prComment'] readonly permissions?: GitHubWorkflowArgs['jobs'][string]['permissions'] } +const defaultDevenvPerfGatePolicy = (probeId: string): CiMeasurementGatePolicy => { + if (probeId === 'shell_eval_traced') { + return { + enabled: false, + 
minBaselineSources: 10, + minCurrentSamples: 3, + warnRatio: 1.25, + failRatio: 1.5, + warnAbs: 1.5, + failAbs: 3, + noiseFloor: 0.5, + statisticalToleranceRatio: 0.2, + statisticalToleranceAbs: 1, + } + } + if (probeId === 'tasks_list' || probeId === 'processes_help') { + return { + enabled: true, + comparisonMode: 'paired', + minPairedSamples: 7, + minBaselineSources: 10, + minCurrentSamples: 5, + warnRatio: 1.25, + failRatio: 1.5, + warnAbs: 0.05, + failAbs: 0.15, + noiseFloor: 0.03, + statisticalToleranceRatio: 0.1, + statisticalToleranceAbs: 0.03, + } + } + if (probeId === 'task_check_quick_forced') { + return { + enabled: true, + comparisonMode: 'paired', + minPairedSamples: 3, + minBaselineSources: 10, + minCurrentSamples: 3, + warnRatio: 1.15, + failRatio: 1.3, + warnAbs: 1.5, + failAbs: 4, + noiseFloor: 0.75, + statisticalToleranceRatio: 0.15, + statisticalToleranceAbs: 1, + } + } + return { + enabled: true, + comparisonMode: 'paired', + minPairedSamples: 5, + minBaselineSources: 10, + minCurrentSamples: 5, + warnRatio: 1.1, + failRatio: 1.2, + warnAbs: 0.25, + failAbs: 1, + noiseFloor: 0.1, + statisticalToleranceRatio: 0.1, + statisticalToleranceAbs: 0.25, + } +} + +const devenvPerfGatePolicy = (probe: Pick) => ({ + ...defaultDevenvPerfGatePolicy(probe.id), + ...probe.gate, +}) + const devenvPerfProbeLine = (probe: DevenvPerfProbe) => { const args = probe.command.map(shellSingleQuote).join(' ') const trace = probe.traceOutput ?? '' - const repetitions = Math.max(1, Math.floor(probe.repetitions ?? 1)) - return `measure ${shellSingleQuote(probe.id)} ${shellSingleQuote(probe.label)} ${shellSingleQuote(probe.group ?? '')} ${shellSingleQuote(probe.description ?? '')} ${shellSingleQuote(trace)} ${shellSingleQuote(String(repetitions))} ${args}` + const gatePolicy = devenvPerfGatePolicy(probe) + const metadata = JSON.stringify({ + path: probe.path ?? [], + dimensions: probe.dimensions ?? {}, + }) + const defaultRepetitions = gatePolicy.enabled ? 
gatePolicy.minCurrentSamples : 1 + const repetitions = Math.max(1, Math.floor(probe.repetitions ?? defaultRepetitions)) + const defaultWarmupRepetitions = gatePolicy.enabled && repetitions > 1 ? 1 : 0 + const warmupRepetitions = Math.max( + 0, + Math.floor(probe.warmupRepetitions ?? defaultWarmupRepetitions), + ) + return `measure ${shellSingleQuote(probe.id)} ${shellSingleQuote(probe.label)} ${shellSingleQuote(probe.group ?? '')} ${shellSingleQuote(probe.description ?? '')} ${shellSingleQuote(trace)} ${shellSingleQuote(String(warmupRepetitions))} ${shellSingleQuote(String(repetitions))} ${shellSingleQuote(JSON.stringify(gatePolicy))} ${shellSingleQuote(metadata)} ${args}` } const defaultDevenvPerfTaskProbe = (probe: DevenvPerfTaskProbe): DevenvPerfProbe => { @@ -141,67 +456,174 @@ const defaultDevenvPerfTaskProbe = (probe: DevenvPerfTaskProbe): DevenvPerfProbe const label = typeof probe === 'string' ? undefined : probe.label const group = typeof probe === 'string' ? undefined : probe.group const description = typeof probe === 'string' ? undefined : probe.description + const path = typeof probe === 'string' ? undefined : probe.path + const dimensions = typeof probe === 'string' ? undefined : probe.dimensions + const extraArgs = typeof probe === 'string' ? [] : (probe.extraArgs ?? []) + const warmupRepetitions = typeof probe === 'string' ? undefined : probe.warmupRepetitions const repetitions = typeof probe === 'string' ? undefined : probe.repetitions + const gate = typeof probe === 'string' ? undefined : probe.gate return { id: id ?? `task_${task.replaceAll(':', '_')}`, label: label ?? task, group: group ?? 'devenv tasks', + path, description: description ?? 
`Runs the devenv task '${task}' in before mode without the TUI.`, + dimensions, + warmupRepetitions, repetitions, - command: ['$DEVENV_BIN', 'tasks', 'run', task, '--mode', 'before', '--no-tui', '--show-output'], + gate, + command: [ + '$DEVENV_BIN', + 'tasks', + 'run', + task, + '--mode', + 'before', + '--no-tui', + '--show-output', + ...extraArgs, + ], } } +const devenvPerfProbes = ( + opts: Required>, +): readonly DevenvPerfProbe[] => [ + { + id: 'shell_eval_traced', + label: 'Shell eval with OTEL trace', + group: 'devenv shell', + description: 'Evaluates the dev shell with native devenv JSON tracing enabled.', + command: ['$DEVENV_SHELL_TRACE_COMMAND'], + traceOutput: '$ARTIFACT_DIR/traces/shell_eval_traced.json', + }, + { + id: 'shell_eval_warm', + label: 'Warm shell eval', + group: 'devenv shell', + description: 'Evaluates a warm dev shell without reloading direnv state.', + warmupRepetitions: 1, + repetitions: 5, + command: ['$DEVENV_BIN', 'shell', '--no-reload', '--', 'true'], + }, + { + id: 'tasks_list', + label: 'devenv tasks list', + group: 'devenv cli', + description: 'Lists devenv tasks to measure task graph loading overhead.', + warmupRepetitions: 1, + repetitions: 9, + command: ['$DEVENV_BIN', 'tasks', 'list'], + }, + { + id: 'processes_help', + label: 'devenv processes --help', + group: 'devenv cli', + description: 'Loads the devenv processes command help path.', + warmupRepetitions: 1, + repetitions: 9, + command: ['$DEVENV_BIN', 'processes', '--help'], + }, + ...opts.taskProbes.map(defaultDevenvPerfTaskProbe), + ...opts.probes, +] + +const devenvPerfRequiredBaselineObservations = ( + probes: readonly DevenvPerfProbe[], +): readonly CiMeasurementRequiredBaselineObservation[] => + probes + .map((probe) => ({ + id: `devenv.${probe.id}.duration`, + minSources: devenvPerfGatePolicy(probe).minBaselineSources ?? 1, + enabled: devenvPerfGatePolicy(probe).enabled ?? 
true, + })) + .filter((probe) => probe.enabled) + .map(({ id, minSources }) => ({ id, minSources })) + +const ciMeasurementToolBootstrapScript = String.raw`ensure_ci_measurement_tool() { + tool_name="$1" + nix_attr="$2" + if command -v "$tool_name" >/dev/null 2>&1; then + return 0 + fi + if ! command -v nix >/dev/null 2>&1; then + return 1 + fi + if tool_out="$(nix build --no-link --print-out-paths "nixpkgs#$nix_attr" 2>/dev/null)"; then + while IFS= read -r tool_path; do + [ -n "$tool_path" ] || continue + [ -d "$tool_path/bin" ] || continue + export PATH="$tool_path/bin:$PATH" + if command -v "$tool_name" >/dev/null 2>&1; then + return 0 + fi + done </dev/null 2>&1 +} + +require_ci_measurement_tool() { + tool_name="$1" + nix_attr="$2" + if ensure_ci_measurement_tool "$tool_name" "$nix_attr"; then + return 0 + fi + echo "::error::$tool_name is not available; unable to produce CI measurement artifact" + exit 1 +} +` + const renderDevenvPerfScript = ( opts: Required>, ) => { - const probes: readonly DevenvPerfProbe[] = [ - { - id: 'shell_eval_traced', - label: 'Shell eval with OTEL trace', - group: 'devenv shell', - description: 'Evaluates the dev shell with native devenv JSON tracing enabled.', - command: [ - '$DEVENV_BIN', - '--trace-to', - 'json:file:$trace_file', - 'shell', - '--no-reload', - '--', - 'true', - ], - traceOutput: '$ARTIFACT_DIR/traces/shell_eval_traced.json', - }, - { - id: 'shell_eval_warm', - label: 'Warm shell eval', - group: 'devenv shell', - description: 'Evaluates a warm dev shell without reloading direnv state.', - repetitions: 3, - command: ['$DEVENV_BIN', 'shell', '--no-reload', '--', 'true'], - }, - { - id: 'tasks_list', - label: 'devenv tasks list', - group: 'devenv cli', - description: 'Lists devenv tasks to measure task graph loading overhead.', - repetitions: 5, - command: ['$DEVENV_BIN', 'tasks', 'list'], - }, - { - id: 'processes_help', - label: 'devenv processes --help', - group: 'devenv cli', - description: 'Loads the devenv 
processes command help path.', - repetitions: 5, - command: ['$DEVENV_BIN', 'processes', '--help'], - }, - ...opts.taskProbes.map(defaultDevenvPerfTaskProbe), - ...opts.probes, - ] + const probes = devenvPerfProbes(opts) return String.raw`set -euo pipefail +${ciMeasurementToolBootstrapScript} +require_ci_measurement_tool awk gawk.out +require_ci_measurement_tool jq jq.bin + +ARTIFACT_DIR="$(mkdir -p "$ARTIFACT_DIR" && cd "$ARTIFACT_DIR" && pwd -P)" +CI_MEASUREMENT_HEAD_DIR="${dollar}{CI_MEASUREMENT_HEAD_DIR:-$PWD}" +CI_MEASUREMENT_BASE_DIR="${dollar}{CI_MEASUREMENT_BASE_DIR:-${dollar}{RUNNER_TEMP:-/tmp}/ci-measurement-base}" +CI_MEASUREMENT_PAIRED_ENABLED=0 +CI_MEASUREMENT_ORDER_SEED="${dollar}{CI_MEASUREMENT_ORDER_SEED:-${dollar}{GITHUB_RUN_ID:-local}-${dollar}{GITHUB_RUN_ATTEMPT:-0}-${dollar}{GITHUB_SHA:-unknown}}" + +prepare_paired_base_worktree() { + if [ "${dollar}{GITHUB_EVENT_NAME:-}" != "pull_request" ]; then + return 0 + fi + if [ -n "${dollar}{CI_MEASUREMENT_ALLOW_PROBE_FAILURES:-}" ]; then + return 0 + fi + if [ ! 
-f "${dollar}{GITHUB_EVENT_PATH:-}" ]; then + return 0 + fi + + local base_sha + base_sha="$(jq -r '.pull_request.base.sha // empty' "$GITHUB_EVENT_PATH")" + if [ -z "$base_sha" ]; then + echo "::notice::paired wall-clock baseline unavailable: pull_request.base.sha missing" + return 0 + fi + + rm -rf "$CI_MEASUREMENT_BASE_DIR" + git worktree prune >/dev/null 2>&1 || true + if git fetch --no-tags --depth=1 origin "$base_sha" \ + && git worktree add --detach "$CI_MEASUREMENT_BASE_DIR" "$base_sha" >/dev/null; then + CI_MEASUREMENT_PAIRED_ENABLED=1 + echo "::notice::paired wall-clock baseline prepared at $CI_MEASUREMENT_BASE_DIR ($base_sha)" + else + echo "::warning::paired wall-clock baseline unavailable: failed to prepare base worktree $base_sha" + CI_MEASUREMENT_PAIRED_ENABLED=0 + fi +} + +prepare_paired_base_worktree + mkdir -p "$ARTIFACT_DIR/traces" { @@ -240,6 +662,8 @@ json_append_timing() { local stdout="$7" local stderr="$8" local trace="$9" + local gate_policy="${dollar}{10}" + local metadata_json="${dollar}{11}" local samples_file="$ARTIFACT_DIR/$id.samples.json" if [ "$first" -eq 0 ]; then @@ -254,13 +678,44 @@ json_append_timing() { --arg group "$group" \ --arg description "$description" \ --argjson status "$status" \ - --argjson durationMs "$duration_ms" \ - --arg stdout "$stdout" \ - --arg stderr "$stderr" \ - --arg trace "$trace" \ - '($samples[0] // []) as $sampleList - | ($sampleList | map(select(.status == 0) | .durationMs)) as $successfulDurations - | { + --argjson durationMs "$duration_ms" \ + --arg stdout "$stdout" \ + --arg stderr "$stderr" \ + --arg trace "$trace" \ + --argjson gatePolicy "$gate_policy" \ + --argjson metadata "$metadata_json" \ + 'def median: + sort as $sorted + | ($sorted | length) as $count + | if $count == 0 then null + elif ($count % 2) == 1 then $sorted[($count / 2 | floor)] + else (($sorted[($count / 2 - 1)] + $sorted[($count / 2)]) / 2) + end; + def percentile($p): + sort as $sorted + | ($sorted | length) as $count + | if 
$count == 0 then null + else $sorted[(($p * ($count - 1)) | floor)] + end; + ($samples[0] // []) as $sampleList + | ($sampleList | map(select((.subject // "head") == "head" and .phase != "warmup" and .status == 0) | .durationMs)) as $successfulDurations + | ($sampleList | map(select((.subject // "head") == "head" and .phase == "warmup"))) as $warmupSamples + | ($sampleList | map(select((.subject // "head") == "head" and .phase == "measured" and .status == 0 and .pairIndex != null))) as $headSamples + | ($sampleList | map(select(.subject == "base" and .phase == "measured" and .status == 0 and .pairIndex != null))) as $baseSamples + | ( + $headSamples + | map(. as $head | $baseSamples[]? | select(.pairIndex == $head.pairIndex) | { + pairIndex: $head.pairIndex, + currentDurationMs: $head.durationMs, + baselineDurationMs: .durationMs, + deltaMs: ($head.durationMs - .durationMs) + }) + ) as $pairedSamples + | ($pairedSamples | map(.currentDurationMs)) as $pairedCurrentDurations + | ($pairedSamples | map(.baselineDurationMs)) as $pairedBaselineDurations + | ($pairedSamples | map(.deltaMs)) as $pairedDeltaDurations + | ($pairedDeltaDurations | median) as $pairedDeltaMedian + | { id:$id, name:$id, label:$label, @@ -271,12 +726,34 @@ json_append_timing() { stdout:$stdout, stderr:$stderr, trace:(if $trace == "" then null else $trace end), - statistics: { + metadata:$metadata, + gatePolicy:$gatePolicy, + statistics: { sampleCount: ($sampleList | length), + warmupCount: ($warmupSamples | length), + measuredSampleCount: ( + $sampleList + | map(select((.subject // "head") == "head" and .phase != "warmup")) + | length + ), successfulSampleCount: ($successfulDurations | length), minDurationMs: ($successfulDurations | min), maxDurationMs: ($successfulDurations | max), - medianDurationMs: $durationMs + medianDurationMs: $durationMs, + pairedSampleCount: ($pairedSamples | length), + pairedCurrentMedianDurationMs: ($pairedCurrentDurations | median), + pairedBaselineMedianDurationMs: 
($pairedBaselineDurations | median), + pairedDeltaMedianDurationMs: $pairedDeltaMedian, + pairedDeltaMinDurationMs: ($pairedDeltaDurations | min), + pairedDeltaMaxDurationMs: ($pairedDeltaDurations | max), + pairedDeltaP25DurationMs: ($pairedDeltaDurations | percentile(0.25)), + pairedDeltaP75DurationMs: ($pairedDeltaDurations | percentile(0.75)), + pairedDeltaMadDurationMs: ( + if $pairedDeltaMedian == null then null + else ($pairedDeltaDurations | map(. - $pairedDeltaMedian | if . < 0 then -. else . end) | median) + end + ), + pairedDeltaSampleDurationMs: $pairedDeltaDurations }, samples:$sampleList }' \ @@ -287,10 +764,13 @@ measure() { local id="$1" local label="$2" local group="$3" - local description="$4" - local trace_file="$5" - local repetitions="$6" - shift 6 + local description="$4" + local trace_file="$5" + local warmup_repetitions="$6" + local repetitions="$7" + local gate_policy="$8" + local metadata_json="$9" + shift 9 case "$trace_file" in '$ARTIFACT_DIR'*) trace_file="${dollar}{ARTIFACT_DIR}${dollar}{trace_file#'$ARTIFACT_DIR'}" ;; esac @@ -303,11 +783,24 @@ measure() { if ! [[ "$repetitions" =~ ^[0-9]+$ ]] || [ "$repetitions" -lt 1 ]; then repetitions=1 fi + if ! 
[[ "$warmup_repetitions" =~ ^[0-9]+$ ]] || [ "$warmup_repetitions" -lt 0 ]; then + warmup_repetitions=0 + fi printf '[' >"$samples_file" local sample_first=1 - local sample_index sample_stdout sample_stderr sample_trace expanded - for sample_index in $(seq 1 "$repetitions"); do + local sample_index measured_index total_repetitions phase sample_stdout sample_stderr sample_trace expanded + local order_offset + order_offset="$(printf '%s' "$CI_MEASUREMENT_ORDER_SEED:$id" | cksum | awk '{ print $1 % 2 }')" + total_repetitions=$((warmup_repetitions + repetitions)) + for sample_index in $(seq 1 "$total_repetitions"); do + if [ "$sample_index" -le "$warmup_repetitions" ]; then + phase="warmup" + measured_index="" + else + phase="measured" + measured_index=$((sample_index - warmup_repetitions)) + fi sample_stdout="$ARTIFACT_DIR/$id.$sample_index.stdout" sample_stderr="$ARTIFACT_DIR/$id.$sample_index.stderr" sample_trace="" @@ -318,19 +811,65 @@ measure() { fi fi - started="$(date +%s%3N)" - set +e expanded=() for arg in "$@"; do case "$arg" in '$DEVENV_BIN') expanded+=("${dollar}{DEVENV_BIN:?DEVENV_BIN not set}") ;; + '$DEVENV_SHELL_TRACE_COMMAND') + if "${dollar}{DEVENV_BIN:?DEVENV_BIN not set}" --help 2>&1 | grep -q -- '--trace-to'; then + expanded+=("${dollar}{DEVENV_BIN:?DEVENV_BIN not set}" "--trace-to" "json:file:$sample_trace" "shell" "--no-reload" "--" "true") + elif "${dollar}{DEVENV_BIN:?DEVENV_BIN not set}" --help 2>&1 | grep -q -- '--trace-format'; then + expanded+=("${dollar}{DEVENV_BIN:?DEVENV_BIN not set}" "--trace-format" "json" "shell" "--no-reload" "--" "true") + sample_trace="" + else + expanded+=("${dollar}{DEVENV_BIN:?DEVENV_BIN not set}" "shell" "--no-reload" "--" "true") + sample_trace="" + fi + ;; '$ARTIFACT_DIR'*) expanded+=("${dollar}{ARTIFACT_DIR}${dollar}{arg#'$ARTIFACT_DIR'}") ;; 'json:file:$trace_file') expanded+=("json:file:$sample_trace") ;; '$trace_file') expanded+=("file:$sample_trace") ;; *) expanded+=("$arg") ;; esac done - 
"${dollar}{expanded[@]}" >"$sample_stdout" 2>"$sample_stderr" + + local base_ran_before_head=0 base_stdout base_stderr base_started base_ended base_status base_duration_ms + if [ "$phase" = "measured" ] && [ "$CI_MEASUREMENT_PAIRED_ENABLED" -eq 1 ] && [ $(((measured_index + order_offset) % 2)) -eq 0 ]; then + base_ran_before_head=1 + base_stdout="$ARTIFACT_DIR/$id.$sample_index.base.stdout" + base_stderr="$ARTIFACT_DIR/$id.$sample_index.base.stderr" + base_started="$(date +%s%3N)" + set +e + (cd "$CI_MEASUREMENT_BASE_DIR" && "${dollar}{expanded[@]}") >"$base_stdout" 2>"$base_stderr" + base_status=$? + set -e + base_ended="$(date +%s%3N)" + base_duration_ms=$((base_ended - base_started)) + + if [ "$sample_first" -eq 0 ]; then + printf ',' >>"$samples_file" + fi + sample_first=0 + jq -cn \ + --argjson index "$sample_index" \ + --arg measuredIndex "$measured_index" \ + --argjson status "$base_status" \ + --argjson durationMs "$base_duration_ms" \ + --arg stdout "$base_stdout" \ + --arg stderr "$base_stderr" \ + --arg orderSeed "$CI_MEASUREMENT_ORDER_SEED" \ + '{index:$index,measuredIndex:($measuredIndex | tonumber),pairIndex:($measuredIndex | tonumber),subject:"base",phase:"measured",status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:null,order:"base-head",orderSeed:$orderSeed}' \ + >>"$samples_file" + + if [ "$base_status" -ne 0 ]; then + echo "::warning::$id paired baseline sample $measured_index failed after ${dollar}{base_duration_ms}ms; this pair is excluded from wall-clock gating" + tail -40 "$base_stderr" || true + fi + fi + + started="$(date +%s%3N)" + set +e + (cd "$CI_MEASUREMENT_HEAD_DIR" && "${dollar}{expanded[@]}") >"$sample_stdout" 2>"$sample_stderr" status=$? 
set -e ended="$(date +%s%3N)" @@ -342,14 +881,47 @@ measure() { sample_first=0 jq -cn \ --argjson index "$sample_index" \ + --arg measuredIndex "$measured_index" \ + --arg phase "$phase" \ --argjson status "$status" \ --argjson durationMs "$duration_ms" \ --arg stdout "$sample_stdout" \ --arg stderr "$sample_stderr" \ --arg trace "$sample_trace" \ - '{index:$index,status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:(if $trace == "" then null else $trace end)}' \ + --arg order "$(if [ "$phase" = "measured" ] && [ "$base_ran_before_head" -eq 1 ]; then printf base-head; else printf head-base; fi)" \ + --arg orderSeed "$CI_MEASUREMENT_ORDER_SEED" \ + '{index:$index,measuredIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),pairIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),subject:"head",phase:$phase,status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:(if $trace == "" then null else $trace end),order:(if $phase == "measured" then $order else null end),orderSeed:(if $phase == "measured" then $orderSeed else null end)}' \ >>"$samples_file" + if [ "$phase" = "measured" ] && [ "$status" -eq 0 ] && [ "$CI_MEASUREMENT_PAIRED_ENABLED" -eq 1 ] && [ "$base_ran_before_head" -eq 0 ]; then + base_stdout="$ARTIFACT_DIR/$id.$sample_index.base.stdout" + base_stderr="$ARTIFACT_DIR/$id.$sample_index.base.stderr" + base_started="$(date +%s%3N)" + set +e + (cd "$CI_MEASUREMENT_BASE_DIR" && "${dollar}{expanded[@]}") >"$base_stdout" 2>"$base_stderr" + base_status=$? 
+ set -e + base_ended="$(date +%s%3N)" + base_duration_ms=$((base_ended - base_started)) + + printf ',' >>"$samples_file" + jq -cn \ + --argjson index "$sample_index" \ + --arg measuredIndex "$measured_index" \ + --argjson status "$base_status" \ + --argjson durationMs "$base_duration_ms" \ + --arg stdout "$base_stdout" \ + --arg stderr "$base_stderr" \ + --arg orderSeed "$CI_MEASUREMENT_ORDER_SEED" \ + '{index:$index,measuredIndex:($measuredIndex | tonumber),pairIndex:($measuredIndex | tonumber),subject:"base",phase:"measured",status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:null,order:"head-base",orderSeed:$orderSeed}' \ + >>"$samples_file" + + if [ "$base_status" -ne 0 ]; then + echo "::warning::$id paired baseline sample $measured_index failed after ${dollar}{base_duration_ms}ms; this pair is excluded from wall-clock gating" + tail -40 "$base_stderr" || true + fi + fi + stdout="$sample_stdout" stderr="$sample_stderr" trace_file="$sample_trace" @@ -360,18 +932,24 @@ measure() { done printf ']\n' >>"$samples_file" - status="$(jq -r 'map(.status) | max // 0' "$samples_file")" - duration_ms="$(jq -r 'map(select(.status == 0) | .durationMs) as $values | if ($values | length) == 0 then (map(.durationMs) | max // 0) else ($values | sort | .[(length - 1) / 2 | floor]) end' "$samples_file")" + status="$(jq -r 'map(select((.subject // "head") == "head") | .status) | max // 0' "$samples_file")" + duration_ms="$(jq -r 'map(select((.subject // "head") == "head" and .phase != "warmup" and .status == 0) | .durationMs) as $values | if ($values | length) == 0 then (map(select((.subject // "head") == "head") | .durationMs) | max // 0) else ($values | sort | .[(length - 1) / 2 | floor]) end' "$samples_file")" cp "$stdout" "$ARTIFACT_DIR/$id.stdout" 2>/dev/null || true cp "$stderr" "$ARTIFACT_DIR/$id.stderr" 2>/dev/null || true - json_append_timing "$id" "$label" "$group" "$description" "$status" "$duration_ms" "$ARTIFACT_DIR/$id.stdout" 
"$ARTIFACT_DIR/$id.stderr" "$trace_file" + json_append_timing "$id" "$label" "$group" "$description" "$status" "$duration_ms" "$ARTIFACT_DIR/$id.stdout" "$ARTIFACT_DIR/$id.stderr" "$trace_file" "$gate_policy" "$metadata_json" if [ "$status" -ne 0 ]; then - echo "::error::$id failed after ${dollar}{duration_ms}ms; stderr tail follows" + if [ "${dollar}{CI_MEASUREMENT_ALLOW_PROBE_FAILURES:-}" = "1" ]; then + echo "::warning::$id failed after ${dollar}{duration_ms}ms; keeping earlier successful baseline probes and excluding this failed probe from numeric observations" + else + echo "::error::$id failed after ${dollar}{duration_ms}ms; stderr tail follows" + fi tail -80 "$stderr" || true - return "$status" + if [ "${dollar}{CI_MEASUREMENT_ALLOW_PROBE_FAILURES:-}" != "1" ]; then + return "$status" + fi fi } @@ -410,8 +988,8 @@ jq -n \ --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ --arg repository "${dollar}{GITHUB_REPOSITORY:-unknown}" \ --arg branchKind "${dollar}{GITHUB_EVENT_NAME:-unknown}" \ - --arg ref "${dollar}{GITHUB_REF:-unknown}" \ - --arg headSha "${dollar}{GITHUB_SHA:-unknown}" \ + --arg ref "${dollar}{CI_MEASUREMENT_SUBJECT_REF:-${dollar}{GITHUB_REF:-unknown}}" \ + --arg headSha "${dollar}{CI_MEASUREMENT_SUBJECT_SHA:-${dollar}{GITHUB_SHA:-unknown}}" \ --arg baseSha "${dollar}{GITHUB_BASE_SHA:-}" \ --arg runnerName "${dollar}{RUNNER_NAME:-unknown}" \ --arg runnerOs "${dollar}{RUNNER_OS:-unknown}" \ @@ -425,11 +1003,16 @@ jq -n \ --arg traceId "${dollar}{TRACE_ID:-}" \ --arg devenvRev "${dollar}{DEVENV_REV:-unknown}" \ --arg otelServiceName "${dollar}{OTEL_SERVICE_NAME:-unknown}" \ + --arg orderSeed "$CI_MEASUREMENT_ORDER_SEED" \ --arg targetSystem "${dollar}{DEVENV_SYSTEM:-${dollar}{RUNNER_OS:-unknown}}" \ '{ schemaVersion: $schemaVersion, generatedAt: $generatedAt, - producer: { name: "effect-utils-ci-measurement", version: 1 }, + producer: { + name: "effect-utils-ci-measurement", + version: 2, + measurementProtocol: "devenv-perf-warm-median-v2" + }, 
subject: { repo: $repository, branchKind: (if $branchKind == "" then "unknown" else $branchKind end), @@ -451,28 +1034,113 @@ jq -n \ target: { kind: "devenv", id: "dev-shell", name: "dev-shell", label: "Dev shell", group: "devenv", system: $targetSystem }, observations: ( $timings[0] + | map(select(.status == 0)) | map({ id: ("devenv." + .id + ".duration"), label: .label, group: .group, + path: (.metadata.path // []), + description: .description, + measurementKind: (if (.gatePolicy.enabled == false) then "diagnostic" else "wall-clock" end), name: ("devenv." + .id + ".duration"), unit: "seconds", value: (.durationMs / 1000), + policy: .gatePolicy, + comparison: { + mode: (.gatePolicy.comparisonMode // "historical"), + pairedSampleCount: (.statistics.pairedSampleCount // 0), + baseline: ( + if (.statistics.pairedBaselineMedianDurationMs // null) == null + then null + else (.statistics.pairedBaselineMedianDurationMs / 1000) + end + ) + }, statistics: { sampleCount: (.statistics.sampleCount // 1), + warmupCount: (.statistics.warmupCount // 0), + measuredSampleCount: (.statistics.measuredSampleCount // (.statistics.sampleCount // 1)), successfulSampleCount: (.statistics.successfulSampleCount // (if .status == 0 then 1 else 0 end)), min: ((.statistics.minDurationMs // .durationMs) / 1000), max: ((.statistics.maxDurationMs // .durationMs) / 1000), - median: ((.statistics.medianDurationMs // .durationMs) / 1000) + median: ((.statistics.medianDurationMs // .durationMs) / 1000), + pairedSampleCount: (.statistics.pairedSampleCount // 0), + pairedCurrentMedian: ( + if (.statistics.pairedCurrentMedianDurationMs // null) == null + then null + else (.statistics.pairedCurrentMedianDurationMs / 1000) + end + ), + pairedBaselineMedian: ( + if (.statistics.pairedBaselineMedianDurationMs // null) == null + then null + else (.statistics.pairedBaselineMedianDurationMs / 1000) + end + ), + pairedDeltaMedian: ( + if (.statistics.pairedDeltaMedianDurationMs // null) == null + then null + 
else (.statistics.pairedDeltaMedianDurationMs / 1000) + end + ), + pairedDeltaMin: ( + if (.statistics.pairedDeltaMinDurationMs // null) == null + then null + else (.statistics.pairedDeltaMinDurationMs / 1000) + end + ), + pairedDeltaMax: ( + if (.statistics.pairedDeltaMaxDurationMs // null) == null + then null + else (.statistics.pairedDeltaMaxDurationMs / 1000) + end + ), + pairedDeltaP25: ( + if (.statistics.pairedDeltaP25DurationMs // null) == null + then null + else (.statistics.pairedDeltaP25DurationMs / 1000) + end + ), + pairedDeltaP75: ( + if (.statistics.pairedDeltaP75DurationMs // null) == null + then null + else (.statistics.pairedDeltaP75DurationMs / 1000) + end + ), + pairedDeltaMad: ( + if (.statistics.pairedDeltaMadDurationMs // null) == null + then null + else (.statistics.pairedDeltaMadDurationMs / 1000) + end + ), + pairedDeltaSamples: ((.statistics.pairedDeltaSampleDurationMs // []) | map(. / 1000)) }, - dimensions: { + dimensions: ((.metadata.dimensions // {}) + { probe: .id, probeLabel: .label, status: .status, sampleCount: (.statistics.sampleCount // 1), + warmupCount: (.statistics.warmupCount // 0), + measuredSampleCount: (.statistics.measuredSampleCount // (.statistics.sampleCount // 1)), + pairedSampleCount: (.statistics.pairedSampleCount // 0), + pairedOrderProtocol: ( + if (.statistics.pairedSampleCount // 0) > 0 + then "balanced-seeded-alternating-v1" + else null + end + ), + pairedOrderSeed: ( + if (.statistics.pairedSampleCount // 0) > 0 + then $orderSeed + else null + end + ), + measurementProtocol: "devenv-perf-warm-median-v2", + aggregation: "median", + phase: "warm", devenvRev: $devenvRev, otelServiceName: $otelServiceName - } + }) }) ), artifacts: [ @@ -490,109 +1158,6 @@ jq -n \ } }' >"$ARTIFACT_DIR/measurements.json" -compare_baseline() { - local baseline_path="${dollar}{DEVENV_PERF_BASELINE_SUMMARY:-$ARTIFACT_DIR/baseline/summary.json}" - local mode="${dollar}{DEVENV_PERF_REGRESSION_MODE:-warn}" - - if [ "$mode" = "off" ]; 
then - jq -n --argjson schemaVersion 1 --arg status skipped --arg mode "$mode" '{schemaVersion:$schemaVersion, status:$status, mode:$mode, checks:{}}' >"$ARTIFACT_DIR/perf-comparison.json" - return 0 - fi - - if [ ! -f "$baseline_path" ]; then - jq -n \ - --argjson schemaVersion 1 \ - --arg status baseline_missing \ - --arg mode "$mode" \ - --arg baseline "$baseline_path" \ - '{schemaVersion:$schemaVersion, status:$status, mode:$mode, baseline:$baseline, checks:{}}' \ - >"$ARTIFACT_DIR/perf-comparison.json" - echo "::notice::devenv perf baseline not found at $baseline_path; recorded current measurements only" - return 0 - fi - - jq -n \ - --slurpfile current "$ARTIFACT_DIR/summary.json" \ - --slurpfile baseline "$baseline_path" \ - --argjson schemaVersion 1 \ - --arg mode "$mode" \ - --arg baselinePath "$baseline_path" \ - ' - def budget($name): - if $name == "shell_eval_traced" then - {warnRatio:1.25, failRatio:1.5, warnMs:1500, failMs:3000} - elif $name == "shell_eval_warm" then - {warnRatio:1.5, failRatio:2.0, warnMs:500, failMs:1000} - elif $name == "tasks_list" or $name == "processes_help" then - {warnRatio:2.0, failRatio:3.0, warnMs:250, failMs:1000} - else - {warnRatio:1.5, failRatio:2.0, warnMs:1000, failMs:3000} - end; - def classify($name; $current; $baseline): - budget($name) as $b - | ($current - $baseline) as $delta - | (if $baseline > 0 then ($current / $baseline) else null end) as $ratio - | ( - if $baseline <= 0 then "unknown" - elif ($delta > $b.failMs and $current > ($baseline * $b.failRatio)) then "fail" - elif ($delta > $b.warnMs and $current > ($baseline * $b.warnRatio)) then "warn" - else "pass" - end - ) as $status - | {status:$status, currentMs:$current, baselineMs:$baseline, deltaMs:$delta, ratio:$ratio, budget:$b}; - ($current[0].checks // {}) as $currentChecks - | ($baseline[0].checks // {}) as $baselineChecks - | ( - $currentChecks - | to_entries - | map( - .key as $name - | .value as $current - | ($baselineChecks[$name] // null) as 
$base - | { - key: $name, - value: ( - if $base == null then - {status:"missing_baseline", currentMs:$current.durationMs} - elif ($current.status != 0) then - {status:"current_failed", currentMs:$current.durationMs, baselineMs:$base.durationMs} - elif ($base.status != 0) then - {status:"baseline_failed", currentMs:$current.durationMs, baselineMs:$base.durationMs} - else - classify($name; $current.durationMs; $base.durationMs) - end - ) - } - ) - | from_entries - ) as $checks - | ( - if any($checks[]; .status == "fail") then "fail" - elif any($checks[]; .status == "warn") then "warn" - elif any($checks[]; .status == "missing_baseline") then "partial" - else "pass" - end - ) as $status - | {schemaVersion:$schemaVersion, status:$status, mode:$mode, baseline:$baselinePath, checks:$checks} - ' >"$ARTIFACT_DIR/perf-comparison.json" - - local status - status="$(jq -r '.status' "$ARTIFACT_DIR/perf-comparison.json")" - case "$status:$mode" in - fail:fail) - echo "::error::devenv perf regression detected" - jq . "$ARTIFACT_DIR/perf-comparison.json" - return 1 - ;; - fail:*|warn:*) - echo "::warning::devenv perf regression threshold exceeded" - jq . "$ARTIFACT_DIR/perf-comparison.json" - ;; - esac -} - -compare_baseline - if [ -n "${dollar}{GITHUB_STEP_SUMMARY:-}" ]; then { echo "### Devenv perf" @@ -603,12 +1168,6 @@ if [ -n "${dollar}{GITHUB_STEP_SUMMARY:-}" ]; then echo "" echo "- Artifact directory: \`$ARTIFACT_DIR\`" echo "- OTEL service: \`${dollar}{OTEL_SERVICE_NAME:-unknown}\`" - echo "" - echo "#### Regression comparison" - echo "" - if [ -f "$ARTIFACT_DIR/perf-comparison.json" ]; then - jq -r '["- Status: " + .status, "- Mode: " + .mode, "- Baseline: " + (.baseline // "none")] | .[]' "$ARTIFACT_DIR/perf-comparison.json" - fi } >>"$GITHUB_STEP_SUMMARY" fi @@ -628,6 +1187,19 @@ export const devenvPerfBenchmarkStep = ( }), }) as const +const ciMeasurementBaselineSeedRunsJson = (opts: GitHubPreviousArtifactStepOptions) => + JSON.stringify( + opts.seedRuns ?? 
+ opts.seedRunIds?.map((runId) => ({ + runId, + source: 'manual-backfill', + })) ?? + [], + ) + +const ciMeasurementRequiredObservationsJson = (opts: GitHubPreviousArtifactStepOptions) => + JSON.stringify(opts.requiredObservations ?? []) + export const downloadPreviousGitHubArtifactStep = (opts: GitHubPreviousArtifactStepOptions) => ({ name: `Download previous artifact: ${opts.artifactName}`, @@ -638,41 +1210,115 @@ export const downloadPreviousGitHubArtifactStep = (opts: GitHubPreviousArtifactS BASELINE_OUTPUT_DIR: opts.outputDir, BASELINE_WORKFLOW_NAME: opts.workflowName ?? '${{ github.workflow }}', BASELINE_BRANCH: opts.branch ?? '${{ github.base_ref || github.ref_name }}', - BASELINE_SEED_RUN_IDS: opts.seedRunIds?.join(' ') ?? '', + BASELINE_SEED_RUNS_JSON: ciMeasurementBaselineSeedRunsJson(opts), BASELINE_MAX_RUNS: String(opts.maxRuns ?? 5), + BASELINE_MAX_CANDIDATE_RUNS: String( + opts.maxCandidateRuns ?? Math.max((opts.maxRuns ?? 5) * 3, 20), + ), + BASELINE_REQUIRED_OBSERVATIONS_JSON: ciMeasurementRequiredObservationsJson(opts), + BASELINE_DOWNLOAD_TIMEOUT_SECONDS: String(opts.downloadTimeoutSeconds ?? 120), }, run: String.raw`set -euo pipefail mkdir -p "$BASELINE_OUTPUT_DIR" -if ! command -v gh >/dev/null 2>&1; then - echo "::notice::gh is not available; skipping previous artifact download" - exit 0 +if command -v gh >/dev/null 2>&1; then + GH_BIN="$(command -v gh)" +else + echo "::notice::gh is not on PATH; resolving GitHub CLI through Nix" + if ! 
GH_BIN="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)/bin/gh"; then + echo "::notice::unable to resolve GitHub CLI through Nix; skipping previous artifact download" + exit 0 + fi fi +echo "Using GitHub CLI: $GH_BIN" repo="${dollar}{GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}" workflow="${dollar}{BASELINE_WORKFLOW_NAME:-CI}" branch="${dollar}{BASELINE_BRANCH:-${dollar}{GITHUB_BASE_REF:-${dollar}{GITHUB_REF_NAME:-main}}}" +seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json" +required_observations_file="$BASELINE_OUTPUT_DIR/baseline-required-observations.json" +printf '%s' "${dollar}{BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file" +printf '%s' "${dollar}{BASELINE_REQUIRED_OBSERVATIONS_JSON:-[]}" >"$required_observations_file" +if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \ + "$seed_runs_file" >/dev/null; then + echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields" + exit 1 +fi +if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.id | type == "string") and (.minSources | type == "number")) else false end' \ + "$required_observations_file" >/dev/null; then + echo "::error::BASELINE_REQUIRED_OBSERVATIONS_JSON must be an array of objects with string id and numeric minSources fields" + exit 1 +fi +seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")" +required_observation_count="$(jq 'length' "$required_observations_file")" +max_candidate_runs="${dollar}{BASELINE_MAX_CANDIDATE_RUNS:-${dollar}{BASELINE_MAX_RUNS:-5}}" +if ! 
[[ "$max_candidate_runs" =~ ^[0-9]+$ ]] || [ "$max_candidate_runs" -lt 1 ]; then + max_candidate_runs=1 +fi candidate_runs="$( - gh run list \ + "$GH_BIN" run list \ --repo "$repo" \ --workflow "$workflow" \ --branch "$branch" \ --event push \ --status success \ --json databaseId,headSha \ - --limit 20 \ + --limit "$max_candidate_runs" \ --jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]' )" -candidate_runs="$candidate_runs -$BASELINE_SEED_RUN_IDS" +candidate_runs="$seed_run_ids +$candidate_runs" max_runs="${dollar}{BASELINE_MAX_RUNS:-5}" if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then max_runs=1 fi +download_timeout_seconds="${dollar}{BASELINE_DOWNLOAD_TIMEOUT_SECONDS:-120}" +if ! [[ "$download_timeout_seconds" =~ ^[0-9]+$ ]] || [ "$download_timeout_seconds" -lt 1 ]; then + download_timeout_seconds=120 +fi + +write_baseline_observation_counts() { + local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt" + local counts_file="$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" + find "$BASELINE_OUTPUT_DIR" \ + -mindepth 2 \ + -maxdepth 2 \ + -name measurements.json \ + -type f \ + -print \ + | sort >"$measurement_index" || true + + if [ -s "$measurement_index" ]; then + xargs -r jq -s \ + --slurpfile required "$required_observations_file" \ + ' + ([.[] | (.observations // [])[]? | select(.value | type == "number") | .id] | sort | group_by(.) | map({id: .[0], sources: length})) as $counts + | ($required[0] // []) as $requiredRows + | { + counts: $counts, + required: ( + $requiredRows + | map(. as $requiredRow | ($counts | map(select(.id == $requiredRow.id)) | .[0].sources // 0) as $actual | $requiredRow + {sources:$actual, satisfied:($actual >= $requiredRow.minSources)}) + ) + } + ' <"$measurement_index" >"$counts_file" + else + jq -n --slurpfile required "$required_observations_file" \ + '{counts: [], required: (($required[0] // []) | map(. 
+ {sources:0, satisfied:false}))}' >"$counts_file" + fi +} + +baseline_requirements_satisfied() { + if [ "$required_observation_count" -eq 0 ]; then + return 1 + fi + write_baseline_observation_counts + jq -e '.required | all(.satisfied == true)' "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" >/dev/null +} run_id="" artifact_name="" @@ -688,16 +1334,23 @@ for candidate_run in $candidate_runs; do if grep -qxF "$candidate_run" "$seen_runs_file"; then continue fi - printf '%s\n' "$candidate_run" >>"$seen_runs_file" - if [ "$(wc -l <"$downloaded_runs_file" | tr -d ' ')" -ge "$max_runs" ]; then + downloaded_count="$(wc -l <"$downloaded_runs_file" | tr -d ' ')" + if [ "$downloaded_count" -ge "$max_runs" ]; then + if baseline_requirements_satisfied; then + break + fi + echo "::notice::downloaded $downloaded_count baseline artifact(s), but required observation counts are not satisfied yet; continuing through bounded candidate history" + fi + if [ "$(wc -l <"$seen_runs_file" | tr -d ' ')" -ge "$max_candidate_runs" ]; then break fi + printf '%s\n' "$candidate_run" >>"$seen_runs_file" artifact_json="$( - gh api "repos/$repo/actions/runs/$candidate_run/artifacts" \ - --jq '.artifacts + "$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \ + | jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts | map(select(.expired == false)) - | map(select(.name == env.BASELINE_ARTIFACT_NAME or (.name | startswith(env.BASELINE_ARTIFACT_NAME + "-")))) + | map(select(.name == $artifactName or (.name | startswith($artifactName + "-")))) | sort_by(.created_at // "") | reverse | .[0] // empty' @@ -708,7 +1361,7 @@ for candidate_run in $candidate_runs; do current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')" current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run" mkdir -p "$current_output_dir" - if gh run download "$candidate_run" \ + if timeout "$download_timeout_seconds" "$GH_BIN" run download "$candidate_run" \ --repo "$repo" \ --name 
"$current_artifact_name" \ --dir "$current_output_dir"; then @@ -725,11 +1378,19 @@ for candidate_run in $candidate_runs; do '{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \ >>"$downloaded_runs_file" else - echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run" + status="$?" + rm -rf "$current_output_dir" + if [ "$status" -eq 124 ]; then + echo "::notice::timed out after ${dollar}{download_timeout_seconds}s downloading baseline artifact $current_artifact_name from run $candidate_run; skipping candidate" + else + echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run (exit $status)" + fi fi fi done +write_baseline_observation_counts + if [ -z "$run_id" ] || [ -z "$artifact_name" ]; then echo "::notice::no successful baseline run found for $repo workflow=$workflow branch=$branch" exit 0 @@ -737,6 +1398,8 @@ fi jq -n \ --slurpfile runs "$downloaded_runs_file" \ + --slurpfile seedRuns "$seed_runs_file" \ + --slurpfile observationCounts "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" \ --argjson schemaVersion 1 \ --arg repository "$repo" \ --arg workflow "$workflow" \ @@ -753,7 +1416,9 @@ jq -n \ runId: $runId, artifactName: $artifactName, artifactId: $artifactId, - runs: $runs + seedRuns: ($seedRuns[0] // []), + runs: $runs, + observationCounts: ($observationCounts[0] // null) }' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json" echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR" @@ -762,8 +1427,9 @@ echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact export const devenvPerfArtifactStep = ( opts?: Pick, -) => - ({ +) => { + const artifactDir = opts?.artifactDir ?? 
'tmp/devenv-perf-ci' + return { name: 'Upload devenv perf artifacts', if: 'always()', uses: 'actions/upload-artifact@v4', @@ -771,11 +1437,12 @@ export const devenvPerfArtifactStep = ( name: opts?.artifactName ?? 'devenv-perf-${{ github.job }}-${{ github.run_id }}-attempt-${{ github.run_attempt }}', - path: opts?.artifactDir ?? 'tmp/devenv-perf-ci', + path: [artifactDir, `!${artifactDir}/baseline/**`].join('\n'), 'if-no-files-found': 'error', 'retention-days': opts?.retentionDays ?? 30, }, - }) as const + } as const +} export const ciMeasurementsArtifactStep = (opts: CiMeasurementsArtifactStepOptions) => ({ @@ -784,7 +1451,7 @@ export const ciMeasurementsArtifactStep = (opts: CiMeasurementsArtifactStepOptio uses: 'actions/upload-artifact@v4', with: { name: opts.artifactName, - path: opts.path, + path: [opts.path, `!${opts.path}/baseline/**`].join('\n'), 'if-no-files-found': 'error', 'retention-days': opts.retentionDays ?? 30, }, @@ -801,6 +1468,10 @@ export const nixClosureMeasurementStep = (opts: NixClosureMeasurementStepOptions const targetLabel = opts.targetLabel ?? targetName const targetGroup = opts.targetGroup ?? 'nix closure' const buckets = JSON.stringify(opts.buckets ?? []) + const targetPath = JSON.stringify(opts.targetPath ?? []) + const gatePolicy = JSON.stringify(opts.gate ?? {}) + const targetDescription = + opts.targetDescription ?? 'Resolved Nix closure for the configured flake installable.' const targetSystemAssignment = opts.targetSystem === undefined ? 
`target_system="${dollar}{DEVENV_SYSTEM:-${dollar}{RUNNER_OS:-unknown}}"` @@ -821,10 +1492,11 @@ target_id=${shellSingleQuote(targetId)} target_name=${shellSingleQuote(targetName)} target_label=${shellSingleQuote(targetLabel)} target_group=${shellSingleQuote(targetGroup)} +target_description=${shellSingleQuote(targetDescription)} artifact_file=${artifactFileAssignment} ${targetSystemAssignment} -out_path="$(nix build --no-link --print-out-paths "$installable")" +out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")" path_info="$ARTIFACT_DIR/nix-closure-path-info.json" paths_file="$ARTIFACT_DIR/nix-closure-paths.json" @@ -837,8 +1509,8 @@ jq -n \ --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ --arg repository "${dollar}{GITHUB_REPOSITORY:-unknown}" \ --arg branchKind "${dollar}{GITHUB_EVENT_NAME:-unknown}" \ - --arg ref "${dollar}{GITHUB_REF:-unknown}" \ - --arg headSha "${dollar}{GITHUB_SHA:-unknown}" \ + --arg ref "${dollar}{CI_MEASUREMENT_SUBJECT_REF:-${dollar}{GITHUB_REF:-unknown}}" \ + --arg headSha "${dollar}{CI_MEASUREMENT_SUBJECT_SHA:-${dollar}{GITHUB_SHA:-unknown}}" \ --arg baseSha "${dollar}{GITHUB_BASE_SHA:-}" \ --arg runnerName "${dollar}{RUNNER_NAME:-unknown}" \ --arg runnerOs "${dollar}{RUNNER_OS:-unknown}" \ @@ -854,9 +1526,12 @@ jq -n \ --arg targetId "$target_id" \ --arg targetLabel "$target_label" \ --arg targetGroup "$target_group" \ + --arg targetDescription "$target_description" \ --arg targetSystem "$target_system" \ --arg outPath "$out_path" \ --argjson buckets ${shellSingleQuote(buckets)} \ + --argjson targetPath ${shellSingleQuote(targetPath)} \ + --argjson gatePolicy ${shellSingleQuote(gatePolicy)} \ ' ($paths[0] // []) as $closurePaths | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize @@ -868,12 +1543,16 @@ jq -n \ id: "nix.closure.bucket.nar_size", label: (($bucket.label // $bucket.name) + " closure size"), group: "nix closure buckets", + path: ($targetPath + ["buckets", $bucket.name]), 
+ description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex), + measurementKind: "deterministic", unit: "bytes", value: ( $closurePaths | map(select(.path | test($bucket.pathRegex)) | .narSize) | add // 0 ), + policy: $gatePolicy, dimensions: { bucket: $bucket.name } } )) as $bucketObservations @@ -899,24 +1578,32 @@ jq -n \ traceId: $traceId, runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass } }, - target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, system: $targetSystem }, + target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem }, observations: ([ { id: "nix.closure.nar_size", label: "Total closure size", group: "nix closure", + path: ($targetPath + ["total", "nar-size"]), + description: ("Total NAR size for all paths in " + $targetDescription), name: "nix.closure.nar_size", + measurementKind: "deterministic", unit: "bytes", value: $totalNarSize, + policy: $gatePolicy, dimensions: { bucket: "total" } }, { id: "nix.closure.path_count", label: "Total closure path count", group: "nix closure", + path: ($targetPath + ["total", "path-count"]), + description: ("Number of store paths in " + $targetDescription), name: "nix.closure.path_count", + measurementKind: "deterministic", unit: "count", value: $pathCount, + policy: $gatePolicy, dimensions: { bucket: "total" } } ] + $bucketObservations), @@ -936,6 +1623,281 @@ cat "$artifact_file" } as const } +export const nixClosureMeasurementSteps = (opts: NixClosureMeasurementsStepsOptions) => { + const artifactDir = opts.artifactDir ?? 'tmp/nix-closure-measurements' + const baselineArtifactName = opts.baselineArtifactName ?? opts.artifactName + const buckets = opts.buckets ?? defaultNixClosureMeasurementBuckets + const compare = opts.compare ?? true + + return [ + ...(compare + ? 
/**
 * Builds the ordered list of steps that measure Nix closures.
 *
 * The list is: an optional baseline download, one measurement step per entry
 * of `opts.targets`, an optional comparison against the baseline, and finally
 * the artifact upload. The download and comparison steps are included unless
 * `opts.compare` is explicitly `false`.
 */
export const nixClosureMeasurementSteps = (opts: NixClosureMeasurementsStepsOptions) => {
  const artifactDir = opts.artifactDir ?? 'tmp/nix-closure-measurements'
  const baselineDir = `${artifactDir}/baseline`
  const currentDir = `${artifactDir}/current`
  const baselineArtifactName = opts.baselineArtifactName ?? opts.artifactName
  // Buckets used by every target that does not bring its own.
  const sharedBuckets = opts.buckets ?? defaultNixClosureMeasurementBuckets
  const compareEnabled = opts.compare ?? true

  const baselineSteps = compareEnabled
    ? [
        downloadPreviousGitHubArtifactStep({
          artifactName: baselineArtifactName,
          outputDir: baselineDir,
          seedRuns: opts.baselineSeedRuns,
          seedRunIds: opts.baselineSeedRunIds,
          maxRuns: opts.baselineMaxRuns,
          maxCandidateRuns: opts.baselineMaxCandidateRuns,
        }),
      ]
    : []

  // Each target writes into its own directory below `current/`.
  const measurementSteps = opts.targets.map((target) =>
    nixClosureMeasurementStep({
      installable: target.installable,
      targetId: target.id,
      targetName: target.name ?? target.id,
      targetLabel: target.label,
      targetGroup: target.group,
      targetPath: target.path,
      targetDescription: target.description,
      targetSystem: target.system,
      artifactDir: `${currentDir}/${target.id}`,
      buckets: target.buckets ?? sharedBuckets,
      gate: target.gate,
    }),
  )

  const comparisonSteps = compareEnabled
    ? [
        compareCiMeasurementsStep({
          currentDir,
          baselineDir,
          outputFile: `${artifactDir}/measurement-comparison.json`,
          regressionMode: opts.regressionMode ?? 'warn',
          prComment: opts.prComment,
        }),
      ]
    : []

  const uploadStep = ciMeasurementsArtifactStep({
    artifactName: opts.artifactName,
    path: artifactDir,
    retentionDays: opts.retentionDays,
  })

  return [...baselineSteps, ...measurementSteps, ...comparisonSteps, uploadStep] as const
}

/**
 * Wraps `nixClosureMeasurementSteps` in a complete job definition.
 *
 * `if`, `timeout-minutes` and `permissions` are only emitted when the matching
 * option is set. Setup defaults to checkout, Nix install and Nix store
 * validation, and `opts.env` is layered over the standard CI environment.
 */
export const nixClosureMeasurementsJob = (opts: NixClosureMeasurementsJobOptions) => {
  return {
    ...(opts.ifExpr !== undefined ? { if: opts.ifExpr } : {}),
    'runs-on': opts.runsOn ?? linuxX64Runner,
    ...(opts.timeoutMinutes !== undefined ? { 'timeout-minutes': opts.timeoutMinutes } : {}),
    ...(opts.permissions !== undefined ? { permissions: opts.permissions } : {}),
    defaults: bashShellDefaults,
    env: {
      ...standardCIEnv,
      ...opts.env,
    },
    steps: [
      ...(opts.setupSteps ?? [checkoutStep(), installNixStep(), validateNixStoreStep]),
      ...nixClosureMeasurementSteps(opts),
    ],
  } as const
}
/**
 * Builds a workflow step that measures the "shape" of the tracked source tree.
 *
 * The generated bash step runs an inline Node.js script that lists files with
 * `git ls-files`, filters them per scope (include paths, exclude paths,
 * extensions) and writes a measurements document to the artifact file. Every
 * scope yields two observations: `source.lines` and `source.files`.
 *
 * Files that contain a NUL byte are treated as binary and skipped. Tracked
 * paths that cannot be read as a regular file (directories such as submodule
 * gitlinks, or paths missing from the working tree) are skipped the same way
 * instead of aborting the whole measurement.
 *
 * @param opts Step options. `scopes` is required; `artifactDir` defaults to
 *   `tmp/ci-measurements`, `artifactFile` to `$ARTIFACT_DIR/measurements.json`,
 *   `targetName` and `targetGroup` to `source shape`, `targetId` and
 *   `targetLabel` to the target name, and `targetPath` to `['source']`.
 * @returns The step definition (`name`, `shell`, `env`, `run`).
 */
export const sourceShapeMeasurementStep = (opts: SourceShapeMeasurementStepOptions) => {
  const artifactDir = opts.artifactDir ?? 'tmp/ci-measurements'
  const artifactFileAssignment =
    opts.artifactFile === undefined
      ? '"$ARTIFACT_DIR/measurements.json"'
      : shellSingleQuote(opts.artifactFile)
  const targetName = opts.targetName ?? 'source shape'
  const targetId = opts.targetId ?? targetName
  const targetLabel = opts.targetLabel ?? targetName
  const targetGroup = opts.targetGroup ?? 'source shape'
  const targetPath = JSON.stringify(opts.targetPath ?? ['source'])
  const scopes = JSON.stringify(opts.scopes)
  const targetSystemAssignment =
    opts.targetSystem === undefined
      ? `target_system="${dollar}{DEVENV_SYSTEM:-${dollar}{RUNNER_OS:-unknown}}"`
      : `target_system=${shellSingleQuote(opts.targetSystem)}`

  return {
    name: `Measure source shape: ${targetName}`,
    shell: 'bash',
    env: {
      ARTIFACT_DIR: artifactDir,
      RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}',
    },
    run: String.raw`set -euo pipefail

${ciMeasurementToolBootstrapScript}
require_ci_measurement_tool node nodejs

mkdir -p "$ARTIFACT_DIR"
target_id=${shellSingleQuote(targetId)}
target_name=${shellSingleQuote(targetName)}
target_label=${shellSingleQuote(targetLabel)}
target_group=${shellSingleQuote(targetGroup)}
artifact_file=${artifactFileAssignment}
${targetSystemAssignment}

SCOPES_JSON=${shellSingleQuote(scopes)} \
TARGET_PATH_JSON=${shellSingleQuote(targetPath)} \
TARGET_ID="$target_id" \
TARGET_NAME="$target_name" \
TARGET_LABEL="$target_label" \
TARGET_GROUP="$target_group" \
TARGET_SYSTEM="$target_system" \
node <<'NODE' >"$artifact_file"
const cp = require('node:child_process')
const fs = require('node:fs')
const path = require('node:path')

// Canonical repo-relative form: forward slashes, no leading "./", no trailing
// "/"; "." (the repository root) becomes the empty string.
const normalize = (value) => {
  const normalized = value.replace(/\\/g, '/').replace(/^\.\//, '').replace(/\/+$/, '')
  return normalized === '.' ? '' : normalized
}
const scopes = JSON.parse(process.env.SCOPES_JSON || '[]')
const targetPath = JSON.parse(process.env.TARGET_PATH_JSON || '["source"]')
// The default maxBuffer of the sync child_process APIs is 1 MiB, which the
// file list of a large repository exceeds (ENOBUFS); raise it explicitly.
const gitFiles = cp
  .execFileSync('git', ['ls-files', '-z'], { encoding: 'buffer', maxBuffer: 256 * 1024 * 1024 })
  .toString('utf8')
  .split('\0')
  .filter(Boolean)
  .map(normalize)

// No candidates means "everything"; the empty candidate is the repo root.
const includesPath = (file, candidates) => {
  if (!Array.isArray(candidates) || candidates.length === 0) return true
  return candidates.map(normalize).some((candidate) => candidate === '' || file === candidate || file.startsWith(candidate + '/'))
}

// The empty candidate never excludes, so the repo root cannot exclude itself.
const excludesPath = (file, candidates) =>
  Array.isArray(candidates) &&
  candidates.map(normalize).some((candidate) => candidate !== '' && (file === candidate || file.startsWith(candidate + '/')))

const matchesExtension = (file, extensions) => {
  if (!Array.isArray(extensions) || extensions.length === 0) return true
  const ext = path.extname(file).toLowerCase()
  return extensions.map((extension) => extension.toLowerCase()).some((extension) => ext === extension)
}

// Returns the number of lines in the file, or undefined when the file should
// not be measured: binary content (contains a NUL byte), a tracked path that
// is a directory (for example a submodule gitlink), or a path missing from
// the working tree.
const countLines = (file) => {
  let buffer
  try {
    buffer = fs.readFileSync(file)
  } catch (error) {
    if (error && (error.code === 'EISDIR' || error.code === 'ENOENT')) return undefined
    throw error
  }
  if (buffer.includes(0)) return undefined
  if (buffer.length === 0) return 0
  let lines = 0
  for (const byte of buffer) {
    if (byte === 10) lines += 1
  }
  // A final line without a trailing newline still counts as a line.
  return buffer[buffer.length - 1] === 10 ? lines : lines + 1
}

const observations = []
const scopeSummaries = []

for (const scope of scopes) {
  const root = normalize(scope.root || '.')
  const includePaths = Array.isArray(scope.includePaths) && scope.includePaths.length > 0 ? scope.includePaths : [root]
  const files = gitFiles
    .filter((file) => includesPath(file, includePaths))
    .filter((file) => !excludesPath(file, scope.excludePaths))
    .filter((file) => matchesExtension(file, scope.includeExtensions))

  let lineCount = 0
  let measuredFileCount = 0
  for (const file of files) {
    const lines = countLines(file)
    if (lines === undefined) continue
    lineCount += lines
    measuredFileCount += 1
  }

  const group = scope.group || 'source shape'
  const scopePath = Array.isArray(scope.path) ? scope.path : ['source', scope.id]
  const policy = scope.gate || { enabled: false, minBaselineSources: 3, minCurrentSamples: 1 }
  observations.push(
    {
      id: 'source.lines',
      label: scope.label + ' lines',
      group,
      path: scopePath,
      description: 'Tracked non-binary source lines in the configured scope.',
      measurementKind: 'deterministic',
      name: 'source.lines',
      unit: 'lines',
      value: lineCount,
      dimensions: { scope: scope.id },
      policy,
      statistics: { sampleCount: 1, measuredSampleCount: measuredFileCount },
    },
    {
      id: 'source.files',
      label: scope.label + ' files',
      group,
      path: scopePath,
      description: 'Tracked non-binary source files in the configured scope.',
      measurementKind: 'deterministic',
      name: 'source.files',
      unit: 'count',
      value: measuredFileCount,
      dimensions: { scope: scope.id },
      policy,
      statistics: { sampleCount: 1, measuredSampleCount: measuredFileCount },
    },
  )
  scopeSummaries.push({
    id: scope.id,
    label: scope.label,
    root,
    includePaths,
    excludePaths: scope.excludePaths || [],
    includeExtensions: scope.includeExtensions || [],
    fileCount: measuredFileCount,
    lineCount,
  })
}

const artifact = {
  schemaVersion: 1,
  generatedAt: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z'),
  producer: {
    name: 'effect-utils-ci-measurement',
    version: 1,
    measurementProtocol: 'source-shape-v1',
  },
  subject: {
    repo: process.env.GITHUB_REPOSITORY || 'unknown',
    branchKind: process.env.GITHUB_EVENT_NAME || 'unknown',
    ref: process.env.CI_MEASUREMENT_SUBJECT_REF || process.env.GITHUB_REF || 'unknown',
    headSha: process.env.CI_MEASUREMENT_SUBJECT_SHA || process.env.GITHUB_SHA || 'unknown',
    baseSha: process.env.GITHUB_BASE_SHA || '',
  },
  execution: {
    provider: process.env.GITHUB_RUN_ID && process.env.GITHUB_RUN_ID !== 'unknown' ? 'github-actions' : 'local',
    workflow: 'CI',
    job: process.env.GITHUB_JOB || 'unknown',
    runId: process.env.GITHUB_RUN_ID || 'unknown',
    runAttempt: process.env.GITHUB_RUN_ATTEMPT || 'unknown',
    taskId: process.env.CROSSTASK_TASK_ID || '',
    attemptId: process.env.CROSSTASK_ATTEMPT_ID || '',
    traceId: process.env.TRACE_ID || '',
    runner: {
      name: process.env.RUNNER_NAME || 'unknown',
      os: process.env.RUNNER_OS || 'unknown',
      arch: process.env.RUNNER_ARCH || 'unknown',
      class: process.env.RUNNER_CLASS || 'unknown',
    },
  },
  target: {
    kind: 'source-shape',
    id: process.env.TARGET_ID,
    name: process.env.TARGET_NAME,
    label: process.env.TARGET_LABEL,
    group: process.env.TARGET_GROUP,
    path: targetPath,
    system: process.env.TARGET_SYSTEM,
  },
  observations,
  details: { scopes: scopeSummaries },
}

process.stdout.write(JSON.stringify(artifact, null, 2) + '\n')
NODE

cat "$artifact_file"
`,
  } as const
}
{} - : { GH_TOKEN: opts.prComment.tokenExpression }), + : { CI_MEASUREMENT_PR_COMMENT_PUBLIC_ASSET_COMMAND: opts.prComment.publicAssetCommand }), + ...(opts?.prComment?.publicAssetEnv ?? {}), + ...(opts?.prComment?.enabled === true + ? { GH_TOKEN: opts.prComment.tokenExpression ?? '${{ github.token }}' } + : {}), }, run: String.raw`set -euo pipefail @@ -975,8 +1941,10 @@ fi current_index="$(mktemp)" baseline_index="$(mktemp)" -find "$current_dir" -path "$baseline_dir" -prune -o -name measurements.json -type f -print | sort >"$current_index" || true -find "$baseline_dir" -name measurements.json -type f -print | sort >"$baseline_index" || true +find "$current_dir" -name baseline -type d -prune -o -name measurements.json -type f -print | sort >"$current_index" || true +{ + find "$baseline_dir" -name baseline -type d ! -path "$baseline_dir" -prune -o -name measurements.json -type f -print +} | sort -u >"$baseline_index" || true if [ ! -s "$current_index" ]; then echo "::error::no current measurements.json files found under $current_dir" @@ -1003,7 +1971,7 @@ jq -n \ def identity_dimensions: (.dimensions // {}) | to_entries - | map(select(.key as $key | ["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount"] | index($key) | not)) + | map(select(.key as $key | ["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount", "measuredSampleCount"] | index($key) | not)) | sort_by(.key) | map("\(.key)=\(.value|tostring)") | join(","); @@ -1026,6 +1994,15 @@ jq -n \ else (($sorted[($count / 2 - 1)] + $sorted[($count / 2)]) / 2) end; + def percentile($p): + sort as $sorted + | ($sorted | length) as $count + | if $count == 0 then null + else $sorted[(($p * ($count - 1)) | floor)] + end; + + def abs_value: if . < 0 then -. else . end; + def observations_by_key($docs): reduce $docs[]? 
as $doc ({}; @@ -1042,29 +2019,48 @@ jq -n \ def observation_stats($items): ($items | map(.observation.value)) as $values - | ($items | map(.observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount + | ($items | map(.observation.comparison.baseline // empty)) as $pairedBaselineValues + | ($items | map(.observation.statistics.pairedDeltaMedian // empty)) as $pairedDeltaMedianValues + | ($items | map(.observation.statistics.pairedDeltaP25 // empty)) as $pairedDeltaP25Values + | ($items | map(.observation.statistics.pairedDeltaP75 // empty)) as $pairedDeltaP75Values + | ($items | map(.observation.statistics.pairedDeltaMad // empty)) as $pairedDeltaMadValues + | ($items | map(.observation.statistics.pairedDeltaSamples // []) | add // []) as $pairedDeltaSampleValues + | ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount + | ($values | median) as $median | { target: ($items[0].target // {}), observation: ($items[-1].observation // {}), - value: ($values | median), + measurementKind: ($items[-1].observation.measurementKind // null), + value: $median, min: ($values | min), max: ($values | max), + p25: ($values | percentile(0.25)), + p75: ($values | percentile(0.75)), + p95: ($values | percentile(0.95)), + mad: ($values | map(. - $median | if . < 0 then -. else . 
end) | median), sourceCount: ($items | length), sampleCount: $sampleCount, + pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0), + pairedBaselineValue: (if ($pairedBaselineValues | length) == 0 then null else ($pairedBaselineValues | median) end), + pairedDeltaMedianValue: (if ($pairedDeltaMedianValues | length) == 0 then null else ($pairedDeltaMedianValues | median) end), + pairedDeltaP25Value: (if ($pairedDeltaP25Values | length) == 0 then null else ($pairedDeltaP25Values | median) end), + pairedDeltaP75Value: (if ($pairedDeltaP75Values | length) == 0 then null else ($pairedDeltaP75Values | median) end), + pairedDeltaMadValue: (if ($pairedDeltaMadValues | length) == 0 then null else ($pairedDeltaMadValues | median) end), + pairedDeltaSampleValues: $pairedDeltaSampleValues, generatedAt: ($items[-1].generatedAt // null) }; def budget($metric; $unit): if $metric == "nix.closure.nar_size" then - {warnRatio:1.10, failRatio:1.25, warnAbs:52428800, failAbs:209715200} + {warnRatio:1.05, failRatio:1.10, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10485760} elif $metric == "nix.closure.bucket.nar_size" then - {warnRatio:1.15, failRatio:1.35, warnAbs:52428800, failAbs:209715200} + {warnRatio:1.10, failRatio:1.20, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.05, statisticalToleranceAbs:10485760} elif $metric == "nix.closure.path_count" then - {warnRatio:1.10, failRatio:1.25, warnAbs:100, failAbs:500} + {warnRatio:1.05, failRatio:1.10, warnAbs:100, failAbs:500, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10} elif $unit == "seconds" then - {warnRatio:1.25, failRatio:1.50, warnAbs:1.5, failAbs:3.0} + {warnRatio:1.10, failRatio:1.20, warnAbs:0.25, failAbs:1, statisticalToleranceRatio:0.10, statisticalToleranceAbs:0.25} else - {warnRatio:1.25, failRatio:1.50, warnAbs:1, failAbs:3} + {warnRatio:1.25, failRatio:1.50, 
warnAbs:1, failAbs:3, statisticalToleranceRatio:0.10, statisticalToleranceAbs:1} end; def noise_floor($metric; $unit): @@ -1073,13 +2069,68 @@ jq -n \ elif $unit == "seconds" then 0.1 else 0 end; - def abs_value: if . < 0 then -. else . end; - - def classify($metric; $unit; $current; $baseline; $baselineMin; $baselineMax; $currentSamples; $baselineSources): + def default_policy($metric; $unit): budget($metric; $unit) as $b | noise_floor($metric; $unit) as $noise + | $b + { + enabled:true, + comparisonMode:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then "budget" else "historical" end), + minBaselineSources:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then 1 else 10 end), + minCurrentSamples:(if $unit == "seconds" then 3 else 1 end), + minPairedSamples:(if $unit == "seconds" then 5 else 0 end), + noiseFloor:$noise + }; + def observation_policy($obs): + default_policy($obs.name // "unknown"; $obs.unit // "unknown") + ($obs.policy // {}); + def policy_enabled($policy): + if ($policy | has("enabled")) then $policy.enabled else true end; + + def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad; $pairedDeltaValues): + $policy as $b + | ($policy.comparisonMode // (if $measurementKind == "deterministic" or $unit != "seconds" then "budget" elif $measurementKind == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode + | ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise | ($current - $baseline) as $delta + | (if $comparisonMode == "paired" and $pairedDeltaMedian != null then 
$pairedDeltaMedian else $delta end) as $evidenceDelta + | (($policy.pairedEvidenceQuantile // 0.25) | tonumber) as $pairedEvidenceQuantile | (if $baseline > 0 then ($current / $baseline) else null end) as $ratio + | (($baselineP75 // $baseline) - ($baselineP25 // $baseline)) as $iqr + | (($currentP75 // $current) - ($currentP25 // $current)) as $currentIqr + | (($pairedDeltaP75 // $evidenceDelta) - ($pairedDeltaP25 // $evidenceDelta)) as $pairedDeltaIqr + | ([ + $noise, + (($policy.statisticalToleranceAbs // 0) | tonumber), + (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end), + (($baselineMad // 0) * 3), + (($iqr // 0) * 1.5) + ] | max) as $robustTolerance + | (if $currentSamples > 1 then ([ + $noise, + (($policy.statisticalToleranceAbs // 0) | tonumber), + (if $current > 0 then ($current * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end), + (($currentMad // 0) * 3), + (($currentIqr // 0) * 1.5) + ] | max) else 0 end) as $currentRobustTolerance + | ([ + $noise, + (($policy.statisticalToleranceAbs // 0) | tonumber), + (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end), + (($pairedDeltaMad // 0) * 3), + (($pairedDeltaIqr // 0) * 1.5) + ] | max) as $pairedDeltaTolerance + | ($baseline + $robustTolerance) as $robustUpper + | ($baseline - $robustTolerance) as $robustLower + | ($current + $currentRobustTolerance) as $currentRobustUpper + | ($current - $currentRobustTolerance) as $currentRobustLower + | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile($pairedEvidenceQuantile)) else ($evidenceDelta - $pairedDeltaTolerance) end) as $evidenceDeltaLower + | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile(1 - $pairedEvidenceQuantile)) else ($evidenceDelta + $pairedDeltaTolerance) end) as $evidenceDeltaUpper + | ([($b.warnAbs // 0), (if 
$baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget + | ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget + | ($comparisonMode != "paired") as $needsHistoricalBaselineCount + | ( + ($current >= $robustLower and $current <= $robustUpper) + or ($currentRobustTolerance > 0 and $currentRobustLower <= $robustUpper and $currentRobustUpper >= $robustLower) + ) as $withinRobustBand + | ($comparisonMode == "historical" and $measurementKind != "deterministic") as $canUseRobustBandSuppression | ( $baselineMin != null and $baselineMax != null @@ -1088,35 +2139,90 @@ jq -n \ ) as $withinBaselineRange | ( if $baseline <= 0 then "unknown" + elif $comparisonMode == "paired" and $evidenceDeltaLower > $failBudget then "fail" + elif $comparisonMode == "paired" and $evidenceDeltaLower > $warnBudget then "warn" + elif $comparisonMode == "paired" then "pass" elif ($delta > $b.failAbs and $current > ($baseline * $b.failRatio)) then "fail" elif ($delta > $b.warnAbs and $current > ($baseline * $b.warnRatio)) then "warn" else "pass" end ) as $thresholdStatus + | ( + policy_enabled($policy) == true + and $baseline > 0 + and (if $needsHistoricalBaselineCount then $baselineSources >= ($policy.minBaselineSources // 1) else true end) + and $currentSamples >= ($policy.minCurrentSamples // 1) + and (if $comparisonMode == "paired" then $pairedSamples >= ($policy.minPairedSamples // 1) else true end) + and (if $comparisonMode == "paired" then $pairedDeltaMedian != null else true end) + ) as $gateable + | ( + if (policy_enabled($policy) != true) then "disabled" + elif $baseline <= 0 then "missing_baseline" + elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count" + elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count" + elif $comparisonMode == "paired" and 
$pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count" + elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta" + else "eligible" + end + ) as $gateReason | ( if $baseline <= 0 then "unknown" + elif (policy_enabled($policy) != true) then "diagnostic" elif ($delta | abs_value) <= $noise then "noise_floor" - elif ($withinBaselineRange and $thresholdStatus == "pass") then "within_baseline_range" - elif ($baselineSources < 3 or $currentSamples < 3) then "low_sample_count" + elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count" + elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count" + elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count" + elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta" + elif $comparisonMode == "paired" and $thresholdStatus == "pass" and $evidenceDelta > $warnBudget then "paired_uncertain" + elif ($canUseRobustBandSuppression and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band" elif $thresholdStatus == "pass" then "within_budget" else "threshold_exceeded" end ) as $confidence | ( - if $confidence == "threshold_exceeded" then $thresholdStatus + if ($gateable and $confidence == "threshold_exceeded") then $thresholdStatus elif $thresholdStatus == "unknown" then "unknown" else "pass" end ) as $status | ( if $baseline <= 0 then "unknown" + elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then "unchanged" + elif $comparisonMode == "paired" and $evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0 then "unchanged" + elif $comparisonMode == "paired" and $evidenceDelta < 0 then "improved" + elif $comparisonMode == "paired" then "regressed" elif ($delta | abs_value) <= $noise then "unchanged" - elif ($withinBaselineRange and $thresholdStatus == 
"pass") then "unchanged" + elif $canUseRobustBandSuppression and $withinRobustBand then "unchanged" elif $delta < 0 then "improved" else "regressed" + end + ) as $direction + | ( + if $baseline <= 0 then null + elif (policy_enabled($policy) != true) then null + elif $comparisonMode == "paired" and ($evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0) then 0 + elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then 0 + elif $comparisonMode == "paired" and $evidenceDelta > 0 then ([0, $evidenceDeltaLower] | max) / $warnBudget + elif $comparisonMode == "paired" then -(([0, (-$evidenceDeltaUpper)] | max) / $warnBudget) + elif $canUseRobustBandSuppression and $withinRobustBand then 0 + elif ($delta | abs_value) <= $noise then 0 + elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget + elif ($confidence == "threshold_exceeded" and $delta < 0) then -(([0, ($robustLower - $currentRobustUpper), (-$delta)] | max) / $warnBudget) + elif $delta > 0 then ([0, ($currentRobustLower - $robustUpper)] | max) / $warnBudget + else -(([0, ($robustLower - $currentRobustUpper)] | max) / $warnBudget) + end + ) as $semanticImpactScore + | ( + if (policy_enabled($policy) != true) then "diagnostic" + elif $semanticImpactScore == null then "unknown" + elif $semanticImpactScore == 0 then "neutral" + elif $semanticImpactScore >= ($failBudget / $warnBudget) then "fail_boundary" + elif $semanticImpactScore >= 1 then "warn_boundary" + elif $semanticImpactScore > 0 then "below_warn_boundary" + else "improvement" end - ) as $direction - | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,confidence:$confidence,direction:$direction}; + ) as $semanticImpactKind + | 
{status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples,evidenceDelta:$evidenceDelta,evidenceDeltaLower:$evidenceDeltaLower,evidenceDeltaUpper:$evidenceDeltaUpper,evidenceDeltaTolerance:$pairedDeltaTolerance,pairedEvidenceQuantile:$pairedEvidenceQuantile,pairedEvidenceProtocol:(if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then "paired-delta-quantile-v1" elif $comparisonMode == "paired" then "paired-summary-robust-band-v1" else null end)}; (observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs | (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs @@ -1127,38 +2233,74 @@ jq -n \ .key as $key | .value as $currentValue | ($baselineObs[$key] // null) as $baselineValue + | ($currentValue.observation | observation_policy(.)) as $policy + | ($policy.comparisonMode // (if ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "deterministic" or ($currentValue.observation.unit // "") != "seconds" then "budget" elif ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode + | ($currentValue.pairedBaselineValue // null) as $pairedBaselineValue + | (if $comparisonMode == "paired" and $pairedBaselineValue != null then { + value: 
$pairedBaselineValue, + min: $pairedBaselineValue, + max: $pairedBaselineValue, + p25: $pairedBaselineValue, + p75: $pairedBaselineValue, + p95: $pairedBaselineValue, + mad: 0, + sourceCount: $currentValue.pairedSampleCount + } else $baselineValue end) as $effectiveBaselineValue | { key: $key, value: ( - if $baselineValue == null then + if $effectiveBaselineValue == null then { status: "missing_baseline", target: $currentValue.target, observation: $currentValue.observation, - current: $currentValue.value, - currentSamples: $currentValue.sampleCount, - baselineSources: 0, - confidence: "missing_baseline", - direction: "unknown" - } - else - classify( - $currentValue.observation.name; - $currentValue.observation.unit; - $currentValue.value; - $baselineValue.value; - $baselineValue.min; - $baselineValue.max; - $currentValue.sampleCount; - $baselineValue.sourceCount - ) + { + current: $currentValue.value, + currentSamples: $currentValue.sampleCount, + baselineSources: 0, + gatePolicy: $policy, + comparisonMode: $comparisonMode, + gateable: false, + gateReason: "missing_baseline", + confidence: "missing_baseline", + direction: "unknown" + } + else + classify( + $currentValue.observation.name; + $currentValue.observation.unit; + ($currentValue.observation.measurementKind // $currentValue.measurementKind); + $policy; + $currentValue.value; + $currentValue.p25; + $currentValue.p75; + $currentValue.mad; + $effectiveBaselineValue.value; + $effectiveBaselineValue.min; + $effectiveBaselineValue.max; + $effectiveBaselineValue.p25; + $effectiveBaselineValue.p75; + $effectiveBaselineValue.p95; + $effectiveBaselineValue.mad; + $currentValue.sampleCount; + $effectiveBaselineValue.sourceCount; + $currentValue.pairedSampleCount; + $currentValue.pairedDeltaMedianValue; + $currentValue.pairedDeltaP25Value; + $currentValue.pairedDeltaP75Value; + $currentValue.pairedDeltaMadValue; + ($currentValue.pairedDeltaSampleValues // []) + ) + { target: $currentValue.target, observation: 
$currentValue.observation, - currentSamples: $currentValue.sampleCount, - baselineSources: $baselineValue.sourceCount, - baselineMin: $baselineValue.min, - baselineMax: $baselineValue.max - } + currentSamples: $currentValue.sampleCount, + baselineSources: $effectiveBaselineValue.sourceCount, + baselineMin: $effectiveBaselineValue.min, + baselineMax: $effectiveBaselineValue.max, + baselineP25: $effectiveBaselineValue.p25, + baselineP75: $effectiveBaselineValue.p75, + baselineP95: $effectiveBaselineValue.p95 + ,baselineMad: $effectiveBaselineValue.mad + } end ) } @@ -1168,14 +2310,38 @@ jq -n \ | ( if any($comparisons[]?; .status == "fail") then "fail" elif any($comparisons[]?; .status == "warn") then "warn" - elif any($comparisons[]?; .status == "missing_baseline") then "partial" + elif any($comparisons[]?; + (if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end) + and (.gateReason == "missing_baseline" + or .gateReason == "low_baseline_count" + or .gateReason == "low_current_sample_count" + or .gateReason == "low_paired_sample_count" + or .gateReason == "missing_paired_delta") + ) then "partial" else "pass" end ) as $status + | ( + [$comparisons[]?] + | { + enabledCount: (map(select((if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end))) | length), + gateableCount: (map(select(.gateable == true)) | length), + missingBaselineCount: (map(select(.gateReason == "missing_baseline")) | length), + lowBaselineCount: (map(select(.gateReason == "low_baseline_count")) | length), + lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length), + lowPairedSampleCount: (map(select(.gateReason == "low_paired_sample_count")) | length), + missingPairedDeltaCount: (map(select(.gateReason == "missing_paired_delta")) | length) + } + | . 
+ { + nonGateableCount: (.enabledCount - .gateableCount), + enforceable: (.enabledCount == .gateableCount) + } + ) as $readiness | { schemaVersion:$schemaVersion, status:$status, mode:$mode, + readiness:$readiness, currentDir:$currentDir, baselineDir:$baselineDir, comparisons:$comparisons @@ -1202,7 +2368,7 @@ case "$status:$mode" in echo "::warning::CI measurement regression threshold exceeded" ;; partial:*) - echo "::notice::CI measurement baseline is missing for one or more observations" + echo "::notice::CI measurement comparison is partial because one or more enabled observations are not gateable" ;; esac @@ -1210,10 +2376,10 @@ if [ -n "${dollar}{GITHUB_STEP_SUMMARY:-}" ]; then { echo "### ${dollar}{CI_MEASUREMENT_PR_COMMENT_TITLE:-CI Measurements}" echo "" - jq -r '"- Status: " + .status + "\n- Mode: " + .mode + "\n- Baseline: " + .baselineDir' "$comparison_file" + jq -r '"- Status: " + .status + "\n- Gate: " + (if .mode == "fail" then "enforced" elif .mode == "warn" then "advisory" elif .mode == "off" then "off" else (.mode // "unknown") end) + "\n- Baseline: " + .baselineDir' "$comparison_file" echo "" - echo "| Status | Target | Observation | Current | Baseline | Delta | Ratio |" - echo "| --- | --- | --- | ---: | ---: | ---: | ---: |" + echo "| Status | Gate | Target | Observation | Current | Baseline | Delta | Ratio |" + echo "| --- | --- | --- | --- | ---: | ---: | ---: | ---: |" jq -r ' .comparisons | to_entries @@ -1227,9 +2393,10 @@ if [ -n "${dollar}{GITHUB_STEP_SUMMARY:-}" ]; then | .[:20] | .[] | .value as $v - | [ - $v.status, - (($v.target.kind // "unknown") + "/" + ($v.target.name // "unknown") + "/" + ($v.target.system // "unknown")), + | [ + $v.status, + (if ($v.gateable // false) then "yes" else ($v.gateReason // "no") end), + (($v.target.kind // "unknown") + "/" + ($v.target.name // "unknown") + "/" + ($v.target.system // "unknown")), ($v.observation.name // "unknown"), (($v.current // $v.observation.value // 0) | tostring), (($v.baseline 
// "") | tostring), @@ -1241,18 +2408,47 @@ if [ -n "${dollar}{GITHUB_STEP_SUMMARY:-}" ]; then } >>"$GITHUB_STEP_SUMMARY" fi -if [ "${dollar}{CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ] && [ "${dollar}{GITHUB_EVENT_NAME:-}" = "pull_request" ]; then +${opts?.prComment?.enabled === true ? String.raw`if [ "${dollar}{CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ]; then + if [ "${dollar}{GITHUB_EVENT_NAME:-}" != "pull_request" ]; then + echo "::notice::CI measurement PR comments are produced only by pull_request workflows; skipping comment for event ${dollar}{GITHUB_EVENT_NAME:-unknown}" + exit 0 + fi + can_render_pr_comment=true - if ! command -v gh >/dev/null 2>&1; then - echo "::notice::gh is not available; skipping CI measurement PR comment" + + ensure_ci_measurement_tool() { + tool_name="$1" + nix_attr="$2" + if command -v "$tool_name" >/dev/null 2>&1; then + return 0 + fi + if ! command -v nix >/dev/null 2>&1; then + return 1 + fi + if tool_out="$(nix build --no-link --print-out-paths "nixpkgs#$nix_attr" 2>/dev/null)"; then + export PATH="$tool_out/bin:$PATH" + fi + command -v "$tool_name" >/dev/null 2>&1 + } + + if ! ensure_ci_measurement_tool gh gh; then + echo "::error::gh is not available; unable to publish required CI measurement PR comment" can_render_pr_comment=false fi - if ! command -v jq >/dev/null 2>&1; then - echo "::notice::jq is not available; skipping CI measurement PR comment" + if ! ensure_ci_measurement_tool node nodejs; then + echo "::error::node is not available; unable to publish required CI measurement PR comment" can_render_pr_comment=false fi + if ! 
command -v jq >/dev/null 2>&1; then + if ensure_ci_measurement_tool jq jq; then + : + else + echo "::error::jq is not available; unable to publish required CI measurement PR comment" + can_render_pr_comment=false + fi + fi if [ -z "${dollar}{GH_TOKEN:-${dollar}{GITHUB_TOKEN:-}}" ]; then - echo "::notice::GH_TOKEN/GITHUB_TOKEN is not set; skipping CI measurement PR comment" + echo "::error::GH_TOKEN/GITHUB_TOKEN is not set; unable to publish required CI measurement PR comment" can_render_pr_comment=false fi @@ -1262,10 +2458,14 @@ if [ "${dollar}{CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ] && [ "${do pr_number="$(jq -r '.pull_request.number // empty' "$event_path")" fi if [ "$can_render_pr_comment" = "true" ] && [ -z "$pr_number" ]; then - echo "::notice::pull request number is unavailable; skipping CI measurement PR comment" + echo "::error::pull request number is unavailable; unable to publish required CI measurement PR comment" can_render_pr_comment=false fi + if [ "$can_render_pr_comment" != "true" ]; then + exit 1 + fi + if [ "$can_render_pr_comment" = "true" ]; then repo="${dollar}{GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}" comment_tmp_dir="$(mktemp -d)" @@ -1273,6 +2473,9 @@ if [ "${dollar}{CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ] && [ "${do comment_body="$comment_tmp_dir/comment.md" comment_id_file="$comment_tmp_dir/comment-id.txt" chart_file="$comment_tmp_dir/perf-change-vs-baseline.svg" + chart_dark_file="$comment_tmp_dir/perf-change-vs-baseline-dark.svg" + chart_png_file="$comment_tmp_dir/perf-change-vs-baseline.png" + chart_dark_png_file="$comment_tmp_dir/perf-change-vs-baseline-dark.png" renderer_script="$comment_tmp_dir/render-ci-measurement-comment.mjs" if ! 
gh api "repos/$repo/issues/$pr_number/comments" --paginate >"$comments_json"; then @@ -1286,21 +2489,44 @@ if [ "${dollar}{CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ] && [ "${do if [ -z "$asset_title" ]; then asset_title="ci-measurements" fi - asset_head_sha="${dollar}{GITHUB_HEAD_SHA:-${dollar}{GITHUB_SHA:-unknown}}" + asset_head_sha="${dollar}{CI_MEASUREMENT_SUBJECT_SHA:-${dollar}{GITHUB_HEAD_SHA:-${dollar}{GITHUB_SHA:-unknown}}}" asset_run_id="${dollar}{GITHUB_RUN_ID:-local}" asset_run_attempt="${dollar}{GITHUB_RUN_ATTEMPT:-0}" - asset_path="ci-measurements/pr-$pr_number/${dollar}{asset_head_sha}/run-${dollar}{asset_run_id}-attempt-${dollar}{asset_run_attempt}/${dollar}{asset_title}.svg" + asset_svg_path="ci-measurements/pr-$pr_number/${dollar}{asset_head_sha}/run-${dollar}{asset_run_id}-attempt-${dollar}{asset_run_attempt}/${dollar}{asset_title}.svg" + asset_png_path="ci-measurements/pr-$pr_number/${dollar}{asset_head_sha}/run-${dollar}{asset_run_id}-attempt-${dollar}{asset_run_attempt}/${dollar}{asset_title}.png" + asset_dark_png_path="ci-measurements/pr-$pr_number/${dollar}{asset_head_sha}/run-${dollar}{asset_run_id}-attempt-${dollar}{asset_run_attempt}/${dollar}{asset_title}-dark.png" + public_asset_command="${dollar}{CI_MEASUREMENT_PR_COMMENT_PUBLIC_ASSET_COMMAND:-}" + repo_private="$(gh api "repos/$repo" --jq '.private // false' 2>/dev/null || printf 'true')" + require_public_asset=false + if [ "$repo_private" = "true" ]; then + require_public_asset=true + fi if [ "${dollar}{GITHUB_SERVER_URL:-https://github.com}" = "https://github.com" ]; then - chart_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_path" + github_raw_chart_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_png_path" + github_raw_chart_dark_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_dark_png_path" + github_raw_chart_source_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_svg_path" else - 
chart_url="${dollar}{GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_path" + github_raw_chart_url="${dollar}{GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_png_path" + github_raw_chart_dark_url="${dollar}{GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_dark_png_path" + github_raw_chart_source_url="${dollar}{GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_svg_path" + fi + if [ "$repo_private" = "true" ]; then + chart_url="" + chart_dark_url="" + chart_source_url="" + else + chart_url="$github_raw_chart_url" + chart_dark_url="$github_raw_chart_dark_url" + chart_source_url="$github_raw_chart_source_url" fi export CI_MEASUREMENT_PR_COMMENT_CHART_URL="$chart_url" + export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL="$chart_dark_url" + export CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL="$chart_source_url" cat > "$renderer_script" <<'EOF' import { readFileSync, writeFileSync } from 'node:fs' -const [comparisonPath, commentsPath, bodyPath, commentIdPath, chartPath] = process.argv.slice(2) +const [comparisonPath, commentsPath, bodyPath, commentIdPath, chartPath, chartDarkPath] = process.argv.slice(2) const title = process.env.CI_MEASUREMENT_PR_COMMENT_TITLE || 'CI Measurements' const maxRows = Number.parseInt(process.env.CI_MEASUREMENT_PR_COMMENT_MAX_ROWS || '10', 10) const maxHistory = Number.parseInt(process.env.CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY || '20', 10) @@ -1308,13 +2534,20 @@ const repo = process.env.GITHUB_REPOSITORY || 'unknown' const runId = process.env.GITHUB_RUN_ID || '' const runAttempt = process.env.GITHUB_RUN_ATTEMPT || '' const sha = process.env.GITHUB_SHA || '' -const headSha = process.env.GITHUB_HEAD_SHA || sha +const headSha = process.env.CI_MEASUREMENT_SUBJECT_SHA || process.env.GITHUB_HEAD_SHA || sha const serverUrl = process.env.GITHUB_SERVER_URL || 'https://github.com' const workflow = process.env.GITHUB_WORKFLOW || 'CI' const job = process.env.GITHUB_JOB || '' 
const chartUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_URL || '' - -const marker = '' +const chartDarkUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL || '' +const chartSourceUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL || '' + +const markerScope = (process.env.CI_MEASUREMENT_PR_COMMENT_MARKER || title) + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/^-+|-+$/g, '') || 'default' +const marker = '' +const legacyMarker = '' const statePrefix = '' const stateTag = 'ci-measurement-comment-state' @@ -1325,7 +2558,9 @@ const comments = JSON.parse(readFileSync(commentsPath, 'utf8')) if (!Array.isArray(comments)) throw new Error('comments response must be an array') const existing = comments.find((comment) => { - return typeof comment?.body === 'string' && comment.body.includes(marker) + if (typeof comment?.body !== 'string') return false + return comment.body.includes(marker) || + (comment.body.includes(legacyMarker) && comment.body.includes('## ' + title)) }) const extractState = (body) => { @@ -1372,15 +2607,139 @@ const formatRatio = (value) => { return formatNumber(Math.round((value - 1) * 1000) / 10) + '%' } -const formatResult = (row) => { - if (row.confidence === 'low_sample_count') return 'gray needs repeat' - if (row.status === 'fail') return 'red regression' - if (row.status === 'warn') return 'yellow regression' - if (row.status === 'missing_baseline') return 'gray no baseline' - if (row.confidence === 'noise_floor') return 'gray noise floor' - if (row.confidence === 'within_baseline_range') return 'gray within range' - if (row.direction === 'improved') return 'green improved' - return 'gray unchanged' +const formatSemanticImpact = (value) => { + if (value === null || value === undefined || Number.isNaN(value)) return 'n/a' + if (Math.abs(value) < 0.005) return '0.00x' + const sign = value > 0 ? 
'+' : '' + return sign + formatNumber(Math.round(value * 100) / 100) + 'x' +} + +const formatRowImpact = (row) => { + if (row.confidence === 'diagnostic' || row.gateReason === 'disabled' || row.semanticImpactKind === 'diagnostic') { + return 'diagnostic' + } + return formatSemanticImpact(row.semanticImpactScore) +} + +const formatEvidence = (row) => { + const unit = row.observation?.unit + if (row.comparisonMode === 'paired' && typeof row.evidenceDeltaLower === 'number' && typeof row.evidenceDeltaUpper === 'number') { + const quantile = typeof row.pairedEvidenceQuantile === 'number' + ? Math.round(row.pairedEvidenceQuantile * 100) + : 25 + return (row.confidence || 'unknown') + + '
paired n=' + (row.pairedSamples ?? 0) + + ', ' + quantile + '-' + (100 - quantile) + '% delta ' + + formatValue(row.evidenceDeltaLower, unit) + + ' - ' + formatValue(row.evidenceDeltaUpper, unit) + + '' + } + return (row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + '' +} + +const interpretation = (row) => { + if (row.confidence === 'low_baseline_count') return { + label: 'Needs more baseline', + detail: 'Not enough compatible baseline runs to make this gate trustworthy.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.confidence === 'low_current_sample_count') return { + label: 'Needs repeat', + detail: 'Current run has too few successful measured samples.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.confidence === 'low_paired_sample_count') return { + label: 'Needs paired evidence', + detail: 'Wall-clock gates require same-run base/head samples before they can block merges.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.confidence === 'missing_paired_delta') return { + label: 'Needs paired delta stats', + detail: 'Wall-clock gates require per-pair delta statistics, not only paired medians.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.confidence === 'paired_uncertain') return { + label: 'Uncertain wall-clock movement', + detail: 'The paired median moved, but the paired delta band still crosses the configured budget.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.confidence === 'diagnostic') return { + label: 'Diagnostic only', + detail: 'Shown for investigation, but intentionally excluded from gating.', + tone: 'diagnostic', + color: '#a78bfa', + } + if (row.status === 'fail') return { + label: 'Regression - blocks merge', + detail: 'Worse than the configured fail threshold with enough samples.', + tone: 'bad', + color: '#ef4444', + } + if (row.status === 'warn') return { + label: 'Regression - review', + detail: 'Worse than the configured warning threshold.', + tone: 'warn', + color: '#f59e0b', + } + if (row.status === 'missing_baseline') return { + label: 'No baseline yet', + detail: 'Current value is measured, but no comparable baseline exists.', + tone: 'neutral', + color: '#94a3b8', + } 
+ if (row.confidence === 'noise_floor') return { + label: 'Too small to matter', + detail: 'The absolute change is below the noise floor for this metric.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.confidence === 'within_baseline_range') return { + label: 'Historical range only', + detail: 'Inside the full historical min/max range, but this range is not used to pass a gate.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.confidence === 'within_robust_band' || row.confidence === 'within_baseline_distribution') return { + label: 'Within noise band', + detail: 'Current and baseline robust noise bands overlap.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.direction === 'improved' && typeof row.semanticImpactScore === 'number' && row.semanticImpactScore <= -1) return { + label: 'Meaningfully lower', + detail: 'Lower than baseline by enough to cross the configured review threshold.', + tone: 'good', + color: '#10b981', + } + if (row.direction === 'improved') return { + label: 'Slightly lower, ok', + detail: 'Lower than baseline, but still inside the configured review budget.', + tone: 'neutral', + color: '#94a3b8', + } + if (row.direction === 'regressed') return { + label: 'Slightly higher, ok', + detail: 'Higher than baseline but still inside the configured budget.', + tone: 'neutral', + color: '#94a3b8', + } + return { + label: 'Unchanged', + detail: 'No meaningful movement from baseline.', + tone: 'neutral', + color: '#94a3b8', + } +} + +const formatGate = (row) => { + if (row.gateable) return 'yes' + const reason = row.gateReason || row.confidence || 'unknown' + return 'no
' + reason + '' } const escapeCell = (value) => String(value ?? '-').replaceAll('|', '\\|').replaceAll('\n', '
') @@ -1402,6 +2761,8 @@ const humanProbe = (row) => { task_pnpm_install: 'pnpm:install', task_genie_run: 'genie:run', task_check_quick: 'check:quick', + task_check_quick_warm: 'Warm cached check:quick', + task_check_quick_forced: 'Forced check:quick', } if (probe && labels[probe]) return labels[probe] if (name.startsWith('devenv.') && name.endsWith('.duration')) { @@ -1410,6 +2771,22 @@ const humanProbe = (row) => { return name } +const semanticPath = (row) => { + const parts = [ + ...(Array.isArray(row.target?.path) ? row.target.path : []), + row.target?.group, + ...(Array.isArray(row.observation?.path) ? row.observation.path : []), + row.observation?.group, + ].filter((value) => typeof value === 'string' && value.length > 0) + const seen = new Set() + const unique = parts.filter((part) => { + if (seen.has(part)) return false + seen.add(part) + return true + }) + return unique.length > 0 ? unique.join(' / ') : '-' +} + const chartProbe = (row) => { if (row.observation?.label) return row.observation.label const probe = row.observation?.dimensions?.probe @@ -1421,6 +2798,8 @@ const chartProbe = (row) => { task_pnpm_install: 'pnpm:install', task_genie_run: 'genie:run', task_check_quick: 'check:quick', + task_check_quick_warm: 'Warm cached check:quick', + task_check_quick_forced: 'Forced check:quick', } if (probe && labels[probe]) return labels[probe] return humanProbe(row) @@ -1438,40 +2817,165 @@ const dimensions = (row) => { const rank = (row) => { if (row.status === 'fail') return 0 if (row.status === 'warn') return 1 - if (row.status === 'missing_baseline') return 2 - return 3 + if (row.status === 'missing_baseline') return 3 + return 2 } const allRows = Object.values(comparison.comparisons || {}).sort((left, right) => { const byRank = rank(left) - rank(right) if (byRank !== 0) return byRank - return (right.delta || 0) - (left.delta || 0) + const leftImpact = typeof left.semanticImpactScore === 'number' ? 
Math.abs(left.semanticImpactScore) : 0 + const rightImpact = typeof right.semanticImpactScore === 'number' ? Math.abs(right.semanticImpactScore) : 0 + if (rightImpact !== leftImpact) return rightImpact - leftImpact + const leftDelta = typeof left.delta === 'number' ? Math.abs(left.delta) : 0 + const rightDelta = typeof right.delta === 'number' ? Math.abs(right.delta) : 0 + if (rightDelta !== leftDelta) return rightDelta - leftDelta + return humanProbe(left).localeCompare(humanProbe(right)) }) +const protocolLabel = (() => { + const protocols = new Set( + allRows + .map((row) => row.observation?.dimensions?.measurementProtocol) + .filter((value) => typeof value === 'string' && value.length > 0), + ) + return protocols.size > 0 ? Array.from(protocols).join(', ') : 'legacy' +})() const visibleLimit = Number.isFinite(maxRows) && maxRows > 0 ? maxRows : 10 const comparableRows = allRows.filter((row) => typeof row.baseline === 'number') const hasComparableBaseline = comparableRows.length > 0 +const isDiagnosticRow = (row) => + row.status === 'missing_baseline' || + row.confidence === 'diagnostic' || + row.gateReason === 'disabled' || + row.semanticImpactKind === 'diagnostic' || + (!row.gateable && typeof row.baseline !== 'number') +const isZeroImpactRow = (row) => + typeof row.semanticImpactScore === 'number' && + !Number.isNaN(row.semanticImpactScore) && + Math.abs(row.semanticImpactScore) < 0.005 +const actionableComparableRows = comparableRows.filter((row) => !isDiagnosticRow(row)) const visibleRows = (hasComparableBaseline - ? allRows.filter((row) => typeof row.baseline === 'number') - : allRows.slice().sort((left, right) => (right.current || 0) - (left.current || 0)) + ? 
actionableComparableRows + : allRows.filter((row) => !isDiagnosticRow(row)).sort((left, right) => (right.current || 0) - (left.current || 0)) ).slice(0, visibleLimit) +const nonZeroImpactRows = actionableComparableRows.filter((row) => !isZeroImpactRow(row)) +const zeroImpactRows = actionableComparableRows.filter(isZeroImpactRow) +const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit) +const diagnosticRows = allRows.filter(isDiagnosticRow) + +const baselineToCurrent = (row) => { + const unit = row.observation?.unit + return formatValue(row.baseline, unit) + ' -> ' + formatValue(row.current, unit) +} + +const rawChange = (row) => { + const unit = row.observation?.unit + return formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio) +} + +const confidenceSummary = (row) => { + const unit = row.observation?.unit + if (row.comparisonMode === 'paired' && typeof row.evidenceDeltaLower === 'number' && typeof row.evidenceDeltaUpper === 'number') { + const quantile = typeof row.pairedEvidenceQuantile === 'number' + ? Math.round(row.pairedEvidenceQuantile * 100) + : 25 + return 'paired n=' + (row.pairedSamples ?? 0) + + ', ' + quantile + '-' + (100 - quantile) + '% delta ' + + formatValue(row.evidenceDeltaLower, unit) + + '..' + formatValue(row.evidenceDeltaUpper, unit) + } + return (row.confidence || 'unknown') + ', baseline n=' + (row.baselineSources ?? 0) + ', current n=' + (row.currentSamples ?? 1) +} + +const scanDecision = (row) => { + if (row.status === 'fail') return 'regression blocks' + if (row.status === 'warn') return 'regression review' + if (row.status === 'missing_baseline') return 'needs baseline' + if (row.direction === 'improved') return 'faster' + if (row.direction === 'regressed') return 'no material impact' + return 'unchanged' +} + +const scanTable = (rows) => { + if (rows.length === 0) return 'No non-zero actionable measurement impact detected.' + return [ + '| What changed? 
| Probe | Baseline -> current | Raw change | Impact | Confidence |', + '| --- | --- | --- | ---: | ---: | --- |', + ...rows.map((row) => { + return '| ' + [ + scanDecision(row), + humanProbe(row), + baselineToCurrent(row), + rawChange(row), + formatRowImpact(row), + confidenceSummary(row), + ].map(escapeCell).join(' | ') + ' |' + }), + ].join('\n') +} + +const zeroImpactTable = (rows) => { + if (rows.length === 0) return 'No zero-impact measurements.' + return [ + '| Probe | Baseline -> current | Raw change | Impact | Gate | Evidence | Why hidden |', + '| --- | --- | ---: | ---: | --- | --- | --- |', + ...rows.map((row) => { + const meaning = interpretation(row) + return '| ' + [ + humanProbe(row), + baselineToCurrent(row), + rawChange(row), + formatRowImpact(row), + row.gateable ? 'yes' : (row.gateReason || 'no'), + confidenceSummary(row), + meaning.label, + ].map(escapeCell).join(' | ') + ' |' + }), + ].join('\n') +} + +const diagnosticTable = (rows) => { + if (rows.length === 0) return 'No diagnostic or ungated measurements.' + return [ + '| Probe | Current | Baseline | Impact | Gate | Reason | Evidence |', + '| --- | ---: | ---: | ---: | --- | --- | --- |', + ...rows.map((row) => { + return '| ' + [ + humanProbe(row), + formatValue(row.current, row.observation?.unit), + formatValue(row.baseline, row.observation?.unit), + formatRowImpact(row), + row.gateable ? 'yes' : (row.gateReason || row.status || 'no'), + interpretation(row).label, + confidenceSummary(row), + ].map(escapeCell).join(' | ') + ' |' + }), + ].join('\n') +} const comparisonTable = (rows) => { if (rows.length === 0) return 'No measurement regressions detected.' 
return [ - '| Probe | Baseline | Current | Change | Result | Confidence |', - '| --- | ---: | ---: | ---: | --- | --- |', + '| Group | Measurement | Baseline | Current | Raw change | Impact | Meaning | Gate | Evidence |', + '| --- | --- | ---: | ---: | ---: | ---: | --- | --- | --- |', ...rows.map((row) => { const unit = row.observation?.unit - const baselineRange = typeof row.baselineMin === 'number' && typeof row.baselineMax === 'number' && row.baselineMin !== row.baselineMax - ? '
range ' + formatValue(row.baselineMin, unit) + ' - ' + formatValue(row.baselineMax, unit) + '' + const baselineRange = typeof row.baselineRobustLower === 'number' && typeof row.baselineRobustUpper === 'number' && row.baselineRobustLower !== row.baselineRobustUpper + ? '
noise band ' + formatValue(row.baselineRobustLower, unit) + ' - ' + formatValue(row.baselineRobustUpper, unit) + '' + : typeof row.baselineMin === 'number' && typeof row.baselineMax === 'number' && row.baselineMin !== row.baselineMax + ? '
range ' + formatValue(row.baselineMin, unit) + ' - ' + formatValue(row.baselineMax, unit) + '' : '' + const meaning = interpretation(row) return '| ' + [ + semanticPath(row), humanProbe(row), formatValue(row.baseline, unit) + baselineRange, formatValue(row.current, unit), formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio), - formatResult(row), - (row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + '', + formatRowImpact(row), + meaning.label + '
' + meaning.detail + '', + formatGate(row), + formatEvidence(row), ].map(escapeCell).join(' | ') + ' |' }), ].join('\n') @@ -1480,10 +2984,10 @@ const comparisonTable = (rows) => { const currentOnlyTable = (rows) => { if (rows.length === 0) return 'No current measurements found.' return [ - '| Probe | Current |', - '| --- | ---: |', + '| Group | Measurement | Current |', + '| --- | --- | ---: |', ...rows.map((row) => { - return '| ' + [humanProbe(row), formatValue(row.current, row.observation?.unit)].map(escapeCell).join(' | ') + ' |' + return '| ' + [semanticPath(row), humanProbe(row), formatValue(row.current, row.observation?.unit)].map(escapeCell).join(' | ') + ' |' }), ].join('\n') } @@ -1491,12 +2995,13 @@ const currentOnlyTable = (rows) => { const allMeasurementsTable = (rows) => { if (rows.length === 0) return 'No measurement regressions detected.' return [ - '| Status | Target | Observation | Dimensions | Baseline | Current | Delta | Ratio |', - '| --- | --- | --- | --- | ---: | ---: | ---: | ---: |', + '| Status | Gate | Target | Observation | Dimensions | Baseline | Current | Delta | Ratio | Impact |', + '| --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: |', ...rows.map((row) => { const unit = row.observation?.unit return '| ' + [ row.status, + row.gateable ? 
'yes' : (row.gateReason || 'no'), row.target?.label || row.target?.name || 'unknown', row.observation?.label || row.observation?.name || 'unknown', dimensions(row), @@ -1504,11 +3009,38 @@ const allMeasurementsTable = (rows) => { formatValue(row.current, unit), formatDelta(row.delta, unit), formatRatio(row.ratio), + formatRowImpact(row), ].map(escapeCell).join(' | ') + ' |' }), ].join('\n') } +const sourceMeasurement = (row) => ({ + id: row.observation?.dimensions?.probe || row.observation?.name || humanProbe(row), + label: humanProbe(row), + group: semanticPath(row), + status: row.status, + direction: row.direction, + gateable: row.gateable, + gateReason: row.gateReason, + confidence: row.confidence, + comparisonMode: row.comparisonMode, + unit: row.observation?.unit, + baseline: row.baseline ?? null, + current: row.current ?? null, + delta: row.delta ?? null, + ratio: row.ratio ?? null, + semanticImpactScore: row.semanticImpactScore ?? null, + semanticImpactKind: row.semanticImpactKind ?? null, + baselineSources: row.baselineSources ?? null, + currentSamples: row.currentSamples ?? null, + pairedSamples: row.pairedSamples ?? null, + evidenceDeltaLower: row.evidenceDeltaLower ?? null, + evidenceDeltaUpper: row.evidenceDeltaUpper ?? null, + pairedEvidenceQuantile: row.pairedEvidenceQuantile ?? null, + dimensions: row.observation?.dimensions || {}, +}) + const truncate = (value, maxLength) => { const text = String(value) if (text.length <= maxLength) return text @@ -1516,70 +3048,121 @@ const truncate = (value, maxLength) => { return text.slice(0, Math.max(0, maxLength - 3)) + '...' 
} -const renderPerfChangeSvg = (rows) => { +const renderPerfChangeSvg = (rows, theme = 'adaptive') => { const chartRows = rows - .filter((row) => row.observation?.unit === 'seconds') .filter((row) => typeof row.current === 'number' && typeof row.baseline === 'number') - .filter((row) => typeof row.ratio === 'number') - .sort((left, right) => ((left.ratio || 1) - 1) - ((right.ratio || 1) - 1)) + .filter((row) => row.gateable === true) + .filter((row) => typeof row.semanticImpactScore === 'number') + .sort((left, right) => (left.semanticImpactScore || 0) - (right.semanticImpactScore || 0)) .slice(0, visibleLimit) if (chartRows.length === 0) return '' - const percentages = chartRows.map((row) => ((row.ratio || 1) - 1) * 100) - const minPct = Math.min(-1, ...percentages) - const maxPct = Math.max(1, ...percentages) - const lower = Math.floor(minPct) - const upper = Math.ceil(maxPct) + const impactScores = chartRows.map((row) => row.semanticImpactScore || 0) + const minImpact = Math.min(-1, ...impactScores) + const maxImpact = Math.max(1, ...impactScores) + const lower = Math.floor(minImpact) + const upper = Math.ceil(maxImpact) const span = upper - lower || 1 - const width = 900 - const rowHeight = 42 - const height = 96 + chartRows.length * rowHeight + 34 - const labelX = 238 - const plotX = 260 - const plotWidth = 342 - const percentX = 626 - const nominalX = 704 - const topY = 78 + const width = 1040 + const rowHeight = 46 + const height = 112 + chartRows.length * rowHeight + 34 + const labelX = 230 + const plotX = 252 + const plotWidth = 320 + const impactX = 596 + const nominalX = 672 + const meaningX = 804 + const topY = 92 const barHeight = 18 const zeroX = plotX + ((0 - lower) / span) * plotWidth + const themeCss = theme === 'dark' + ? 
[ + ' .chart-bg { fill: #0d1117; }', + ' .chart-border { fill: none; stroke: #30363d; }', + ' .chart-title { fill: #f0f6fc; }', + ' .chart-muted { fill: #8b949e; }', + ' .chart-axis { stroke: #8b949e; }', + ' .chart-label { fill: #c9d1d9; }', + ' .chart-value { fill: #8b949e; }', + ' .chart-track { fill: #21262d; }', + ] + : [ + ' .chart-bg { fill: #ffffff; }', + ' .chart-border { fill: none; stroke: #d0d7de; }', + ' .chart-title { fill: #24292f; }', + ' .chart-muted { fill: #57606a; }', + ' .chart-axis { stroke: #8c959f; }', + ' .chart-label { fill: #24292f; }', + ' .chart-value { fill: #57606a; }', + ' .chart-track { fill: #f6f8fa; }', + ...(theme === 'adaptive' + ? [ + ' @media (prefers-color-scheme: dark) {', + ' .chart-bg { fill: #0d1117; }', + ' .chart-border { stroke: #30363d; }', + ' .chart-title { fill: #f0f6fc; }', + ' .chart-muted { fill: #8b949e; }', + ' .chart-axis { stroke: #8b949e; }', + ' .chart-label { fill: #c9d1d9; }', + ' .chart-value { fill: #8b949e; }', + ' .chart-track { fill: #21262d; }', + ' }', + ] + : []), + ] const svg = [ '', '', - '', - 'Perf change vs baseline (%)', - 'faster', - 'slower', - 'baseline -> current', - '', + '', + '', + '', + 'Actionable measurement impact', + '0 means no actionable PR impact; 1x reaches the warning budget.', + 'improved', + 'regressed', + 'impact', + 'baseline -> current', + 'meaning', + '', ] for (const [index, row] of chartRows.entries()) { - const pct = ((row.ratio || 1) - 1) * 100 + const impact = row.semanticImpactScore || 0 const y = topY + index * rowHeight - const valueWidth = Math.max(2, Math.abs(pct) / span * plotWidth) - const x = pct < 0 ? zeroX - valueWidth : zeroX - const color = pct < 0 ? '#20d6a3' : '#fb6b6b' - const formattedPct = (pct > 0 ? '+' : '') + formatNumber(Math.round(pct * 10) / 10) + '%' + const valueWidth = Math.max(2, Math.abs(impact) / span * plotWidth) + const x = impact < 0 ? 
zeroX - valueWidth : zeroX + const meaning = interpretation(row) + const color = meaning.color + const formattedImpact = formatSemanticImpact(impact) const label = chartProbe(row) const nominal = formatValue(row.baseline, row.observation?.unit).replaceAll(' ', '') + ' -> ' + formatValue(row.current, row.observation?.unit).replaceAll(' ', '') + const barOpacity = meaning.tone === 'neutral' ? '0.65' : '1' + const dash = meaning.tone === 'diagnostic' ? ' stroke-dasharray="3 3"' : '' svg.push( - '' + escapeXml(label) + '' + escapeXml(truncate(label, 30)) + '', - '', - '', - '' + escapeXml(formattedPct) + '', - '' + escapeXml(nominal) + '' + escapeXml(truncate(nominal, 24)) + '', + '' + escapeXml(label) + '' + escapeXml(truncate(label, 28)) + '', + '', + '', + '' + escapeXml(formattedImpact) + '', + '' + escapeXml(nominal) + '' + escapeXml(truncate(nominal, 21)) + '', + '' + escapeXml(meaning.detail) + '' + escapeXml(truncate(meaning.label, 30)) + '', ) } svg.push( - '0%', + '0', '', ) return svg.join('\n') } const statusWord = comparison.status || 'unknown' +const readiness = comparison.readiness || {} +const readinessLabel = readiness.enforceable + ? 'enforceable' + : 'partial (' + (readiness.gateableCount ?? 0) + '/' + (readiness.enabledCount ?? 0) + ' enabled observations gateable)' const runUrl = runId ? serverUrl + '/' + repo + '/actions/runs/' + runId : undefined const shortSha = (headSha || sha || 'unknown').slice(0, 7) const existingState = extractState(existing?.body) @@ -1597,22 +3180,35 @@ const currentRun = { status: row.status, target: row.target?.label || row.target?.name || 'unknown', observation: row.observation?.label || row.observation?.name || 'unknown', + meaning: interpretation(row).label, dimensions: dimensions(row).replaceAll('
', ', '), baseline: formatValue(row.baseline, row.observation?.unit), current: formatValue(row.current, row.observation?.unit), delta: formatDelta(row.delta, row.observation?.unit), ratio: formatRatio(row.ratio), + impact: formatSemanticImpact(row.semanticImpactScore), })), } -const previousRuns = (existingState?.runs || []).filter((run) => run.commitSha !== currentRun.commitSha) +const hasComparableHistory = (run) => Array.isArray(run.visibleRows) && run.visibleRows.some((row) => + row.status !== 'missing_baseline' && + row.baseline !== 'n/a' && + row.ratio !== 'n/a' +) +const previousRuns = (existingState?.runs || []).filter((run) => run.commitSha !== currentRun.commitSha && hasComparableHistory(run)) const historyLimit = Number.isFinite(maxHistory) && maxHistory > 0 ? maxHistory : 20 const state = { _tag: stateTag, schemaVersion, title, runs: [currentRun, ...previousRuns].slice(0, historyLimit) } +const gateModeLabel = (mode) => { + if (mode === 'fail') return 'enforced' + if (mode === 'warn') return 'advisory' + if (mode === 'off') return 'off' + return mode || 'unknown' +} const historyRows = state.runs.slice(1).map((run) => { const link = run.runUrl ? '[' + run.shortSha + '](' + run.runUrl + ')' : run.shortSha const top = Array.isArray(run.visibleRows) && run.visibleRows.length > 0 - ? run.visibleRows.slice(0, 3).map((row) => row.status + ' ' + row.target + ' ' + row.observation + ' ' + row.delta + ' / ' + row.ratio).join('
') + ? run.visibleRows.slice(0, 3).map((row) => (row.meaning || row.status) + ' ' + row.target + ' ' + row.observation + ' ' + row.delta + ' / ' + row.ratio).join('
') : 'No regressions' - return '| ' + [link, run.status, run.mode, top].map(escapeCell).join(' | ') + ' |' + return '| ' + [link, run.status, gateModeLabel(run.mode), top].map(escapeCell).join(' | ') + ' |' }) const runLink = runUrl ? '[workflow run](' + runUrl + ')' : 'workflow run unavailable' @@ -1621,26 +3217,102 @@ const baselineLabel = baselineProvenance?.runId ? '[main run ' + baselineProvenance.runId + '](' + serverUrl + '/' + repo + '/actions/runs/' + baselineProvenance.runId + ')' + (Array.isArray(baselineProvenance.runs) && baselineProvenance.runs.length > 1 ? ' + ' + (baselineProvenance.runs.length - 1) + ' older baseline runs' : '') : 'not available' -const chartSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows) : '' +const sourceOfTruth = { + schemaVersion, + title, + status: statusWord, + gate: gateModeLabel(comparison.mode), + readiness: readinessLabel, + commit: { + shortSha, + sha: headSha || sha || 'unknown', + }, + run: { + id: runId || null, + attempt: runAttempt || null, + url: runUrl || null, + }, + baseline: baselineProvenance || null, + protocol: protocolLabel, + chart: { + meaning: 'semantic-impact', + zeroImpactMeaning: 'no actionable PR impact after budgets, noise floor, and robust evidence checks', + svg: chartSourceUrl || null, + lightPng: chartUrl || null, + darkPng: chartDarkUrl || null, + }, + measurements: allRows.map(sourceMeasurement), +} +const chartSvg = hasComparableBaseline && visibleRows.length > 0 ? renderPerfChangeSvg(visibleRows) : '' +const chartDarkSvg = hasComparableBaseline && visibleRows.length > 0 ? renderPerfChangeSvg(visibleRows, 'dark') : '' if (chartPath && chartSvg) writeFileSync(chartPath, chartSvg) -const chartMarkdown = chartUrl && chartSvg ? '![Perf change vs baseline chart](' + chartUrl + ')' : '' +if (chartDarkPath && chartDarkSvg) writeFileSync(chartDarkPath, chartDarkSvg) +const chartImageMarkdown = chartUrl && chartSvg + ? (chartDarkUrl + ? 
'\n' + + ' \n' + + ' \n' + + ' Measurement change vs baseline chart\n' + + '' + : '![Measurement change vs baseline chart](' + chartUrl + ')') + : '' +const chartMarkdown = chartImageMarkdown + ? chartImageMarkdown + + (chartSourceUrl ? '\n\n[SVG source](' + chartSourceUrl + ')' : '') + : '' + +const regressionCount = allRows.filter((row) => row.status === 'fail' || row.status === 'warn').length +const improvementCount = comparableRows.filter((row) => row.direction === 'improved' && !isZeroImpactRow(row)).length +const neutralCount = zeroImpactRows.length + diagnosticRows.length +const humanSummary = hasComparableBaseline + ? regressionCount > 0 + ? String(regressionCount) + ' regression' + (regressionCount === 1 ? '' : 's') + ' need review.' + : improvementCount > 0 + ? 'No regressions. ' + String(improvementCount) + ' probe' + (improvementCount === 1 ? '' : 's') + ' got faster; ' + String(neutralCount) + ' neutral or ungated row' + (neutralCount === 1 ? '' : 's') + ' are collapsed below.' + : 'No regressions. Comparable movement is below the semantic impact threshold; neutral rows are collapsed below.' + : 'No compatible baseline was available, so this run shows current measurements only.' const summaryLines = [ '## ' + title, '', - '- Status: ' + statusWord, - '- Mode: ' + (comparison.mode || 'unknown'), - '- Commit: ' + shortSha, - '- Run: ' + runLink, - '- Baseline: ' + baselineLabel, + '**' + statusWord + '** - ' + gateModeLabel(comparison.mode) + ' gate - readiness ' + readinessLabel + ' - commit ' + shortSha + ' - protocol ' + protocolLabel + '', '', - hasComparableBaseline - ? 'Chart: performance change versus baseline median. Green is faster, red is slower, gray is within noise or baseline range.' - : 'No compatible baseline was available, so this run shows current measurements only.', + '> ' + humanSummary, '', chartMarkdown, '', - hasComparableBaseline ? comparisonTable(visibleRows) : currentOnlyTable(visibleRows), + hasComparableBaseline + ? 
scanTable(visibleNonZeroImpactRows) + : currentOnlyTable(visibleRows), +] + +if (hasComparableBaseline && zeroImpactRows.length > 0) { + summaryLines.push( + '', + '
', + 'Unchanged / 0-impact measurements (' + zeroImpactRows.length + ')', + '', + 'These rows had compatible baseline data, but their semantic impact rounded to 0.00x because the movement was below the configured budget, below the noise floor, or inside the robust noise band.', + '', + zeroImpactTable(zeroImpactRows), + '', + '
', + ) +} + +if (diagnosticRows.length > 0) { + summaryLines.push( + '', + '
', + 'Diagnostic / ungated measurements (' + diagnosticRows.length + ')', + '', + diagnosticTable(diagnosticRows), + '', + '
', + ) +} + +summaryLines.push( '', '
', 'All measurements', @@ -1648,7 +3320,7 @@ const summaryLines = [ allMeasurementsTable(allRows), '', '
', -] +) if (historyRows.length > 0) { summaryLines.push( @@ -1656,7 +3328,7 @@ if (historyRows.length > 0) { '
', 'Previous runs', '', - '| Commit | Status | Mode | Top changes |', + '| Commit | Status | Gate | Top changes |', '| --- | --- | --- | --- |', ...historyRows, '', @@ -1664,14 +3336,50 @@ if (historyRows.length > 0) { ) } +summaryLines.push( + '', + '
', + 'Source-of-truth JSON', + '', + '~~~json', + JSON.stringify(sourceOfTruth, null, 2), + '~~~', + '', + '
', +) + summaryLines.push('', marker, statePrefix + JSON.stringify(state, null, 2) + stateSuffix) writeFileSync(bodyPath, summaryLines.join('\n') + '\n') writeFileSync(commentIdPath, existing?.id ? String(existing.id) : '') EOF - node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" + node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" "$chart_dark_file" if [ -s "$chart_file" ]; then + if [ "$require_public_asset" = "true" ] && [ -z "$public_asset_command" ]; then + echo "::error::CI measurement chart was rendered for a private repository, but CI_MEASUREMENT_PR_COMMENT_PUBLIC_ASSET_COMMAND is not configured. Private raw GitHub URLs cannot be embedded in PR comments." + exit 1 + fi + + if ensure_ci_measurement_tool resvg resvg; then + resvg_font_args=() + if command -v nix >/dev/null 2>&1; then + if font_out="$(nix build --no-link --print-out-paths nixpkgs#dejavu_fonts 2>/dev/null)"; then + resvg_font_args+=(--use-fonts-dir "$font_out/share/fonts/truetype") + fi + fi + if ! resvg --background '#ffffff' "${dollar}{resvg_font_args[@]}" "$chart_file" "$chart_png_file"; then + echo "::notice::unable to render CI measurement chart PNG" + rm -f "$chart_png_file" + fi + if [ -s "$chart_dark_file" ] && ! resvg --background '#0d1117' "${dollar}{resvg_font_args[@]}" "$chart_dark_file" "$chart_dark_png_file"; then + echo "::notice::unable to render dark CI measurement chart PNG" + rm -f "$chart_dark_png_file" + fi + else + echo "::notice::resvg is not available; skipping embedded CI measurement chart PNG" + fi + if ! gh api "repos/$repo/git/ref/heads/$asset_branch" >/dev/null 2>&1; then default_branch_sha="$(gh api "repos/$repo/git/ref/heads/${dollar}{GITHUB_BASE_REF:-main}" --jq '.object.sha' 2>/dev/null || true)" if [ -z "$default_branch_sha" ]; then @@ -1682,25 +3390,84 @@ EOF fi fi chart_content="$(base64 <"$chart_file" | tr -d '\n')" - if ! 
gh api "repos/$repo/contents/$asset_path" --method PUT --field message="Update CI measurement chart for PR #$pr_number" --field content="$chart_content" --field branch="$asset_branch" >/dev/null; then - echo "::notice::unable to upload CI measurement chart asset" - sed -i.bak '/!\[Perf change vs baseline chart\]/d' "$comment_body" + if ! gh api "repos/$repo/contents/$asset_svg_path" --method PUT --field message="Update CI measurement chart SVG for PR #$pr_number" --field content="$chart_content" --field branch="$asset_branch" >/dev/null; then + echo "::notice::unable to upload CI measurement chart SVG asset" + if [ -z "$public_asset_command" ]; then + sed -i.bak '/\[SVG source\]/d' "$comment_body" + fi + fi + if [ -s "$chart_png_file" ]; then + chart_png_content="$(base64 <"$chart_png_file" | tr -d '\n')" + if ! gh api "repos/$repo/contents/$asset_png_path" --method PUT --field message="Update CI measurement chart PNG for PR #$pr_number" --field content="$chart_png_content" --field branch="$asset_branch" >/dev/null; then + echo "::notice::unable to upload CI measurement chart PNG asset" + if [ -z "$public_asset_command" ]; then + sed -i.bak '/!\[Measurement change vs baseline chart\]/d; /!\[Perf change vs baseline chart\]/d; //,/<\\/picture>/d' "$comment_body" + fi + fi + else + sed -i.bak '/!\[Measurement change vs baseline chart\]/d; /!\[Perf change vs baseline chart\]/d; //,/<\\/picture>/d' "$comment_body" + fi + if [ -s "$chart_dark_png_file" ]; then + chart_dark_png_content="$(base64 <"$chart_dark_png_file" | tr -d '\n')" + if ! 
gh api "repos/$repo/contents/$asset_dark_png_path" --method PUT --field message="Update dark CI measurement chart PNG for PR #$pr_number" --field content="$chart_dark_png_content" --field branch="$asset_branch" >/dev/null; then + echo "::notice::unable to upload dark CI measurement chart PNG asset" + if [ -z "$public_asset_command" ]; then + export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL="" + node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" "$chart_dark_file" + fi + fi + fi + + if [ -n "$public_asset_command" ] && [ -s "$chart_png_file" ]; then + if public_chart_url="$(bash -c "$public_asset_command" _ "$chart_png_file" png)" && [ -n "$public_chart_url" ]; then + chart_url="$public_chart_url" + export CI_MEASUREMENT_PR_COMMENT_CHART_URL="$chart_url" + else + echo "::notice::unable to publish CI measurement chart PNG to public asset host" + export CI_MEASUREMENT_PR_COMMENT_CHART_URL="" + fi + if [ -s "$chart_dark_png_file" ] && public_chart_dark_url="$(bash -c "$public_asset_command" _ "$chart_dark_png_file" png)" && [ -n "$public_chart_dark_url" ]; then + chart_dark_url="$public_chart_dark_url" + export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL="$chart_dark_url" + else + echo "::notice::unable to publish dark CI measurement chart PNG to public asset host" + export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL="" + fi + if public_chart_source_url="$(bash -c "$public_asset_command" _ "$chart_file" svg)" && [ -n "$public_chart_source_url" ]; then + chart_source_url="$public_chart_source_url" + export CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL="$chart_source_url" + else + echo "::notice::unable to publish CI measurement chart SVG to public asset host" + export CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL="" + fi + if [ "$require_public_asset" = "true" ] && [ -z "$chart_url" ]; then + echo "::error::unable to publish CI measurement chart PNG to a public asset host for private repository $repo" + exit 1 + fi + if [ 
"$require_public_asset" = "true" ] && [ -s "$chart_dark_png_file" ] && [ -z "$chart_dark_url" ]; then + echo "::error::unable to publish dark CI measurement chart PNG to a public asset host for private repository $repo" + exit 1 + fi + node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" "$chart_dark_file" fi fi comment_id="$(cat "$comment_id_file")" + comment_payload_file="$comment_body.payload.json" + node -e "const fs=require('node:fs'); fs.writeFileSync(process.argv[2], JSON.stringify({ body: fs.readFileSync(process.argv[1], 'utf8') }))" "$comment_body" "$comment_payload_file" if [ -n "$comment_id" ]; then - if ! gh api "repos/$repo/issues/comments/$comment_id" --method PATCH --field body="$(cat "$comment_body")" >/dev/null; then + if ! gh api "repos/$repo/issues/comments/$comment_id" --method PATCH --input "$comment_payload_file" >/dev/null; then echo "::notice::unable to update CI measurement PR comment" fi else - if ! gh api "repos/$repo/issues/$pr_number/comments" --method POST --field body="$(cat "$comment_body")" >/dev/null; then + if ! gh api "repos/$repo/issues/$pr_number/comments" --method POST --input "$comment_payload_file" >/dev/null; then echo "::notice::unable to create CI measurement PR comment" fi fi fi fi fi +` : ''} if [ "$exit_code" -ne 0 ]; then exit "$exit_code" @@ -1714,6 +3481,11 @@ export const devenvPerfJob = (opts?: DevenvPerfJobOptions) => { opts?.artifactName ?? 'devenv-perf-${{ github.job }}-${{ github.run_id }}-attempt-${{ github.run_attempt }}' const baselineArtifactName = opts?.baselineArtifactName ?? opts?.artifactName + const compare = opts?.compare ?? true + const probes = devenvPerfProbes({ + taskProbes: opts?.taskProbes ?? [], + probes: opts?.probes ?? [], + }) return { 'runs-on': opts?.runsOn ?? 
linuxX64Runner, @@ -1723,7 +3495,6 @@ export const devenvPerfJob = (opts?: DevenvPerfJobOptions) => { ...standardCIEnv, ARTIFACT_DIR: artifactDir, OTEL_SERVICE_NAME: 'devenv-perf-ci', - DEVENV_PERF_REGRESSION_MODE: opts?.regressionMode ?? 'warn', RUNNER_CLASS: (opts?.runsOn ?? linuxX64Runner).join(','), ...opts?.env, }, @@ -1734,27 +3505,34 @@ export const devenvPerfJob = (opts?: DevenvPerfJobOptions) => { preparePinnedDevenvStep, validateNixStoreStep, ]), - ...(baselineArtifactName === undefined - ? [] - : [ + ...(compare && baselineArtifactName !== undefined + ? [ downloadPreviousGitHubArtifactStep({ artifactName: baselineArtifactName, outputDir: `${artifactDir}/baseline`, + seedRuns: opts?.baselineSeedRuns, seedRunIds: opts?.baselineSeedRunIds, maxRuns: opts?.baselineMaxRuns, + maxCandidateRuns: opts?.baselineMaxCandidateRuns, + requiredObservations: devenvPerfRequiredBaselineObservations(probes), }), - ]), + ] + : []), devenvPerfBenchmarkStep({ taskProbes: opts?.taskProbes, probes: opts?.probes, }), - compareCiMeasurementsStep({ - currentDir: artifactDir, - baselineDir: `${artifactDir}/baseline`, - outputFile: `${artifactDir}/measurement-comparison.json`, - regressionMode: opts?.regressionMode ?? 'warn', - prComment: opts?.prComment, - }), + ...(compare + ? [ + compareCiMeasurementsStep({ + currentDir: artifactDir, + baselineDir: `${artifactDir}/baseline`, + outputFile: `${artifactDir}/measurement-comparison.json`, + regressionMode: opts?.regressionMode ?? 
'warn', + prComment: opts?.prComment, + }), + ] + : []), devenvPerfArtifactStep({ artifactDir, artifactName, diff --git a/genie/ci-workflow/setup.ts b/genie/ci-workflow/setup.ts index f7c2e7b49..95091eb64 100644 --- a/genie/ci-workflow/setup.ts +++ b/genie/ci-workflow/setup.ts @@ -16,6 +16,7 @@ import { runDevenvTasksBefore, shellSingleQuote, standardCIEnv, + withGcRaceRetry, workspaceLocalNixCachePath, workspaceLocalNixCacheRoot, type NixBinaryCache, @@ -735,7 +736,7 @@ export const validateColdPnpmDepsStep = ({ ? '' : ` --option substituters ${shellSingleQuote(substituters.join(' '))}` - return [ + const command = [ 'set -euo pipefail', `for attr in ${flakeRefs.map(shellSingleQuote).join(' ')}; do`, ' echo "::group::rebuild-check $attr"', @@ -747,6 +748,8 @@ export const validateColdPnpmDepsStep = ({ ' echo "::endgroup::"', 'done', ].join('\n') + + return withGcRaceRetry({ command, label: name }) })(), }) diff --git a/genie/ci-workflow/shared.ts b/genie/ci-workflow/shared.ts index 3d68f870a..a4ba6ec2a 100644 --- a/genie/ci-workflow/shared.ts +++ b/genie/ci-workflow/shared.ts @@ -49,30 +49,99 @@ export const standardCIEnv = { } as const /** - * Cancel superseded CI workflow runs for the same PR or branch. + * Cancel superseded CI jobs for the same event, ref, and job id. * - * The group key intentionally does not include the job name so a new push - * cancels the entire older workflow run rather than letting stale sibling jobs - * continue consuming runner capacity. + * This is intentionally job-level, not workflow-level. GitHub can wedge + * workflow_dispatch runs before job creation; when that happens, the run has no + * check runs, no logs, and the API may return 500 for cancellation. Keeping + * concurrency at job level lets workflow evaluation materialize visible jobs + * before any scarce-runner throttling applies. + * + * Code validation is a branch-protection signal for the latest PR head. 
Keeping + * older code-triggered pull_request jobs alive can consume scarce runners after + * a newer head exists, so jobs with the same id still cancel superseded work. + * + * Measurement baseline backfills are keyed by their subject ref and do not + * cancel in-progress runs so several historical refs can be backfilled without + * canceling each other. + * + * Manual dispatches are intentionally keyed by run id. They are operator probes + * and baseline/debug tools, not the authoritative PR-comment path. + * + * Merge-queue label churn is different: only the mq:ci-admitted label event is + * allowed to materialize full PR CI. Other label events do not change the + * commit under test and must not cancel an already-running validation run. */ +export const ciJobConcurrency = (jobId: string, opts?: { readonly matrix?: boolean }) => + ({ + group: + "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}" + + `-${jobId}` + + (opts?.matrix === true ? '-${{ strategy.job-index }}' : ''), + 'cancel-in-progress': + "${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}", + }) as const + +const isMatrixJob = (job: GitHubWorkflowArgs['jobs'][string]) => + typeof job.strategy === 'object' && job.strategy !== null && 'matrix' in job.strategy + +const workflowDispatchBaselineRefInput = { + description: + 'Optional ref/SHA to checkout before running CI measurement jobs. 
Used to backfill comparable baseline artifacts.', + required: false, + default: '', + type: 'string', +} as const + +const withJobConcurrencyDispatchInputs = (on: GitHubWorkflowArgs['on']): GitHubWorkflowArgs['on'] => { + if (typeof on !== 'object' || on === null || !('workflow_dispatch' in on) || on.workflow_dispatch === null) { + return on + } + + return { + ...on, + workflow_dispatch: { + ...on.workflow_dispatch, + inputs: { + measurement_baseline_ref: workflowDispatchBaselineRefInput, + ...on.workflow_dispatch.inputs, + }, + }, + } +} + +const withDefaultJobConcurrency = (jobs: GitHubWorkflowArgs['jobs']): GitHubWorkflowArgs['jobs'] => + Object.fromEntries( + Object.entries(jobs).map(([jobId, job]) => [ + jobId, + job.concurrency === undefined + ? { ...job, concurrency: ciJobConcurrency(jobId, { matrix: isMatrixJob(job) }) } + : job, + ]), + ) + export const ciWorkflowConcurrency = { - group: '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}', - 'cancel-in-progress': true, + group: + "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}", + 'cancel-in-progress': + "${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}", } as const /** * Standard wrapper for composed CI workflows. * - * This keeps cancellation policy centralized in `effect-utils` instead of - * making each consumer remember to wire `concurrency` by hand. 
Repos can still - * override the policy by passing an explicit `concurrency` field. + * This keeps cancellation policy centralized in `effect-utils`. Repos can still + * override the workflow-level policy by passing an explicit `concurrency` + * field, and individual jobs can opt out or provide their own `concurrency`. */ export const ciWorkflow = (args: GitHubWorkflowArgs) => - (({ concurrency, actionlint, ...rest }) => + (({ concurrency, actionlint, jobs, on, ...rest }) => githubWorkflow({ - concurrency: concurrency ?? ciWorkflowConcurrency, - actionlint: actionlint ?? defaultActionlintConfig, ...rest, + on: concurrency === undefined ? withJobConcurrencyDispatchInputs(on) : on, + ...(concurrency === undefined ? {} : { concurrency }), + actionlint: actionlint ?? defaultActionlintConfig, + jobs: concurrency === undefined ? withDefaultJobConcurrency(jobs) : jobs, }))(args) export type NixConfigOptions = { @@ -230,7 +299,7 @@ run_nix_gc_race_retry() { local max="${dollar}{NIX_GC_RACE_MAX_RETRIES:-10}" local heartbeat="${dollar}{CI_PROGRESS_HEARTBEAT_SECONDS:-60}" local attempt=1 - local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit + local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit start="$(date +%s)" @@ -278,7 +347,7 @@ run_nix_gc_race_retry() { if [ "$rc" -eq 0 ]; then echo "::notice::[ci] completed $task in $elapsed s" if [ "$attempt" -gt 1 ]; then - write_summary success "Recovered from Nix GC race after retry" + write_summary success "Recovered from transient Nix failure after retry" else write_summary success fi @@ -294,18 +363,22 @@ run_nix_gc_race_retry() { tr -d '[:space:]' || true) saw_invalid_path=false saw_cachix_signature=false + saw_fetch_signature=false [ -n "$path" ] && saw_invalid_path=true printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true 
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true + printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true rm -f "$log" - if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then - echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race" - write_summary failure "No Nix GC race signature detected" + if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then + echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure" + write_summary failure "No transient Nix failure signature detected" return "$rc" fi - if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then + if [ "$saw_fetch_signature" = true ]; then + echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache" + elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path" elif [ "$saw_cachix_signature" = true ]; then echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)" @@ -320,8 +393,8 @@ run_nix_gc_race_retry() { now=$(date +%s) elapsed=$((now - start)) - echo "::error::Nix GC race retry exhausted for $task ($max attempts)" - write_summary failure "Nix GC race retry exhausted" + echo "::error::Transient Nix retry exhausted for $task ($max attempts)" + write_summary failure "Transient Nix retry exhausted" return 1 }` diff --git a/genie/deploy-preview/netlify.ts b/genie/deploy-preview/netlify.ts index 6c6a1df87..b9fbb6511 100644 --- a/genie/deploy-preview/netlify.ts +++ 
b/genie/deploy-preview/netlify.ts @@ -363,14 +363,15 @@ export const netlifyStorybookCommentStep = (site: string, deployModeScript: stri 'if [ ! -s /tmp/storybook-preview-comment.md ]; then', ' exit 0', 'fi', + 'node -e "const fs=require(\'node:fs\'); fs.writeFileSync(process.argv[2], JSON.stringify({ body: fs.readFileSync(process.argv[1], \'utf8\') }))" /tmp/storybook-preview-comment.md /tmp/storybook-preview-comment-payload.json', `export NIX_CONFIG="\${NIX_CONFIG:+$NIX_CONFIG$'\\n'}access-tokens = github.com=\${GH_TOKEN}"`, 'if [ "${{ github.event_name }}" != "pull_request" ]; then', ' exit 0', 'fi', 'if [ -n "$comment_id" ]; then', - ' nix run nixpkgs#gh -- api "repos/$GH_REPO/issues/comments/$comment_id" --method PATCH --field body="$(cat /tmp/storybook-preview-comment.md)" >/dev/null', + ' nix run nixpkgs#gh -- api "repos/$GH_REPO/issues/comments/$comment_id" --method PATCH --input /tmp/storybook-preview-comment-payload.json >/dev/null', 'else', - ' nix run nixpkgs#gh -- api "repos/$GH_REPO/issues/${{ github.event.pull_request.number }}/comments" --method POST --field body="$(cat /tmp/storybook-preview-comment.md)" >/dev/null', + ' nix run nixpkgs#gh -- api "repos/$GH_REPO/issues/${{ github.event.pull_request.number }}/comments" --method POST --input /tmp/storybook-preview-comment-payload.json >/dev/null', 'fi', ].join('\n'), }) diff --git a/genie/external.ts b/genie/external.ts index 10b3cbe99..7c787997b 100644 --- a/genie/external.ts +++ b/genie/external.ts @@ -640,6 +640,9 @@ export { devenvPerfArtifactStep, devenvPerfBenchmarkStep, devenvPerfJob, + defaultNixClosureMeasurementBuckets, + nixClosureMeasurementSteps, + nixClosureMeasurementsJob, pnpmStateSetupStep, restorePnpmStateStep, savePnpmStateStep, @@ -659,6 +662,10 @@ export { type DevenvPerfJobOptions, type DevenvPerfProbe, type DevenvPerfTaskProbe, + type NixClosureMeasurementBucket, + type NixClosureMeasurementTarget, + type NixClosureMeasurementsJobOptions, + type 
NixClosureMeasurementsStepsOptions, type NixBinaryCache, type RunnerProfile, } from './ci-workflow.ts' diff --git a/megarepo.lock b/megarepo.lock index 98f82c06e..dc5cc40f0 100644 --- a/megarepo.lock +++ b/megarepo.lock @@ -4,9 +4,9 @@ "effect": { "url": "https://github.com/effect-ts/effect", "ref": "main", - "commit": "1a63ec87cd295972b05b51c9b4ad2db9567dc994", + "commit": "3585f25110fca7af6aeec3f59c3fc05b20c8d316", "pinned": false, - "lockedAt": "2026-05-12T02:00:43.137Z" + "lockedAt": "2026-05-20T14:02:25.587Z" } } } diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts index 2e056bea7..d7e153031 100644 --- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts +++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts @@ -383,14 +383,18 @@ describe('ci workflow devenv perf helpers', () => { expect(ciWorkflowSource).toContain('export type DevenvPerfProbe') expect(ciWorkflowSource).toContain('export type DevenvPerfTaskProbe') expect(ciWorkflowSource).toContain('export const nixClosureMeasurementStep') + expect(ciWorkflowSource).toContain('export const nixClosureMeasurementSteps') + expect(ciWorkflowSource).toContain('export const nixClosureMeasurementsJob') + expect(ciWorkflowSource).toContain('export const defaultNixClosureMeasurementBuckets') expect(ciWorkflowSource).toContain('export type NixClosureMeasurementBucket') + expect(ciWorkflowSource).toContain('export type NixClosureMeasurementTarget') }) it('emits the standard warm shell and task-list probes with native trace artifacts', () => { expect(generatedCiWorkflowYamlSource).toContain('devenv-perf:') expect(generatedCiWorkflowYamlSource).toContain('OTEL_SERVICE_NAME: devenv-perf-ci') expect(generatedCiWorkflowYamlSource).toContain( - "measure 'shell_eval_traced' 'Shell eval with OTEL trace' 'devenv shell' 'Evaluates 
the dev shell with native devenv JSON tracing enabled.' '$ARTIFACT_DIR/traces/shell_eval_traced.json' '1'", + "measure 'shell_eval_traced' 'Shell eval with OTEL trace' 'devenv shell' 'Evaluates the dev shell with native devenv JSON tracing enabled.' '$ARTIFACT_DIR/traces/shell_eval_traced.json' '0' '1'", ) expect(generatedCiWorkflowYamlSource).toContain('--trace-to') expect(generatedCiWorkflowYamlSource).toContain('json:file:$trace_file') @@ -398,7 +402,7 @@ describe('ci workflow devenv perf helpers', () => { expect(generatedCiWorkflowYamlSource).toContain("measure 'shell_eval_warm' 'Warm shell eval'") expect(generatedCiWorkflowYamlSource).toContain("measure 'tasks_list' 'devenv tasks list'") expect(generatedCiWorkflowYamlSource).toContain( - "'Loads the devenv processes command help path.' '' '5'", + "'Loads the devenv processes command help path.' '' '1' '9'", ) }) @@ -415,17 +419,32 @@ describe('ci workflow devenv perf helpers', () => { expect(generatedCiWorkflowYamlSource).toContain('probeLabel: .label') expect(generatedCiWorkflowYamlSource).toContain('sampleCount: (.statistics.sampleCount // 1)') expect(generatedCiWorkflowYamlSource).toContain('baselineSources') - expect(generatedCiWorkflowYamlSource).toContain('low_sample_count') + expect(generatedCiWorkflowYamlSource).toContain('low_baseline_count') + expect(generatedCiWorkflowYamlSource).toContain('low_current_sample_count') + expect(generatedCiWorkflowYamlSource).toContain('low_paired_sample_count') + expect(generatedCiWorkflowYamlSource).toContain('readiness:$readiness') + expect(generatedCiWorkflowYamlSource).toContain( + 'enforceable: (.enabledCount == .gateableCount)', + ) expect(generatedCiWorkflowYamlSource).toContain('within_baseline_range') expect(generatedCiWorkflowYamlSource).toContain( - 'elif ($baselineSources < 3 or $currentSamples < 3) then "low_sample_count"', + 'elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"', ) 
expect(generatedCiWorkflowYamlSource).toContain( - 'if $confidence == "threshold_exceeded" then $thresholdStatus', + 'elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"', ) - expect(ciWorkflowSource).toContain( - "if (row.confidence === 'low_sample_count') return 'gray needs repeat'", + expect(generatedCiWorkflowYamlSource).toContain( + 'if ($gateable and $confidence == "threshold_exceeded") then $thresholdStatus', ) + expect(generatedCiWorkflowYamlSource).toContain( + 'elif ($canUseRobustBandSuppression and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"', + ) + expect(ciWorkflowSource).toContain("label: 'Needs more baseline'") + expect(ciWorkflowSource).toContain("label: 'Needs repeat'") + expect(ciWorkflowSource).toContain("label: 'Needs paired evidence'") + expect(ciWorkflowSource).toContain("label: 'Too small to matter'") + expect(ciWorkflowSource).toContain("label: 'Within noise band'") + expect(ciWorkflowSource).toContain("label: 'Meaningfully lower'") expect(generatedCiWorkflowYamlSource).toContain('RUNNER_CLASS:') expect(generatedCiWorkflowYamlSource).toContain('namespace-profile-linux-x86-64') expect(ciWorkflowSource).toContain('nix.closure.nar_size') @@ -434,38 +453,117 @@ describe('ci workflow devenv perf helpers', () => { expect(ciWorkflowSource).toContain('artifact_file=${artifactFileAssignment}') expect(ciWorkflowSource).not.toContain('artifact_file=${shellSingleQuote(artifactFile)}') expect(ciWorkflowSource).toContain( - 'target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, system: $targetSystem }', + 'target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem }', ) expect(ciWorkflowSource).toContain('nix path-info --recursive --json "$out_path"') expect(ciWorkflowSource).toContain( 'topPaths: ($closurePaths | sort_by(.narSize) | reverse | 
.[:30])', ) expect(generatedCiWorkflowYamlSource).not.toContain('dev3') - expect(generatedCiWorkflowYamlSource).toContain('perf-comparison.json') - expect(generatedCiWorkflowYamlSource).toContain('DEVENV_PERF_REGRESSION_MODE') + expect(generatedCiWorkflowYamlSource).not.toContain('perf-comparison.json') + expect(generatedCiWorkflowYamlSource).not.toContain('DEVENV_PERF_REGRESSION_MODE') + expect(generatedCiWorkflowYamlSource).toContain('devenv-perf-warm-median-v2') expect(generatedCiWorkflowYamlSource).toContain("CI_MEASUREMENT_PR_COMMENT_ENABLED: 'true'") expect(generatedCiWorkflowYamlSource).toContain( - 'CI_MEASUREMENT_PR_COMMENT_TITLE: Devenv Performance', + 'CI_MEASUREMENT_PR_COMMENT_TITLE: CI Measurements', ) - expect(generatedCiWorkflowYamlSource).toContain("BASELINE_SEED_RUN_IDS: '25710204667'") + expect(generatedCiWorkflowYamlSource).toContain('BASELINE_SEED_RUNS_JSON:') + expect(generatedCiWorkflowYamlSource).toContain('BASELINE_REQUIRED_OBSERVATIONS_JSON:') + expect(generatedCiWorkflowYamlSource).toContain('BASELINE_MAX_CANDIDATE_RUNS:') + expect(generatedCiWorkflowYamlSource).toContain("measure 'task_check_quick_warm'") + expect(generatedCiWorkflowYamlSource).toContain("measure 'task_check_quick_forced'") + expect(generatedCiWorkflowYamlSource).not.toContain('"id":"devenv.task_check_quick.duration"') + expect(ciWorkflowSource).toContain( + 'requiredObservations?: readonly CiMeasurementRequiredBaselineObservation[]', + ) + expect(ciWorkflowSource).toContain('baselineMaxCandidateRuns?: number') + expect(ciWorkflowSource).toContain('baseline_requirements_satisfied') + expect(ciWorkflowSource).toContain('observationCounts: ($observationCounts[0] // null)') + expect(generatedCiWorkflowYamlSource).toContain('"runId":"26085158592"') + expect(generatedCiWorkflowYamlSource).toContain('"label":"main baseline"') expect(generatedCiWorkflowYamlSource).toContain('Upload devenv perf artifacts') expect(generatedCiWorkflowYamlSource).toContain('retention-days: 30') 
expect(ciWorkflowSource).toContain("contents: 'write'") + expect(ciWorkflowSource).toContain('seedRuns?: readonly CiMeasurementBaselineSeedRun[]') expect(ciWorkflowSource).toContain('seedRunIds?: readonly string[]') + expect(ciWorkflowSource).toContain('baselineSeedRuns?: readonly CiMeasurementBaselineSeedRun[]') expect(ciWorkflowSource).toContain('baselineSeedRunIds?: readonly string[]') + expect(ciWorkflowSource).not.toContain('measurement_pr_number:') + expect(ciWorkflowSource).not.toContain('CI_MEASUREMENT_PR_COMMENT_PR_NUMBER') + expect(ciWorkflowSource).toContain( + 'CI measurement PR comments are produced only by pull_request workflows', + ) + expect(ciWorkflowSource).toContain('unable to publish required CI measurement PR comment') + expect(ciWorkflowSource).toContain('seedRuns: ($seedRuns[0] // [])') expect(ciWorkflowSource).toContain('baselineProvenance: ($baselineProvenance[0] // null)') expect(ciWorkflowSource).toContain( - '["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount"] | index($key) | not', + '["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount", "measuredSampleCount"] | index($key) | not', ) expect(ciWorkflowSource).toContain('chart_file="$comment_tmp_dir/perf-change-vs-baseline.svg"') expect(ciWorkflowSource).toContain( - 'Chart: performance change versus baseline median. Green is faster, red is slower, gray is within noise or baseline range.', + 'chart_png_file="$comment_tmp_dir/perf-change-vs-baseline.png"', + ) + expect(ciWorkflowSource).toContain( + 'chart_dark_png_file="$comment_tmp_dir/perf-change-vs-baseline-dark.png"', + ) + expect(ciWorkflowSource).toContain( + 'No regressions. 
Comparable movement is below the semantic impact threshold; neutral rows are collapsed below.', + ) + expect(generatedCiWorkflowYamlSource).toContain( + 'github.workflow }}-${{ github.event_name }}-${{ github.ref }}', + ) + expect(generatedCiWorkflowYamlSource).not.toMatch(/^concurrency:/m) + expect(generatedCiWorkflowYamlSource).toContain('concurrency:\n group:') + expect(generatedCiWorkflowYamlSource).toContain('}}-typecheck') + expect(ciWorkflowSource).toContain('export const ciJobConcurrency = (jobId: string, opts?:') + expect(ciWorkflowSource).toContain("opts?.matrix === true ? '-${{ strategy.job-index }}' : ''") + expect(ciWorkflowSource).toContain('const isMatrixJob = (job: GitHubWorkflowArgs') + expect(generatedCiWorkflowYamlSource).toContain('}}-test-${{ strategy.job-index }}') + expect(generatedCiWorkflowYamlSource).toContain('}}-nix-check-${{ strategy.job-index }}') + expect(generatedCiWorkflowYamlSource).toContain("format('measurement-baseline-{0}'") + expect(generatedCiWorkflowYamlSource).not.toContain("format('measurement-pr-{0}-run-{1}'") + expect(generatedCiWorkflowYamlSource).not.toContain('inputs.measurement_pr_number') + expect(generatedCiWorkflowYamlSource).toContain("format('manual-run-{0}', github.run_id)") + expect(generatedCiWorkflowYamlSource).toContain("format('label-{0}', github.event.label.name)") + expect(generatedCiWorkflowYamlSource).toContain( + "inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request'", + ) + expect(ciWorkflowSource).toContain( + '| What changed? 
| Probe | Baseline -> current | Raw change | Impact | Confidence |', ) + expect(ciWorkflowSource).toContain( + 'const zeroImpactRows = actionableComparableRows.filter(isZeroImpactRow)', + ) + expect(ciWorkflowSource).toContain('Unchanged / 0-impact measurements (') + expect(ciWorkflowSource).toContain('Source-of-truth JSON') + expect(ciWorkflowSource).toContain('const sourceOfTruth = {') + expect(ciWorkflowSource).toContain('No non-zero actionable measurement impact detected.') + expect(ciWorkflowSource).toContain('readiness ') expect(ciWorkflowSource).toContain('renderPerfChangeSvg') - expect(ciWorkflowSource).toContain('Perf change vs baseline (%)') - expect(ciWorkflowSource).toContain('![Perf change vs baseline chart]') + expect(ciWorkflowSource).toContain('Actionable measurement impact') + expect(ciWorkflowSource).toContain( + '0 means no actionable PR impact; 1x reaches the warning budget.', + ) + expect(ciWorkflowSource).toContain('@media (prefers-color-scheme: dark)') + expect(ciWorkflowSource).toContain('.chart-bg { fill: #0d1117; }') + expect(ciWorkflowSource).toContain('') + expect(ciWorkflowSource).toContain('